diff --git a/Makefile b/Makefile
index 99324a3..60ededb 100644
--- a/Makefile
+++ b/Makefile
@@ -19,14 +19,8 @@ clean:
-$(RM) -r build dist
-cd $(PACKAGE) && $(MAKE) clean
-cd tools && $(MAKE) clean
- -cd samples && $(MAKE) clean
-distclean: clean cmap_clean
-
-test:
- cd samples && $(MAKE) test
-check:
- cd $(PACKAGE) && make check
+distclean: clean test_clean cmap_clean
commit: distclean
$(SVN) commit
@@ -39,13 +33,23 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish:
$(CP) docs/*.html $(WEBDIR)
+test:
+ cd samples && $(MAKE) test
+test_clean:
+ -cd samples && $(MAKE) clean
+
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
-CMAPDIR=pdfminer/cmap
-CMAPRSRC=cmaprsrc
-cmap: cmaprsrc
- $(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
- $(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
- $(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
- $(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+CMAPSRC=cmaprsrc
+CMAPDST=pdfminer/cmap
+cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
+ $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
cmap_clean:
- cd $(CMAPDIR) && make cmap_clean
+ cd $(CMAPDST) && $(MAKE) cmap_clean
+$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py: $(CMAPSRC)/cid2code_Adobe_CNS1.txt  # regenerate when the cid2code table changes
+ $(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+$(CMAPDST)/TO_UNICODE_Adobe_GB1.py: $(CMAPSRC)/cid2code_Adobe_GB1.txt
+ $(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py: $(CMAPSRC)/cid2code_Adobe_Japan1.txt
+ $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: $(CMAPSRC)/cid2code_Adobe_Korea1.txt
+ $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
diff --git a/docs/index.html b/docs/index.html
index b5e58f1..8e604c5 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Dec 20 01:25:02 JST 2009
+Last Modified: Fri Jan 1 12:04:47 JST 2010
@@ -27,7 +27,7 @@ Last Modified: Sun Dec 20 01:25:02 JST 2009
What's It?
Download
Install
- (for non-ASCII languages)
+ (for East Asian languages)
How to Use
(pdf2txt.py, dumppdf.py)
TODOs
@@ -54,7 +54,7 @@ PDF parser that can be used for other purposes instead of text analysis.
- Written entirely in Python. (for version 2.4 or newer)
- PDF-1.7 specification support. (well, almost)
-
- Non-ASCII languages and vertical writing scripts support.
+
- East Asian languages and vertical writing scripts support.
- Various font types (Type1, TrueType, Type3, and CID) support.
- Basic encryption (RC4) support.
- PDF to HTML conversion (with a sample converter web app).
@@ -125,8 +125,8 @@ W o r l d
-
For non-ASCII languages
-In order to handle non-ASCII languages (e.g. Japanese),
+For East Asian languages
+In order to handle East Asian languages (Chinese or Japanese, etc.),
you need to install an additional data called CMap
,
which is originally distributed by Adobe. CMap is now included
in the pdfminer package, but not installed by default.
@@ -163,9 +163,6 @@ direction (horizontal or vertical) for each text portion.
You need to provide a password for protected PDF documents when its access is restricted.
You cannot extract any text from a PDF document which does not have extraction permission.
-For non-ASCII languages, you can specify the output encoding
-(such as UTF-8).
-
Note: Not all characters in a PDF can be safely converted to Unicode.
@@ -194,7 +191,7 @@ Page numbers are starting from one.
By default, it extracts texts from all the pages.
-
-c codec
- - Specifies the output codec for non-ASCII texts.
+
- Specifies the output codec.
-
-t type
- Specifies the output format. The following formats are currently supported.
@@ -344,6 +341,9 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
+
- 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
+
- 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting.
- 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them.
- 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
- 2009/10/31: SGML output format is changed and renamed as XML.
diff --git a/pdfminer/Makefile b/pdfminer/Makefile
index a68f7ec..2ae0f6e 100644
--- a/pdfminer/Makefile
+++ b/pdfminer/Makefile
@@ -1,12 +1,7 @@
# Makefile for pdfminer
-PYCHECKER=pychecker --limit=0
-
all:
clean:
-rm *.pyc *.pyo
cd cmap && make clean
-
-check:
- $(PYCHECKER) *.py
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
index 048b940..3c136ae 100644
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -90,14 +90,14 @@ class UnicodeMap(object):
debug = 0
- def __init__(self, cid2unicode=None):
- self.cid2unicode = cid2unicode or {}
+ def __init__(self, cid2unichr=None):
+ self.cid2unichr = cid2unichr or {}
return
- def get_unicode(self, cid):
+ def get_unichr(self, cid):
if self.debug:
- print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
- return self.cid2unicode.get(cid)
+ print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
+ return self.cid2unichr[cid]
## FileCMap
@@ -151,16 +151,16 @@ class FileUnicodeMap(UnicodeMap):
self.attrs[k] = v
return
- def add_cid2unicode(self, cid, code):
+ def add_cid2unichr(self, cid, code):
assert isinstance(cid, int)
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
- self.cid2unicode[cid] = name2unicode(code.name)
+ self.cid2unichr[cid] = unichr(name2unicode(code.name))
elif isinstance(code, str):
# Interpret as UTF-16BE.
- self.cid2unicode[cid] = unpack('>H', code)[0]
+ self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
elif isinstance(code, int):
- self.cid2unicode[cid] = code
+ self.cid2unichr[cid] = unichr(code)
else:
raise TypeError(code)
return
@@ -189,10 +189,10 @@ class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
if vertical:
- cid2unicode = module.CID2UNICODE_V
+ cid2unichr = module.CID2UNICHR_V
else:
- cid2unicode = module.CID2UNICODE_H
- UnicodeMap.__init__(self, cid2unicode)
+ cid2unichr = module.CID2UNICHR_H
+ UnicodeMap.__init__(self, cid2unichr)
self.name = name
return
@@ -333,7 +333,7 @@ class CMapParser(PSStackParser):
#assert s1 <= e1
if isinstance(code, list):
for i in xrange(e1-s1+1):
- self.cmap.add_cid2unicode(s1+i, code[i])
+ self.cmap.add_cid2unichr(s1+i, code[i])
else:
var = code[-4:]
base = nunpack(var)
@@ -341,7 +341,7 @@ class CMapParser(PSStackParser):
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:]
- self.cmap.add_cid2unicode(s1+i, x)
+ self.cmap.add_cid2unichr(s1+i, x)
return
if name == 'beginbfchar':
@@ -351,7 +351,7 @@ class CMapParser(PSStackParser):
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
- self.cmap.add_cid2unicode(nunpack(cid), code)
+ self.cmap.add_cid2unichr(nunpack(cid), code)
return
if name == 'beginnotdefrange':
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index fe544e6..dfb1b8e 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -405,9 +405,10 @@ class PDFSimpleFont(PDFFont):
def to_unichr(self, cid):
if self.unicode_map:
- code = self.unicode_map.get_unicode(cid)
- if code is not None:
- return unichr(code)
+ try:
+ return self.unicode_map.get_unichr(cid)
+ except KeyError:
+ pass
try:
return self.encoding[cid]
except KeyError:
@@ -571,12 +572,11 @@ class PDFCIDFont(PDFFont):
return self.disps.get(cid, self.default_disp)
def to_unichr(self, cid):
- if not self.unicode_map:
+ try:
+ if not self.unicode_map: raise KeyError(cid)
+ return self.unicode_map.get_unichr(cid)
+ except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
- code = self.unicode_map.get_unicode(cid)
- if code is not None:
- return unichr(code)
- raise PDFUnicodeNotDefined(self.cidcoding, cid)
# main
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index e036511..dec9818 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -766,7 +766,9 @@ class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument()
- parser = PDFParser(doc, fp)
+ parser = PDFParser(fp)
+ parser.set_document(doc)
+ doc.set_parser(parser)
doc.initialize(password)
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 606af4a..43659dd 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
import sys
import re
-import md5
import struct
from sys import stderr
+try:
+ import hashlib as md5
+except ImportError:
+ import md5
try:
from cStringIO import StringIO
except ImportError:
@@ -19,7 +22,7 @@ from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from arcfour import Arcfour
from utils import choplist, nunpack
-from utils import decode_text
+from utils import decode_text, ObjIdRange
## Exceptions
@@ -39,57 +42,29 @@ LITERAL_CATALOG = LIT('Catalog')
## XRefs
##
-class XRefObjRange(object):
- def __init__(self, start, nobjs):
- self.start = start
- self.nobjs = nobjs
- return
-
- def __repr__(self):
- return '' % (self.get_start_id(), self.get_end_id())
-
- def get_start_id(self):
- return self.start
-
- def get_end_id(self):
- return self.start + self.nobjs - 1
-
- def get_nobjs(self):
- return self.nobjs
-
class PDFBaseXRef(object):
- def __init__(self):
- self.objid_ranges = None
- return
- def objids(self):
- if self.objid_ranges:
- for objid_range in self.objid_ranges:
- for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
- yield objid
- return
+ def get_trailer(self):
+ raise NotImplementedError
+
+ def get_pos(self, objid):
+ raise KeyError(objid)
## PDFXRef
##
class PDFXRef(PDFBaseXRef):
+
def __init__(self):
- PDFBaseXRef.__init__(self)
- self.offsets = None
+ self.offsets = {}
self.trailer = {}
return
- def __repr__(self):
- return '' % len(self.offsets)
-
def load(self, parser, debug=0):
- self.offsets = {}
- self.objid_ranges = []
while 1:
try:
(pos, line) = parser.nextline()
- if not line.strip():
- continue
+ if not line.strip(): continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
@@ -104,8 +79,6 @@ class PDFXRef(PDFBaseXRef):
(start, nobjs) = map(long, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
- self.newoffsets = {}
- self.objid_ranges.append(XRefObjRange(start, nobjs))
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
@@ -133,10 +106,33 @@ class PDFXRef(PDFBaseXRef):
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0]
- self.trailer.update( dict_value(dic))
+ self.trailer.update(dict_value(dic))
return
- def getpos(self, objid):
+ PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')  # matches an object header line: "<objid> <genno> obj"
+ def load_fallback(self, parser, debug=0):  # fill self.offsets by scanning the data for object header lines
+ parser.seek(0)  # start scanning from position 0
+ while 1:
+ try:
+ (pos, line) = parser.nextline()
+ except PSEOF:
+ break  # end of input: stop scanning
+ if line.startswith('trailer'):
+ parser.seek(pos)  # go back to the start of the 'trailer' line before parsing it
+ self.load_trailer(parser)
+ if 1 <= debug:
+ print >>stderr, 'trailer: %r' % self.get_trailer()
+ break  # scanning stops at the first trailer found
+ m = self.PDFOBJ_CUE.match(line)
+ if not m: continue  # not an object header line
+ (objid, genno) = m.groups()
+ self.offsets[int(objid)] = (0, pos)  # generation number is stored as 0; the parsed genno is not used
+ return
+
+ def get_trailer(self):
+ return self.trailer
+
+ def get_pos(self, objid):
try:
(genno, pos) = self.offsets[objid]
except KeyError:
@@ -149,10 +145,10 @@ class PDFXRef(PDFBaseXRef):
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
- PDFBaseXRef.__init__(self)
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
+ self.objid_ranges = []
return
def __repr__(self):
@@ -169,17 +165,22 @@ class PDFXRefStream(PDFBaseXRef):
index_array = stream.dic.get('Index', (0,size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
- self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ]
+ self.objid_ranges.extend( ObjIdRange(start, nobjs)
+ for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic
if debug:
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
- (', '.join(map(repr, self.objid_ranges), self.fl1, self.fl2, self.fl3)))
+                   (', '.join(map(repr, self.objid_ranges)),  # join takes only the iterable; close it here
+                    self.fl1, self.fl2, self.fl3))
return
- def getpos(self, objid):
+ def get_trailer(self):
+ return self.trailer
+
+ def get_pos(self, objid):
offset = 0
found = False
for objid_range in self.objid_ranges:
@@ -207,14 +208,35 @@ class PDFXRefStream(PDFBaseXRef):
## PDFPage
##
-## A PDFPage object is nothing more than a bunch of keys and values
-## that describe the properties of the page and point to its contents,
-## and has nothing to do with a real graphical entity. For a real graphical
-## object, look at layout.LTPage.
-##
class PDFPage(object):
+ """An object that holds the information about a page.
+
+ A PDFPage object is merely a convenience class that has a set
+ of keys and values, which describe the properties of a page
+ and point to its contents.
+
+ Attributes:
+ doc: a PDFDocument object.
+ pageid: any Python object that can uniquely identify the page.
+ attrs: a dictionary of page attributes.
+ contents: a list of PDFStream objects that represents the page content.
+ lastmod: the last modified time of the page.
+ resources: a list of resources used by the page.
+ mediabox: the physical size of the page.
+ cropbox: the crop rectangle of the page.
+ rotate: the page rotation (in degrees).
+ annots: the page annotations.
+ beads: a chain that represents natural reading order.
+ """
+
def __init__(self, doc, pageid, attrs):
+ """Initialize a page object.
+
+ doc: a PDFDocument object.
+ pageid: any Python object that can uniquely identify the page.
+ attrs: a dictionary of page attributes.
+ """
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
@@ -243,13 +265,23 @@ class PDFPage(object):
## PDFDocument
##
-## A PDFDocument object represents a PDF document.
-## Since a PDF file is usually pretty big, normally it is not loaded
-## at once. Rather it is parsed dynamically as processing goes.
-## A PDF parser is associated with the document.
-##
class PDFDocument(object):
+ """A PDFDocument object represents a PDF document.
+
+ Since a PDF file can be very big, it is normally not loaded all at
+ once. Each PDF document has an associated PDF parser object,
+ and the data stream is parsed dynamically as processing proceeds.
+
+ Typical usage:
+ doc = PDFDocument()
+ parser = PDFParser(fp)
+ parser.set_document(doc)
+ doc.set_parser(parser)
+ doc.initialize(password)
+
+ """
+
debug = 0
def __init__(self):
@@ -261,24 +293,23 @@ class PDFDocument(object):
self.parser = None
self.encryption = None
self.decipher = None
- self.ready = False
+ self._initialized = False
return
- # set_parser(parser)
- # Associates the document with an (already initialized) parser object.
def set_parser(self, parser):
+ "Set the document to use a given PDFParser object."
if self.parser: return
self.parser = parser
# The document is set to be temporarily ready during collecting
# all the basic information about the document, e.g.
# the header, the encryption information, and the access rights
# for the document.
- self.ready = True
+ self._initialized = True
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
self.xrefs = parser.read_xref()
for xref in self.xrefs:
- trailer = xref.trailer
+ trailer = xref.get_trailer()
if not trailer: continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
@@ -293,7 +324,7 @@ class PDFDocument(object):
# The document is set to be non-ready again, until all the
# proper initialization (asking the password key and
# verifying the access permission, so on) is finished.
- self.ready = False
+ self._initialized = False
return
# set_root(root)
@@ -315,7 +346,7 @@ class PDFDocument(object):
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
- self.ready = True
+ self._initialized = True
return
(docid, param) = self.encryption
if literal_name(param['Filter']) != 'Standard':
@@ -367,7 +398,7 @@ class PDFDocument(object):
raise PDFPasswordIncorrect
self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES
- self.ready = True
+ self._initialized = True
return
def decrypt_rc4(self, objid, genno, data):
@@ -378,7 +409,7 @@ class PDFDocument(object):
KEYWORD_OBJ = KWD('obj')
def getobj(self, objid):
- if not self.ready:
+ if not self._initialized:
raise PDFException('PDFDocument not initialized')
#assert self.xrefs
if 2 <= self.debug:
@@ -389,7 +420,7 @@ class PDFDocument(object):
else:
for xref in self.xrefs:
try:
- (strmid, index) = xref.getpos(objid)
+ (strmid, index) = xref.get_pos(objid)
break
except KeyError:
pass
@@ -411,7 +442,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
else:
- parser = PDFObjStrmParser(self, stream.get_data())
+ parser = PDFObjStrmParser(stream.get_data())
objs = []
try:
while 1:
@@ -458,7 +489,7 @@ class PDFDocument(object):
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self):
- if not self.ready:
+ if not self._initialized:
raise PDFException('PDFDocument is not initialized')
#assert self.xrefs
def search(obj, parent):
@@ -529,14 +560,15 @@ class PDFDocument(object):
##
class PDFParser(PSStackParser):
- def __init__(self, doc, fp):
+ def __init__(self, fp):
PSStackParser.__init__(self, fp)
- self.doc = doc
- self.doc.set_parser(self)
+ self.doc = None
return
- def __repr__(self):
- return ''
+ def set_document(self, doc):
+ "Associates the parser with a PDFDocument object."
+ self.doc = doc
+ return
KEYWORD_R = KWD('R')
KEYWORD_ENDOBJ = KWD('endobj')
@@ -647,7 +679,7 @@ class PDFParser(PSStackParser):
xref = PDFXRef()
xref.load(self, debug=self.debug)
xrefs.append(xref)
- trailer = xref.trailer
+ trailer = xref.get_trailer()
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % trailer
if 'XRefStm' in trailer:
@@ -669,26 +701,8 @@ class PDFParser(PSStackParser):
# fallback
if 1 <= self.debug:
print >>stderr, 'no xref, fallback'
- self.seek(0)
- pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
- offsets = {}
xref = PDFXRef()
- while 1:
- try:
- (pos, line) = self.nextline()
- except PSEOF:
- break
- if line.startswith('trailer'):
- xref.offsets = offsets
- self.seek(pos)
- xref.load_trailer(self)
- if 1 <= self.debug:
- print >>stderr, 'trailer: %r' % xref.trailer
- continue
- m = pat.match(line)
- if not m: continue
- (objid, genno) = m.groups()
- offsets[int(objid)] = (0, pos)
+ xref.load_fallback(self)
xrefs.append(xref)
return xrefs
@@ -697,8 +711,8 @@ class PDFParser(PSStackParser):
##
class PDFObjStrmParser(PSStackParser):
- def __init__(self, doc, data):
- PDFParser.__init__(self, doc, StringIO(data))
+ def __init__(self, data):
+ PSStackParser.__init__(self, StringIO(data))
return
def flush(self):
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 19799cc..5a9a918 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -135,3 +135,27 @@ def enc(x, codec='ascii'):
'''Encodes a string for SGML/XML/HTML'''
x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
return x.encode(codec, 'xmlcharrefreplace')
+
+
+## ObjIdRange
+##
+class ObjIdRange(object):
+
+    "A utility class to represent a contiguous range of object IDs: start .. start+nobjs-1."
+
+    def __init__(self, start, nobjs):  # start: first object ID; nobjs: number of IDs in the range
+        self.start = start
+        self.nobjs = nobjs
+        return
+
+    def __repr__(self):
+        return '<ObjIdRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
+
+    def get_start_id(self):
+        return self.start
+
+    def get_end_id(self):
+        return self.start + self.nobjs - 1  # inclusive last ID of the range
+
+    def get_nobjs(self):
+        return self.nobjs
diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py
index 4102ae6..dcb190e 100755
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@@ -12,7 +12,7 @@ def process_cid2code(fp, check_codecs=[]):
else:
return (name+'-H', name+'-V')
- def get_unicode(codes):
+ def get_unichr(codes):
# determine the "most popular" candidate.
d = {}
for code in codes:
@@ -26,7 +26,7 @@ def process_cid2code(fp, check_codecs=[]):
except UnicodeError:
pass
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
- return ord(chars[0])
+ return chars[0]
def put(dmap, code, cid, force=False):
for b in code[:-1]:
@@ -45,8 +45,8 @@ def process_cid2code(fp, check_codecs=[]):
names = []
code2cid = {} # {'cmapname': ...}
is_vertical = {}
- cid2unicode_h = {} # {cid: unicode}
- cid2unicode_v = {} # {cid: unicode}
+ cid2unichr_h = {} # {cid: unichr}
+ cid2unichr_v = {} # {cid: unichr}
for line in fp:
line = line.strip()
@@ -95,21 +95,21 @@ def process_cid2code(fp, check_codecs=[]):
put(hmap, code, cid, True)
if name.endswith('-UTF8'):
if hcodes:
- cid2unicode_h[cid] = get_unicode(hcodes)
+ cid2unichr_h[cid] = get_unichr(hcodes)
if vcodes:
- cid2unicode_v[cid] = get_unicode(vcodes)
+ cid2unichr_v[cid] = get_unichr(vcodes)
else:
for code in hcodes:
put(hmap, code, cid)
put(vmap, code, cid)
if name.endswith('-UTF8') and hcodes:
- code = get_unicode(hcodes)
- if cid not in cid2unicode_h:
- cid2unicode_h[cid] = code
- if cid not in cid2unicode_v:
- cid2unicode_v[cid] = code
+ code = get_unichr(hcodes)
+ if cid not in cid2unichr_h:
+ cid2unichr_h[cid] = code
+ if cid not in cid2unichr_v:
+ cid2unichr_v[cid] = code
- return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
+ return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)
# main
def main(argv):
@@ -128,7 +128,7 @@ def main(argv):
print >>sys.stderr, 'reading %r...' % src
fp = file(src)
- (code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
+ (code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs)
fp.close()
for (name, cmap) in code2cid.iteritems():
@@ -146,8 +146,8 @@ def main(argv):
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
- print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
- print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
+ print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
+ print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
fp.close()
return 0
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index 8b704a9..bc29280 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -99,7 +99,9 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None):
doc = PDFDocument()
fp = file(fname, 'rb')
- parser = PDFParser(doc, fp)
+ parser = PDFParser(fp)
+ parser.set_document(doc)
+ doc.set_parser(parser)
doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines():
@@ -119,7 +121,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None):
doc = PDFDocument()
fp = file(fname, 'rb')
- parser = PDFParser(doc, fp)
+ parser = PDFParser(fp)
+ parser.set_document(doc)
+ doc.set_parser(parser)
doc.initialize(password)
if objids:
for objid in objids: