Merge pull request #2 from Cybjit/master

CMap fixes and speed improvements
pull/4/head
Philippe Guglielmetti 2014-09-12 07:33:06 +02:00
commit 4f8aa9ff5b
13 changed files with 95 additions and 104 deletions

View File

@ -3,7 +3,7 @@
PACKAGE=pdfminer PACKAGE=pdfminer
PYTHON=python2 PYTHON=python
GIT=git GIT=git
RM=rm -f RM=rm -f
CP=cp -f CP=cp -f

View File

@ -91,12 +91,11 @@ class CMap(CMapBase):
return return
def decode(self, code): def decode(self, code):
logging.debug('decode: %r, %r' % (self, code)) logging.debug('decode: %r, %r', self, code)
d = self.code2cid d = self.code2cid
for i in six.iterbytes(code): for i in six.iterbytes(code):
c = six.int2byte(i) if i in d:
if c in d: d = d[i]
d = d[c]
if isinstance(d, int): if isinstance(d, int):
yield d yield d
d = self.code2cid d = self.code2cid
@ -142,7 +141,7 @@ class UnicodeMap(CMapBase):
return '<UnicodeMap: %s>' % self.attrs.get('CMapName') return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
def get_unichr(self, cid): def get_unichr(self, cid):
logging.debug('get_unichr: %r, %r' % (self, cid)) logging.debug('get_unichr: %r, %r', self, cid)
return self.cid2unichr[cid] return self.cid2unichr[cid]
def dump(self, out=sys.stdout): def dump(self, out=sys.stdout):
@ -229,7 +228,7 @@ class CMapDB(object):
@classmethod @classmethod
def _load_data(klass, name): def _load_data(klass, name):
filename = '%s.pickle.gz' % name filename = '%s.pickle.gz' % name
logging.info('loading: %r' % name) logging.info('loading: %r', name)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),) os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths: for directory in cmap_paths:

View File

@ -117,7 +117,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return item.adv return item.adv
def handle_undefined_char(self, font, cid): def handle_undefined_char(self, font, cid):
logging.info('undefined: %r, %r' % (font, cid)) logging.info('undefined: %r, %r', font, cid)
return '(cid:%d)' % cid return '(cid:%d)' % cid
def receive_layout(self, ltpage): def receive_layout(self, ltpage):

View File

@ -127,7 +127,7 @@ class PDFXRef(PDFBaseXRef):
if use != b'n': if use != b'n':
continue continue
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno)) self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
logging.info('xref objects: %r' % self.offsets) logging.info('xref objects: %r', self.offsets)
self.load_trailer(parser) self.load_trailer(parser)
return return
@ -142,7 +142,7 @@ class PDFXRef(PDFBaseXRef):
raise PDFNoValidXRef('Unexpected EOF - file corrupted') raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_, dic) = x[0] (_, dic) = x[0]
self.trailer.update(dict_value(dic)) self.trailer.update(dict_value(dic))
logging.debug('trailer=%r'%self.trailer) logging.debug('trailer=%r', self.trailer)
return return
def get_trailer(self): def get_trailer(self):
@ -177,7 +177,7 @@ class PDFXRefFallback(PDFXRef):
if line.startswith(b'trailer'): if line.startswith(b'trailer'):
parser.seek(pos) parser.seek(pos)
self.load_trailer(parser) self.load_trailer(parser)
logging.info('trailer: %r' % self.get_trailer()) logging.info('trailer: %r', self.trailer)
break break
if six.PY3: if six.PY3:
line=line.decode('latin-1') #default pdf encoding line=line.decode('latin-1') #default pdf encoding
@ -244,9 +244,9 @@ class PDFXRefStream(PDFBaseXRef):
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs self.trailer = stream.attrs
logging.info('xref stream: objid=%s, fields=%d,%d,%d' % logging.info('xref stream: objid=%s, fields=%d,%d,%d',
(', '.join(map(repr, self.ranges)), ', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)) self.fl1, self.fl2, self.fl3)
return return
def get_trailer(self): def get_trailer(self):
@ -655,7 +655,7 @@ class PDFDocument(object):
assert objid != 0 assert objid != 0
if not self.xrefs: if not self.xrefs:
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
logging.debug('getobj: objid=%r' % objid) logging.debug('getobj: objid=%r', objid)
if objid in self._cached_objs: if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid] (obj, genno) = self._cached_objs[objid]
else: else:
@ -680,7 +680,7 @@ class PDFDocument(object):
continue continue
else: else:
raise PDFObjectNotFound(objid) raise PDFObjectNotFound(objid)
logging.debug('register: objid=%r: %r' % (objid, obj)) logging.debug('register: objid=%r: %r', objid, obj)
if self.caching: if self.caching:
self._cached_objs[objid] = (obj, genno) self._cached_objs[objid] = (obj, genno)
return obj return obj
@ -753,14 +753,14 @@ class PDFDocument(object):
prev = None prev = None
for line in parser.revreadlines(): for line in parser.revreadlines():
line = line.strip() line = line.strip()
logging.debug('find_xref: %r' % line) logging.debug('find_xref: %r', line)
if line == b'startxref': if line == b'startxref':
break break
if line: if line:
prev = line prev = line
else: else:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
logging.info('xref found: pos=%r' % prev) logging.info('xref found: pos=%r', prev)
return long(prev) if six.PY2 else int(prev) return long(prev) if six.PY2 else int(prev)
# read xref table # read xref table
@ -772,7 +772,7 @@ class PDFDocument(object):
(pos, token) = parser.nexttoken() (pos, token) = parser.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
logging.info('read_xref_from: start=%d, token=%r' % (start, token)) logging.info('read_xref_from: start=%d, token=%r', start, token)
if isinstance(token, int): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
parser.seek(pos) parser.seek(pos)
@ -786,7 +786,7 @@ class PDFDocument(object):
xref.load(parser) xref.load(parser)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.get_trailer() trailer = xref.get_trailer()
logging.info('trailer: %r' % trailer) logging.info('trailer: %r', trailer)
if 'XRefStm' in trailer: if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm']) pos = int_value(trailer['XRefStm'])
self.read_xref_from(parser, pos, xrefs) self.read_xref_from(parser, pos, xrefs)

View File

@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('BaseFont is missing') raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown' self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"),
self.cidsysteminfo.get('Ordering', 'unknown')) self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1"))
try: try:
name = literal_name(spec['Encoding']) name = literal_name(spec['Encoding'])
except KeyError: except KeyError:
@ -728,7 +728,7 @@ class PDFCIDFont(PDFFont):
# main # main
def main(argv): def main(argv):
for fname in argv[1:]: for fname in argv[1:]:
fp = file(fname, 'rb') fp = open(fname, 'rb')
#font = TrueTypeFont(fname, fp) #font = TrueTypeFont(fname, fp)
font = CFFFont(fname, fp) font = CFFFont(fname, fp)
print (font) print (font)

View File

@ -166,7 +166,7 @@ class PDFResourceManager(object):
if objid and objid in self._cached_fonts: if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid] font = self._cached_fonts[objid]
else: else:
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) logging.info('get_font: create: objid=%r, spec=%r', objid, spec)
if STRICT: if STRICT:
if spec['Type'] is not LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font') raise PDFFontError('Type is not /Font')
@ -340,7 +340,7 @@ class PDFPageInterpreter(object):
else: else:
return PREDEFINED_COLORSPACE.get(name) return PREDEFINED_COLORSPACE.get(name)
for (k, v) in six.iteritems(dict_value(resources)): for (k, v) in six.iteritems(dict_value(resources)):
logging.debug('Resource: %r: %r' % (k, v)) logging.debug('Resource: %r: %r', k, v)
if k == 'Font': if k == 'Font':
for (fontid, spec) in six.iteritems(dict_value(v)): for (fontid, spec) in six.iteritems(dict_value(v)):
objid = None objid = None
@ -796,7 +796,7 @@ class PDFPageInterpreter(object):
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
logging.info('Processing xobj: %r' % xobj) logging.info('Processing xobj: %r', xobj)
subtype = xobj.get('Subtype') subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj: if subtype is LITERAL_FORM and 'BBox' in xobj:
interpreter = self.dup() interpreter = self.dup()
@ -819,7 +819,7 @@ class PDFPageInterpreter(object):
return return
def process_page(self, page): def process_page(self, page):
logging.info('Processing page: %r' % page) logging.info('Processing page: %r', page)
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
if page.rotate == 90: if page.rotate == 90:
ctm = (0, -1, 1, 0, -y0, x1) ctm = (0, -1, 1, 0, -y0, x1)
@ -838,8 +838,8 @@ class PDFPageInterpreter(object):
# Render the content streams. # Render the content streams.
# This method may be called recursively. # This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % logging.info('render_contents: resources=%r, streams=%r, ctm=%r',
(resources, streams, ctm)) resources, streams, ctm)
self.init_resources(resources) self.init_resources(resources)
self.init_state(ctm) self.init_state(ctm)
self.execute(list_value(streams)) self.execute(list_value(streams))
@ -864,11 +864,11 @@ class PDFPageInterpreter(object):
nargs = six.get_function_code(func).co_argcount-1 nargs = six.get_function_code(func).co_argcount-1
if nargs: if nargs:
args = self.pop(nargs) args = self.pop(nargs)
logging.debug('exec: %s %r' % (name, args)) logging.debug('exec: %s %r', name, args)
if len(args) == nargs: if len(args) == nargs:
func(*args) func(*args)
else: else:
logging.debug('exec: %s' % name) logging.debug('exec: %s', name)
func() func()
else: else:
if STRICT: if STRICT:

View File

@ -87,12 +87,12 @@ class PDFPage(object):
if k in klass.INHERITABLE_ATTRS and k not in tree: if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
logging.info('Pages: Kids=%r' % tree['Kids']) logging.info('Pages: Kids=%r', tree['Kids'])
for c in list_value(tree['Kids']): for c in list_value(tree['Kids']):
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type') is LITERAL_PAGE: elif tree.get('Type') is LITERAL_PAGE:
logging.info('Page: %r' % tree) logging.info('Page: %r', tree)
yield (objid, tree) yield (objid, tree)
pages = False pages = False
if 'Pages' in document.catalog: if 'Pages' in document.catalog:

View File

@ -120,7 +120,7 @@ class PDFParser(PSStackParser):
data += line data += line
self.seek(pos+objlen) self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary # XXX limit objlen not to exceed object boundary
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % (pos, objlen, dic, data[:10])) logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher) obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj)) self.push((pos, obj))

View File

@ -101,7 +101,7 @@ def resolve_all(x, default=None):
def decipher_all(decipher, objid, genno, x): def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object. """Recursively deciphers the given object.
""" """
if isinstance(x, str): if isinstance(x, bytes):
return decipher(objid, genno, x) return decipher(objid, genno, x)
if isinstance(x, list): if isinstance(x, list):
x = [decipher_all(decipher, objid, genno, v) for v in x] x = [decipher_all(decipher, objid, genno, v) for v in x]
@ -162,7 +162,7 @@ def dict_value(x):
if not isinstance(x, dict): if not isinstance(x, dict):
if STRICT: if STRICT:
import logging import logging
logging.error('PDFTypeError : Dict required: %r' % x) logging.error('PDFTypeError : Dict required: %r', x)
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError('Dict required: %r' % x)
return {} return {}
return x return x

View File

@ -6,19 +6,12 @@ import logging
import six # Python 2+3 compatibility import six # Python 2+3 compatibility
def bytes(s,i,j=None): def bytesindex(s,i,j=None):
"""implements s[i], s[i:], s[i:j] for Python2 and Python3""" """implements s[i], s[i:], s[i:j] for Python2 and Python3"""
if six.PY2: if i<0 : i=len(s)+i
if j is None: if j is None: j=i+1
return s[i] if j<0 : j=len(s)
if j<0: return s[i:j]
return s[i:]
return s[i:j]
else: # six.PY3
if i<0 : i=len(s)+i
if j is None: j=i+1
if j<0 : j=len(s)
return b''.join(six.int2byte(s[_]) for _ in range(i,j))
from .utils import choplist from .utils import choplist
@ -214,14 +207,14 @@ class PSBaseParser(object):
if not pos: if not pos:
pos = self.bufpos+self.charpos pos = self.bufpos+self.charpos
self.fp.seek(pos) self.fp.seek(pos)
logging.info('poll(%d): %r' % (pos, self.fp.read(n))) logging.info('poll(%d): %r', pos, self.fp.read(n))
self.fp.seek(pos0) self.fp.seek(pos0)
return return
def seek(self, pos): def seek(self, pos):
"""Seeks the parser to the given position. """Seeks the parser to the given position.
""" """
logging.debug('seek: %r' % pos) logging.debug('seek: %r', pos)
self.fp.seek(pos) self.fp.seek(pos)
# reset the status for nextline() # reset the status for nextline()
self.bufpos = pos self.bufpos = pos
@ -254,7 +247,7 @@ class PSBaseParser(object):
while 1: while 1:
self.fillbuf() self.fillbuf()
if eol: if eol:
c = bytes(self.buf,self.charpos) c = bytesindex(self.buf,self.charpos)
# handle b'\r\n' # handle b'\r\n'
if c == b'\n': if c == b'\n':
linebuf += c linebuf += c
@ -262,16 +255,16 @@ class PSBaseParser(object):
break break
m = EOL.search(self.buf, self.charpos) m = EOL.search(self.buf, self.charpos)
if m: if m:
linebuf += bytes(self.buf,self.charpos,m.end(0)) linebuf += bytesindex(self.buf,self.charpos,m.end(0))
self.charpos = m.end(0) self.charpos = m.end(0)
if bytes(linebuf,-1) == b'\r': if bytesindex(linebuf,-1) == b'\r':
eol = True eol = True
else: else:
break break
else: else:
linebuf += bytes(self.buf,self.charpos,-1) linebuf += bytesindex(self.buf,self.charpos,-1)
self.charpos = len(self.buf) self.charpos = len(self.buf)
logging.debug('nextline: %r, %r' % (linepos, linebuf)) logging.debug('nextline: %r, %r', linepos, linebuf)
return (linepos, linebuf) return (linepos, linebuf)
@ -295,8 +288,8 @@ class PSBaseParser(object):
if n == -1: if n == -1:
buf = s + buf buf = s + buf
break break
yield bytes(s,n,-1)+buf yield bytesindex(s,n,-1)+buf
s = bytes(s,0,n) s = bytesindex(s,0,n)
buf = b'' buf = b''
return return
@ -305,7 +298,7 @@ class PSBaseParser(object):
if not m: if not m:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
c = bytes(s,j) c = bytesindex(s,j)
self._curtokenpos = self.bufpos+j self._curtokenpos = self.bufpos+j
if c == b'%': if c == b'%':
self._curtoken = b'%' self._curtoken = b'%'
@ -351,10 +344,10 @@ class PSBaseParser(object):
def _parse_comment(self, s, i): def _parse_comment(self, s, i):
m = EOL.search(s, i) m = EOL.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return (self._parse_comment, len(s)) return (self._parse_comment, len(s))
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
self._parse1 = self._parse_main self._parse1 = self._parse_main
# We ignore comments. # We ignore comments.
#self._tokens.append(self._curtoken) #self._tokens.append(self._curtoken)
@ -363,11 +356,11 @@ class PSBaseParser(object):
def _parse_literal(self, s, i): def _parse_literal(self, s, i):
m = END_LITERAL.search(s, i) m = END_LITERAL.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
c = bytes(s,j) c = bytesindex(s,j)
if c == b'#': if c == b'#':
self.hex = b'' self.hex = b''
self._parse1 = self._parse_literal_hex self._parse1 = self._parse_literal_hex
@ -381,7 +374,7 @@ class PSBaseParser(object):
return j return j
def _parse_literal_hex(self, s, i): def _parse_literal_hex(self, s, i):
c = bytes(s,i) c = bytesindex(s,i)
if HEX.match(c) and len(self.hex) < 2: if HEX.match(c) and len(self.hex) < 2:
self.hex += c self.hex += c
return i+1 return i+1
@ -393,11 +386,11 @@ class PSBaseParser(object):
def _parse_number(self, s, i): def _parse_number(self, s, i):
m = END_NUMBER.search(s, i) m = END_NUMBER.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
c = bytes(s,j) c = bytesindex(s,j)
if c == b'.': if c == b'.':
self._curtoken += c self._curtoken += c
self._parse1 = self._parse_float self._parse1 = self._parse_float
@ -412,10 +405,10 @@ class PSBaseParser(object):
def _parse_float(self, s, i): def _parse_float(self, s, i):
m = END_NUMBER.search(s, i) m = END_NUMBER.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
try: try:
self._add_token(float(self._curtoken)) self._add_token(float(self._curtoken))
except ValueError: except ValueError:
@ -426,10 +419,10 @@ class PSBaseParser(object):
def _parse_keyword(self, s, i): def _parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i) m = END_KEYWORD.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
if self._curtoken == b'true': if self._curtoken == b'true':
token = True token = True
elif self._curtoken == b'false': elif self._curtoken == b'false':
@ -443,11 +436,11 @@ class PSBaseParser(object):
def _parse_string(self, s, i): def _parse_string(self, s, i):
m = END_STRING.search(s, i) m = END_STRING.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
c = bytes(s,j) c = bytesindex(s,j)
if c == b'\\': if c == b'\\':
self.oct = b'' self.oct = b''
self._parse1 = self._parse_string_1 self._parse1 = self._parse_string_1
@ -466,7 +459,7 @@ class PSBaseParser(object):
return j+1 return j+1
def _parse_string_1(self, s, i): def _parse_string_1(self, s, i):
c = bytes(s,i) c = bytesindex(s,i)
if OCT_STRING.match(c) and len(self.oct) < 3: if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c self.oct += c
return i+1 return i+1
@ -480,7 +473,7 @@ class PSBaseParser(object):
return i+1 return i+1
def _parse_wopen(self, s, i): def _parse_wopen(self, s, i):
c = bytes(s,i) c = bytesindex(s,i)
if c == b'<': if c == b'<':
self._add_token(KEYWORD_DICT_BEGIN) self._add_token(KEYWORD_DICT_BEGIN)
self._parse1 = self._parse_main self._parse1 = self._parse_main
@ -490,7 +483,7 @@ class PSBaseParser(object):
return i return i
def _parse_wclose(self, s, i): def _parse_wclose(self, s, i):
c = bytes(s,i) c = bytesindex(s,i)
if c == b'>': if c == b'>':
self._add_token(KEYWORD_DICT_END) self._add_token(KEYWORD_DICT_END)
i += 1 i += 1
@ -500,10 +493,10 @@ class PSBaseParser(object):
def _parse_hexstring(self, s, i): def _parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i) m = END_HEX_STRING.search(s, i)
if not m: if not m:
self._curtoken += bytes(s,i,-1) self._curtoken += bytesindex(s,i,-1)
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += bytes(s,i,j) self._curtoken += bytesindex(s,i,j)
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken)) token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
self._add_token(token) self._add_token(token)
self._parse1 = self._parse_main self._parse1 = self._parse_main
@ -514,7 +507,7 @@ class PSBaseParser(object):
self.fillbuf() self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos) self.charpos = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0) token = self._tokens.pop(0)
logging.debug('nexttoken: (%r:%r)' % token) logging.debug('nexttoken: %r', token)
return token return token
@ -555,7 +548,7 @@ class PSStackParser(PSBaseParser):
def add_results(self, *objs): def add_results(self, *objs):
try: try:
logging.debug('add_results: %s' % repr(objs)) logging.debug('add_results: %r', objs)
except: except:
logging.debug('add_results: (unprintable object)') logging.debug('add_results: (unprintable object)')
self.results.extend(objs) self.results.extend(objs)
@ -564,7 +557,7 @@ class PSStackParser(PSBaseParser):
def start_type(self, pos, type): def start_type(self, pos, type):
self.context.append((pos, self.curtype, self.curstack)) self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, []) (self.curtype, self.curstack) = (type, [])
logging.debug('start_type: pos=%r, type=%r' % (pos, type)) logging.debug('start_type: pos=%r, type=%r', pos, type)
return return
def end_type(self, type): def end_type(self, type):
@ -572,7 +565,7 @@ class PSStackParser(PSBaseParser):
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [obj for (_, obj) in self.curstack] objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop() (pos, self.curtype, self.curstack) = self.context.pop()
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)) logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
return (pos, objs) return (pos, objs)
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
@ -626,10 +619,10 @@ class PSStackParser(PSBaseParser):
if STRICT: if STRICT:
raise raise
elif isinstance(token,PSKeyword): elif isinstance(token,PSKeyword):
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack)) logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
self.do_keyword(pos, token) self.do_keyword(pos, token)
else: else:
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack)) logging.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
self.do_keyword(pos, token) self.do_keyword(pos, token)
raise raise
if self.context: if self.context:
@ -638,7 +631,7 @@ class PSStackParser(PSBaseParser):
self.flush() self.flush()
obj = self.results.pop(0) obj = self.results.pop(0)
try: try:
logging.debug('nextobject: %s' % repr(obj)) logging.debug('nextobject: %r', obj)
except: except:
logging.debug('nextobject: (unprintable object)') logging.debug('nextobject: (unprintable object)')
return obj return obj

View File

@ -4,6 +4,8 @@ try:
import cPickle as pickle import cPickle as pickle
except ImportError: except ImportError:
import pickle as pickle import pickle as pickle
import codecs
import six
## CMapConverter ## CMapConverter
@ -56,14 +58,17 @@ class CMapConverter(object):
def put(dmap, code, cid, force=False): def put(dmap, code, cid, force=False):
for b in code[:-1]: for b in code[:-1]:
b = ord(b) if six.PY2:
b = ord(b)
if b in dmap: if b in dmap:
dmap = dmap[b] dmap = dmap[b]
else: else:
d = {} d = {}
dmap[b] = d dmap[b] = d
dmap = d dmap = d
b = ord(code[-1]) b = code[-1]
if six.PY2:
b = ord(b)
if force or ((b not in dmap) or dmap[b] == cid): if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid dmap[b] = cid
return return
@ -83,8 +88,8 @@ class CMapConverter(object):
return return
def pick(unimap): def pick(unimap):
chars = unimap.items() chars = list(unimap.items())
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True) chars.sort(key=(lambda x:(x[1],-ord(x[0]))), reverse=True)
(c,_) = chars[0] (c,_) = chars[0]
return c return c
@ -103,7 +108,7 @@ class CMapConverter(object):
if vertical: if vertical:
code = code[:-1] code = code[:-1]
try: try:
code = code.decode('hex') code = codecs.decode(code, 'hex_codec')
except: except:
code = chr(int(code, 16)) code = chr(int(code, 16))
if vertical: if vertical:
@ -138,7 +143,7 @@ class CMapConverter(object):
IS_VERTICAL=self.is_vertical.get(enc, False), IS_VERTICAL=self.is_vertical.get(enc, False),
CODE2CID=self.code2cid.get(enc), CODE2CID=self.code2cid.get(enc),
) )
fp.write(pickle.dumps(data)) fp.write(pickle.dumps(data, 2))
return return
def dump_unicodemap(self, fp): def dump_unicodemap(self, fp):
@ -146,7 +151,7 @@ class CMapConverter(object):
CID2UNICHR_H=self.cid2unichr_h, CID2UNICHR_H=self.cid2unichr_h,
CID2UNICHR_V=self.cid2unichr_v, CID2UNICHR_V=self.cid2unichr_v,
) )
fp.write(pickle.dumps(data)) fp.write(pickle.dumps(data, 2))
return return
# main # main
@ -175,7 +180,7 @@ def main(argv):
converter = CMapConverter(enc2codec) converter = CMapConverter(enc2codec)
for path in args: for path in args:
print ('reading: %r...' % path) print ('reading: %r...' % path)
fp = file(path) fp = open(path)
converter.load(fp) converter.load(fp)
fp.close() fp.close()

View File

@ -247,7 +247,7 @@ def main(argv):
outfp = sys.stdout outfp = sys.stdout
extractdir = None extractdir = None
for (k, v) in opts: for (k, v) in opts:
if k == '-d': logging.getLogger().setlevel(logging.DEBUG) if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
elif k == '-o': outfp = open(v, 'w') elif k == '-o': outfp = open(v, 'w')
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )

View File

@ -9,6 +9,7 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter from pdfminer.image import ImageWriter
import logging
# main # main
def main(argv): def main(argv):
@ -25,8 +26,6 @@ def main(argv):
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
# debug option
debug = 0
# input option # input option
password = b'' password = b''
pagenos = set() pagenos = set()
@ -45,7 +44,7 @@ def main(argv):
showpageno = True showpageno = True
laparams = LAParams() laparams = LAParams()
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
elif k == '-P': password = v elif k == '-P': password = v
@ -66,11 +65,6 @@ def main(argv):
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
# #
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching) rsrcmgr = PDFResourceManager(caching=caching)
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
@ -97,7 +91,7 @@ def main(argv):
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter, debug=debug) imagewriter=imagewriter)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else: