Merge pull request #2 from Cybjit/master

CMap fixes and speed improvements
pull/4/head
Philippe Guglielmetti 2014-09-12 07:33:06 +02:00
commit 4f8aa9ff5b
13 changed files with 95 additions and 104 deletions

View File

@ -3,7 +3,7 @@
PACKAGE=pdfminer
PYTHON=python2
PYTHON=python
GIT=git
RM=rm -f
CP=cp -f

View File

@ -91,12 +91,11 @@ class CMap(CMapBase):
return
def decode(self, code):
logging.debug('decode: %r, %r' % (self, code))
logging.debug('decode: %r, %r', self, code)
d = self.code2cid
for i in six.iterbytes(code):
c = six.int2byte(i)
if c in d:
d = d[c]
if i in d:
d = d[i]
if isinstance(d, int):
yield d
d = self.code2cid
@ -142,7 +141,7 @@ class UnicodeMap(CMapBase):
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
def get_unichr(self, cid):
logging.debug('get_unichr: %r, %r' % (self, cid))
logging.debug('get_unichr: %r, %r', self, cid)
return self.cid2unichr[cid]
def dump(self, out=sys.stdout):
@ -229,7 +228,7 @@ class CMapDB(object):
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
logging.info('loading: %r' % name)
logging.info('loading: %r', name)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths:

View File

@ -117,7 +117,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return item.adv
def handle_undefined_char(self, font, cid):
logging.info('undefined: %r, %r' % (font, cid))
logging.info('undefined: %r, %r', font, cid)
return '(cid:%d)' % cid
def receive_layout(self, ltpage):

View File

@ -127,7 +127,7 @@ class PDFXRef(PDFBaseXRef):
if use != b'n':
continue
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
logging.info('xref objects: %r' % self.offsets)
logging.info('xref objects: %r', self.offsets)
self.load_trailer(parser)
return
@ -142,7 +142,7 @@ class PDFXRef(PDFBaseXRef):
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_, dic) = x[0]
self.trailer.update(dict_value(dic))
logging.debug('trailer=%r'%self.trailer)
logging.debug('trailer=%r', self.trailer)
return
def get_trailer(self):
@ -177,7 +177,7 @@ class PDFXRefFallback(PDFXRef):
if line.startswith(b'trailer'):
parser.seek(pos)
self.load_trailer(parser)
logging.info('trailer: %r' % self.get_trailer())
logging.info('trailer: %r', self.trailer)
break
if six.PY3:
line=line.decode('latin-1') #default pdf encoding
@ -244,9 +244,9 @@ class PDFXRefStream(PDFBaseXRef):
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs
logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3))
logging.info('xref stream: objid=%s, fields=%d,%d,%d',
', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)
return
def get_trailer(self):
@ -655,7 +655,7 @@ class PDFDocument(object):
assert objid != 0
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
logging.debug('getobj: objid=%r' % objid)
logging.debug('getobj: objid=%r', objid)
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
@ -680,7 +680,7 @@ class PDFDocument(object):
continue
else:
raise PDFObjectNotFound(objid)
logging.debug('register: objid=%r: %r' % (objid, obj))
logging.debug('register: objid=%r: %r', objid, obj)
if self.caching:
self._cached_objs[objid] = (obj, genno)
return obj
@ -753,14 +753,14 @@ class PDFDocument(object):
prev = None
for line in parser.revreadlines():
line = line.strip()
logging.debug('find_xref: %r' % line)
logging.debug('find_xref: %r', line)
if line == b'startxref':
break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
logging.info('xref found: pos=%r' % prev)
logging.info('xref found: pos=%r', prev)
return long(prev) if six.PY2 else int(prev)
# read xref table
@ -772,7 +772,7 @@ class PDFDocument(object):
(pos, token) = parser.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
logging.info('read_xref_from: start=%d, token=%r' % (start, token))
logging.info('read_xref_from: start=%d, token=%r', start, token)
if isinstance(token, int):
# XRefStream: PDF-1.5
parser.seek(pos)
@ -786,7 +786,7 @@ class PDFDocument(object):
xref.load(parser)
xrefs.append(xref)
trailer = xref.get_trailer()
logging.info('trailer: %r' % trailer)
logging.info('trailer: %r', trailer)
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(parser, pos, xrefs)

View File

@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
self.cidsysteminfo.get('Ordering', 'unknown'))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"),
self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
@ -728,7 +728,7 @@ class PDFCIDFont(PDFFont):
# main
def main(argv):
for fname in argv[1:]:
fp = file(fname, 'rb')
fp = open(fname, 'rb')
#font = TrueTypeFont(fname, fp)
font = CFFFont(fname, fp)
print (font)

View File

@ -166,7 +166,7 @@ class PDFResourceManager(object):
if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid]
else:
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
logging.info('get_font: create: objid=%r, spec=%r', objid, spec)
if STRICT:
if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font')
@ -340,7 +340,7 @@ class PDFPageInterpreter(object):
else:
return PREDEFINED_COLORSPACE.get(name)
for (k, v) in six.iteritems(dict_value(resources)):
logging.debug('Resource: %r: %r' % (k, v))
logging.debug('Resource: %r: %r', k, v)
if k == 'Font':
for (fontid, spec) in six.iteritems(dict_value(v)):
objid = None
@ -796,7 +796,7 @@ class PDFPageInterpreter(object):
if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return
logging.info('Processing xobj: %r' % xobj)
logging.info('Processing xobj: %r', xobj)
subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj:
interpreter = self.dup()
@ -819,7 +819,7 @@ class PDFPageInterpreter(object):
return
def process_page(self, page):
logging.info('Processing page: %r' % page)
logging.info('Processing page: %r', page)
(x0, y0, x1, y1) = page.mediabox
if page.rotate == 90:
ctm = (0, -1, 1, 0, -y0, x1)
@ -838,8 +838,8 @@ class PDFPageInterpreter(object):
# Render the content streams.
# This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm))
logging.info('render_contents: resources=%r, streams=%r, ctm=%r',
resources, streams, ctm)
self.init_resources(resources)
self.init_state(ctm)
self.execute(list_value(streams))
@ -864,11 +864,11 @@ class PDFPageInterpreter(object):
nargs = six.get_function_code(func).co_argcount-1
if nargs:
args = self.pop(nargs)
logging.debug('exec: %s %r' % (name, args))
logging.debug('exec: %s %r', name, args)
if len(args) == nargs:
func(*args)
else:
logging.debug('exec: %s' % name)
logging.debug('exec: %s', name)
func()
else:
if STRICT:

View File

@ -87,12 +87,12 @@ class PDFPage(object):
if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
logging.info('Pages: Kids=%r' % tree['Kids'])
logging.info('Pages: Kids=%r', tree['Kids'])
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type') is LITERAL_PAGE:
logging.info('Page: %r' % tree)
logging.info('Page: %r', tree)
yield (objid, tree)
pages = False
if 'Pages' in document.catalog:

View File

@ -120,7 +120,7 @@ class PDFParser(PSStackParser):
data += line
self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % (pos, objlen, dic, data[:10]))
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj))

View File

@ -101,7 +101,7 @@ def resolve_all(x, default=None):
def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object.
"""
if isinstance(x, str):
if isinstance(x, bytes):
return decipher(objid, genno, x)
if isinstance(x, list):
x = [decipher_all(decipher, objid, genno, v) for v in x]
@ -162,7 +162,7 @@ def dict_value(x):
if not isinstance(x, dict):
if STRICT:
import logging
logging.error('PDFTypeError : Dict required: %r' % x)
logging.error('PDFTypeError : Dict required: %r', x)
raise PDFTypeError('Dict required: %r' % x)
return {}
return x

View File

@ -6,19 +6,12 @@ import logging
import six # Python 2+3 compatibility
def bytes(s,i,j=None):
def bytesindex(s,i,j=None):
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
if six.PY2:
if j is None:
return s[i]
if j<0:
return s[i:]
return s[i:j]
else: # six.PY3
if i<0 : i=len(s)+i
if j is None: j=i+1
if j<0 : j=len(s)
return b''.join(six.int2byte(s[_]) for _ in range(i,j))
if i<0 : i=len(s)+i
if j is None: j=i+1
if j<0 : j=len(s)
return s[i:j]
from .utils import choplist
@ -214,14 +207,14 @@ class PSBaseParser(object):
if not pos:
pos = self.bufpos+self.charpos
self.fp.seek(pos)
logging.info('poll(%d): %r' % (pos, self.fp.read(n)))
logging.info('poll(%d): %r', pos, self.fp.read(n))
self.fp.seek(pos0)
return
def seek(self, pos):
"""Seeks the parser to the given position.
"""
logging.debug('seek: %r' % pos)
logging.debug('seek: %r', pos)
self.fp.seek(pos)
# reset the status for nextline()
self.bufpos = pos
@ -254,7 +247,7 @@ class PSBaseParser(object):
while 1:
self.fillbuf()
if eol:
c = bytes(self.buf,self.charpos)
c = bytesindex(self.buf,self.charpos)
# handle b'\r\n'
if c == b'\n':
linebuf += c
@ -262,16 +255,16 @@ class PSBaseParser(object):
break
m = EOL.search(self.buf, self.charpos)
if m:
linebuf += bytes(self.buf,self.charpos,m.end(0))
linebuf += bytesindex(self.buf,self.charpos,m.end(0))
self.charpos = m.end(0)
if bytes(linebuf,-1) == b'\r':
if bytesindex(linebuf,-1) == b'\r':
eol = True
else:
break
else:
linebuf += bytes(self.buf,self.charpos,-1)
linebuf += bytesindex(self.buf,self.charpos,-1)
self.charpos = len(self.buf)
logging.debug('nextline: %r, %r' % (linepos, linebuf))
logging.debug('nextline: %r, %r', linepos, linebuf)
return (linepos, linebuf)
@ -295,8 +288,8 @@ class PSBaseParser(object):
if n == -1:
buf = s + buf
break
yield bytes(s,n,-1)+buf
s = bytes(s,0,n)
yield bytesindex(s,n,-1)+buf
s = bytesindex(s,0,n)
buf = b''
return
@ -305,7 +298,7 @@ class PSBaseParser(object):
if not m:
return len(s)
j = m.start(0)
c = bytes(s,j)
c = bytesindex(s,j)
self._curtokenpos = self.bufpos+j
if c == b'%':
self._curtoken = b'%'
@ -351,10 +344,10 @@ class PSBaseParser(object):
def _parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return (self._parse_comment, len(s))
j = m.start(0)
self._curtoken += bytes(s,i,j)
self._curtoken += bytesindex(s,i,j)
self._parse1 = self._parse_main
# We ignore comments.
#self._tokens.append(self._curtoken)
@ -363,11 +356,11 @@ class PSBaseParser(object):
def _parse_literal(self, s, i):
m = END_LITERAL.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
c = bytes(s,j)
self._curtoken += bytesindex(s,i,j)
c = bytesindex(s,j)
if c == b'#':
self.hex = b''
self._parse1 = self._parse_literal_hex
@ -381,7 +374,7 @@ class PSBaseParser(object):
return j
def _parse_literal_hex(self, s, i):
c = bytes(s,i)
c = bytesindex(s,i)
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return i+1
@ -393,11 +386,11 @@ class PSBaseParser(object):
def _parse_number(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
c = bytes(s,j)
self._curtoken += bytesindex(s,i,j)
c = bytesindex(s,j)
if c == b'.':
self._curtoken += c
self._parse1 = self._parse_float
@ -412,10 +405,10 @@ class PSBaseParser(object):
def _parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
self._curtoken += bytesindex(s,i,j)
try:
self._add_token(float(self._curtoken))
except ValueError:
@ -426,10 +419,10 @@ class PSBaseParser(object):
def _parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
self._curtoken += bytesindex(s,i,j)
if self._curtoken == b'true':
token = True
elif self._curtoken == b'false':
@ -443,11 +436,11 @@ class PSBaseParser(object):
def _parse_string(self, s, i):
m = END_STRING.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
c = bytes(s,j)
self._curtoken += bytesindex(s,i,j)
c = bytesindex(s,j)
if c == b'\\':
self.oct = b''
self._parse1 = self._parse_string_1
@ -466,7 +459,7 @@ class PSBaseParser(object):
return j+1
def _parse_string_1(self, s, i):
c = bytes(s,i)
c = bytesindex(s,i)
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return i+1
@ -480,7 +473,7 @@ class PSBaseParser(object):
return i+1
def _parse_wopen(self, s, i):
c = bytes(s,i)
c = bytesindex(s,i)
if c == b'<':
self._add_token(KEYWORD_DICT_BEGIN)
self._parse1 = self._parse_main
@ -490,7 +483,7 @@ class PSBaseParser(object):
return i
def _parse_wclose(self, s, i):
c = bytes(s,i)
c = bytesindex(s,i)
if c == b'>':
self._add_token(KEYWORD_DICT_END)
i += 1
@ -500,10 +493,10 @@ class PSBaseParser(object):
def _parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i)
if not m:
self._curtoken += bytes(s,i,-1)
self._curtoken += bytesindex(s,i,-1)
return len(s)
j = m.start(0)
self._curtoken += bytes(s,i,j)
self._curtoken += bytesindex(s,i,j)
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
self._add_token(token)
self._parse1 = self._parse_main
@ -514,7 +507,7 @@ class PSBaseParser(object):
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0)
logging.debug('nexttoken: (%r:%r)' % token)
logging.debug('nexttoken: %r', token)
return token
@ -555,7 +548,7 @@ class PSStackParser(PSBaseParser):
def add_results(self, *objs):
try:
logging.debug('add_results: %s' % repr(objs))
logging.debug('add_results: %r', objs)
except:
logging.debug('add_results: (unprintable object)')
self.results.extend(objs)
@ -564,7 +557,7 @@ class PSStackParser(PSBaseParser):
def start_type(self, pos, type):
self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, [])
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
logging.debug('start_type: pos=%r, type=%r', pos, type)
return
def end_type(self, type):
@ -572,7 +565,7 @@ class PSStackParser(PSBaseParser):
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop()
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
return (pos, objs)
def do_keyword(self, pos, token):
@ -626,10 +619,10 @@ class PSStackParser(PSBaseParser):
if STRICT:
raise
elif isinstance(token,PSKeyword):
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
self.do_keyword(pos, token)
else:
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
logging.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
self.do_keyword(pos, token)
raise
if self.context:
@ -638,7 +631,7 @@ class PSStackParser(PSBaseParser):
self.flush()
obj = self.results.pop(0)
try:
logging.debug('nextobject: %s' % repr(obj))
logging.debug('nextobject: %r', obj)
except:
logging.debug('nextobject: (unprintable object)')
return obj

View File

@ -4,6 +4,8 @@ try:
import cPickle as pickle
except ImportError:
import pickle as pickle
import codecs
import six
## CMapConverter
@ -56,14 +58,17 @@ class CMapConverter(object):
def put(dmap, code, cid, force=False):
for b in code[:-1]:
b = ord(b)
if six.PY2:
b = ord(b)
if b in dmap:
dmap = dmap[b]
else:
d = {}
dmap[b] = d
dmap = d
b = ord(code[-1])
b = code[-1]
if six.PY2:
b = ord(b)
if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid
return
@ -83,8 +88,8 @@ class CMapConverter(object):
return
def pick(unimap):
chars = unimap.items()
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
chars = list(unimap.items())
chars.sort(key=(lambda x:(x[1],-ord(x[0]))), reverse=True)
(c,_) = chars[0]
return c
@ -103,7 +108,7 @@ class CMapConverter(object):
if vertical:
code = code[:-1]
try:
code = code.decode('hex')
code = codecs.decode(code, 'hex_codec')
except:
code = chr(int(code, 16))
if vertical:
@ -138,7 +143,7 @@ class CMapConverter(object):
IS_VERTICAL=self.is_vertical.get(enc, False),
CODE2CID=self.code2cid.get(enc),
)
fp.write(pickle.dumps(data))
fp.write(pickle.dumps(data, 2))
return
def dump_unicodemap(self, fp):
@ -146,7 +151,7 @@ class CMapConverter(object):
CID2UNICHR_H=self.cid2unichr_h,
CID2UNICHR_V=self.cid2unichr_v,
)
fp.write(pickle.dumps(data))
fp.write(pickle.dumps(data, 2))
return
# main
@ -175,7 +180,7 @@ def main(argv):
converter = CMapConverter(enc2codec)
for path in args:
print ('reading: %r...' % path)
fp = file(path)
fp = open(path)
converter.load(fp)
fp.close()

View File

@ -247,7 +247,7 @@ def main(argv):
outfp = sys.stdout
extractdir = None
for (k, v) in opts:
if k == '-d': logging.getLogger().setlevel(logging.DEBUG)
if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
elif k == '-o': outfp = open(v, 'w')
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )

View File

@ -9,6 +9,7 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import logging
# main
def main(argv):
@ -25,8 +26,6 @@ def main(argv):
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
password = b''
pagenos = set()
@ -45,7 +44,7 @@ def main(argv):
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
@ -66,11 +65,6 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'text'
@ -97,7 +91,7 @@ def main(argv):
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter, debug=debug)
imagewriter=imagewriter)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: