commit
4f8aa9ff5b
2
Makefile
2
Makefile
|
@ -3,7 +3,7 @@
|
|||
|
||||
PACKAGE=pdfminer
|
||||
|
||||
PYTHON=python2
|
||||
PYTHON=python
|
||||
GIT=git
|
||||
RM=rm -f
|
||||
CP=cp -f
|
||||
|
|
|
@ -91,12 +91,11 @@ class CMap(CMapBase):
|
|||
return
|
||||
|
||||
def decode(self, code):
|
||||
logging.debug('decode: %r, %r' % (self, code))
|
||||
logging.debug('decode: %r, %r', self, code)
|
||||
d = self.code2cid
|
||||
for i in six.iterbytes(code):
|
||||
c = six.int2byte(i)
|
||||
if c in d:
|
||||
d = d[c]
|
||||
if i in d:
|
||||
d = d[i]
|
||||
if isinstance(d, int):
|
||||
yield d
|
||||
d = self.code2cid
|
||||
|
@ -142,7 +141,7 @@ class UnicodeMap(CMapBase):
|
|||
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
||||
|
||||
def get_unichr(self, cid):
|
||||
logging.debug('get_unichr: %r, %r' % (self, cid))
|
||||
logging.debug('get_unichr: %r, %r', self, cid)
|
||||
return self.cid2unichr[cid]
|
||||
|
||||
def dump(self, out=sys.stdout):
|
||||
|
@ -229,7 +228,7 @@ class CMapDB(object):
|
|||
@classmethod
|
||||
def _load_data(klass, name):
|
||||
filename = '%s.pickle.gz' % name
|
||||
logging.info('loading: %r' % name)
|
||||
logging.info('loading: %r', name)
|
||||
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
||||
for directory in cmap_paths:
|
||||
|
|
|
@ -117,7 +117,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
return item.adv
|
||||
|
||||
def handle_undefined_char(self, font, cid):
|
||||
logging.info('undefined: %r, %r' % (font, cid))
|
||||
logging.info('undefined: %r, %r', font, cid)
|
||||
return '(cid:%d)' % cid
|
||||
|
||||
def receive_layout(self, ltpage):
|
||||
|
|
|
@ -127,7 +127,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
if use != b'n':
|
||||
continue
|
||||
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
|
||||
logging.info('xref objects: %r' % self.offsets)
|
||||
logging.info('xref objects: %r', self.offsets)
|
||||
self.load_trailer(parser)
|
||||
return
|
||||
|
||||
|
@ -142,7 +142,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||
(_, dic) = x[0]
|
||||
self.trailer.update(dict_value(dic))
|
||||
logging.debug('trailer=%r'%self.trailer)
|
||||
logging.debug('trailer=%r', self.trailer)
|
||||
return
|
||||
|
||||
def get_trailer(self):
|
||||
|
@ -177,7 +177,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
if line.startswith(b'trailer'):
|
||||
parser.seek(pos)
|
||||
self.load_trailer(parser)
|
||||
logging.info('trailer: %r' % self.get_trailer())
|
||||
logging.info('trailer: %r', self.trailer)
|
||||
break
|
||||
if six.PY3:
|
||||
line=line.decode('latin-1') #default pdf encoding
|
||||
|
@ -244,9 +244,9 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
self.data = stream.get_data()
|
||||
self.entlen = self.fl1+self.fl2+self.fl3
|
||||
self.trailer = stream.attrs
|
||||
logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||
(', '.join(map(repr, self.ranges)),
|
||||
self.fl1, self.fl2, self.fl3))
|
||||
logging.info('xref stream: objid=%s, fields=%d,%d,%d',
|
||||
', '.join(map(repr, self.ranges)),
|
||||
self.fl1, self.fl2, self.fl3)
|
||||
return
|
||||
|
||||
def get_trailer(self):
|
||||
|
@ -655,7 +655,7 @@ class PDFDocument(object):
|
|||
assert objid != 0
|
||||
if not self.xrefs:
|
||||
raise PDFException('PDFDocument is not initialized')
|
||||
logging.debug('getobj: objid=%r' % objid)
|
||||
logging.debug('getobj: objid=%r', objid)
|
||||
if objid in self._cached_objs:
|
||||
(obj, genno) = self._cached_objs[objid]
|
||||
else:
|
||||
|
@ -680,7 +680,7 @@ class PDFDocument(object):
|
|||
continue
|
||||
else:
|
||||
raise PDFObjectNotFound(objid)
|
||||
logging.debug('register: objid=%r: %r' % (objid, obj))
|
||||
logging.debug('register: objid=%r: %r', objid, obj)
|
||||
if self.caching:
|
||||
self._cached_objs[objid] = (obj, genno)
|
||||
return obj
|
||||
|
@ -753,14 +753,14 @@ class PDFDocument(object):
|
|||
prev = None
|
||||
for line in parser.revreadlines():
|
||||
line = line.strip()
|
||||
logging.debug('find_xref: %r' % line)
|
||||
logging.debug('find_xref: %r', line)
|
||||
if line == b'startxref':
|
||||
break
|
||||
if line:
|
||||
prev = line
|
||||
else:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
logging.info('xref found: pos=%r' % prev)
|
||||
logging.info('xref found: pos=%r', prev)
|
||||
return long(prev) if six.PY2 else int(prev)
|
||||
|
||||
# read xref table
|
||||
|
@ -772,7 +772,7 @@ class PDFDocument(object):
|
|||
(pos, token) = parser.nexttoken()
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
logging.info('read_xref_from: start=%d, token=%r' % (start, token))
|
||||
logging.info('read_xref_from: start=%d, token=%r', start, token)
|
||||
if isinstance(token, int):
|
||||
# XRefStream: PDF-1.5
|
||||
parser.seek(pos)
|
||||
|
@ -786,7 +786,7 @@ class PDFDocument(object):
|
|||
xref.load(parser)
|
||||
xrefs.append(xref)
|
||||
trailer = xref.get_trailer()
|
||||
logging.info('trailer: %r' % trailer)
|
||||
logging.info('trailer: %r', trailer)
|
||||
if 'XRefStm' in trailer:
|
||||
pos = int_value(trailer['XRefStm'])
|
||||
self.read_xref_from(parser, pos, xrefs)
|
||||
|
|
|
@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont):
|
|||
raise PDFFontError('BaseFont is missing')
|
||||
self.basefont = 'unknown'
|
||||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"),
|
||||
self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1"))
|
||||
try:
|
||||
name = literal_name(spec['Encoding'])
|
||||
except KeyError:
|
||||
|
@ -728,7 +728,7 @@ class PDFCIDFont(PDFFont):
|
|||
# main
|
||||
def main(argv):
|
||||
for fname in argv[1:]:
|
||||
fp = file(fname, 'rb')
|
||||
fp = open(fname, 'rb')
|
||||
#font = TrueTypeFont(fname, fp)
|
||||
font = CFFFont(fname, fp)
|
||||
print (font)
|
||||
|
|
|
@ -166,7 +166,7 @@ class PDFResourceManager(object):
|
|||
if objid and objid in self._cached_fonts:
|
||||
font = self._cached_fonts[objid]
|
||||
else:
|
||||
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
|
||||
logging.info('get_font: create: objid=%r, spec=%r', objid, spec)
|
||||
if STRICT:
|
||||
if spec['Type'] is not LITERAL_FONT:
|
||||
raise PDFFontError('Type is not /Font')
|
||||
|
@ -340,7 +340,7 @@ class PDFPageInterpreter(object):
|
|||
else:
|
||||
return PREDEFINED_COLORSPACE.get(name)
|
||||
for (k, v) in six.iteritems(dict_value(resources)):
|
||||
logging.debug('Resource: %r: %r' % (k, v))
|
||||
logging.debug('Resource: %r: %r', k, v)
|
||||
if k == 'Font':
|
||||
for (fontid, spec) in six.iteritems(dict_value(v)):
|
||||
objid = None
|
||||
|
@ -796,7 +796,7 @@ class PDFPageInterpreter(object):
|
|||
if STRICT:
|
||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||
return
|
||||
logging.info('Processing xobj: %r' % xobj)
|
||||
logging.info('Processing xobj: %r', xobj)
|
||||
subtype = xobj.get('Subtype')
|
||||
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
||||
interpreter = self.dup()
|
||||
|
@ -819,7 +819,7 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def process_page(self, page):
|
||||
logging.info('Processing page: %r' % page)
|
||||
logging.info('Processing page: %r', page)
|
||||
(x0, y0, x1, y1) = page.mediabox
|
||||
if page.rotate == 90:
|
||||
ctm = (0, -1, 1, 0, -y0, x1)
|
||||
|
@ -838,8 +838,8 @@ class PDFPageInterpreter(object):
|
|||
# Render the content streams.
|
||||
# This method may be called recursively.
|
||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
|
||||
(resources, streams, ctm))
|
||||
logging.info('render_contents: resources=%r, streams=%r, ctm=%r',
|
||||
resources, streams, ctm)
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(streams))
|
||||
|
@ -864,11 +864,11 @@ class PDFPageInterpreter(object):
|
|||
nargs = six.get_function_code(func).co_argcount-1
|
||||
if nargs:
|
||||
args = self.pop(nargs)
|
||||
logging.debug('exec: %s %r' % (name, args))
|
||||
logging.debug('exec: %s %r', name, args)
|
||||
if len(args) == nargs:
|
||||
func(*args)
|
||||
else:
|
||||
logging.debug('exec: %s' % name)
|
||||
logging.debug('exec: %s', name)
|
||||
func()
|
||||
else:
|
||||
if STRICT:
|
||||
|
|
|
@ -87,12 +87,12 @@ class PDFPage(object):
|
|||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||
logging.info('Pages: Kids=%r', tree['Kids'])
|
||||
for c in list_value(tree['Kids']):
|
||||
for x in search(c, tree):
|
||||
yield x
|
||||
elif tree.get('Type') is LITERAL_PAGE:
|
||||
logging.info('Page: %r' % tree)
|
||||
logging.info('Page: %r', tree)
|
||||
yield (objid, tree)
|
||||
pages = False
|
||||
if 'Pages' in document.catalog:
|
||||
|
|
|
@ -120,7 +120,7 @@ class PDFParser(PSStackParser):
|
|||
data += line
|
||||
self.seek(pos+objlen)
|
||||
# XXX limit objlen not to exceed object boundary
|
||||
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % (pos, objlen, dic, data[:10]))
|
||||
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push((pos, obj))
|
||||
|
||||
|
|
|
@ -101,7 +101,7 @@ def resolve_all(x, default=None):
|
|||
def decipher_all(decipher, objid, genno, x):
|
||||
"""Recursively deciphers the given object.
|
||||
"""
|
||||
if isinstance(x, str):
|
||||
if isinstance(x, bytes):
|
||||
return decipher(objid, genno, x)
|
||||
if isinstance(x, list):
|
||||
x = [decipher_all(decipher, objid, genno, v) for v in x]
|
||||
|
@ -162,7 +162,7 @@ def dict_value(x):
|
|||
if not isinstance(x, dict):
|
||||
if STRICT:
|
||||
import logging
|
||||
logging.error('PDFTypeError : Dict required: %r' % x)
|
||||
logging.error('PDFTypeError : Dict required: %r', x)
|
||||
raise PDFTypeError('Dict required: %r' % x)
|
||||
return {}
|
||||
return x
|
||||
|
|
|
@ -6,19 +6,12 @@ import logging
|
|||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
def bytes(s,i,j=None):
|
||||
def bytesindex(s,i,j=None):
|
||||
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
|
||||
if six.PY2:
|
||||
if j is None:
|
||||
return s[i]
|
||||
if j<0:
|
||||
return s[i:]
|
||||
return s[i:j]
|
||||
else: # six.PY3
|
||||
if i<0 : i=len(s)+i
|
||||
if j is None: j=i+1
|
||||
if j<0 : j=len(s)
|
||||
return b''.join(six.int2byte(s[_]) for _ in range(i,j))
|
||||
if i<0 : i=len(s)+i
|
||||
if j is None: j=i+1
|
||||
if j<0 : j=len(s)
|
||||
return s[i:j]
|
||||
|
||||
from .utils import choplist
|
||||
|
||||
|
@ -214,14 +207,14 @@ class PSBaseParser(object):
|
|||
if not pos:
|
||||
pos = self.bufpos+self.charpos
|
||||
self.fp.seek(pos)
|
||||
logging.info('poll(%d): %r' % (pos, self.fp.read(n)))
|
||||
logging.info('poll(%d): %r', pos, self.fp.read(n))
|
||||
self.fp.seek(pos0)
|
||||
return
|
||||
|
||||
def seek(self, pos):
|
||||
"""Seeks the parser to the given position.
|
||||
"""
|
||||
logging.debug('seek: %r' % pos)
|
||||
logging.debug('seek: %r', pos)
|
||||
self.fp.seek(pos)
|
||||
# reset the status for nextline()
|
||||
self.bufpos = pos
|
||||
|
@ -254,7 +247,7 @@ class PSBaseParser(object):
|
|||
while 1:
|
||||
self.fillbuf()
|
||||
if eol:
|
||||
c = bytes(self.buf,self.charpos)
|
||||
c = bytesindex(self.buf,self.charpos)
|
||||
# handle b'\r\n'
|
||||
if c == b'\n':
|
||||
linebuf += c
|
||||
|
@ -262,16 +255,16 @@ class PSBaseParser(object):
|
|||
break
|
||||
m = EOL.search(self.buf, self.charpos)
|
||||
if m:
|
||||
linebuf += bytes(self.buf,self.charpos,m.end(0))
|
||||
linebuf += bytesindex(self.buf,self.charpos,m.end(0))
|
||||
self.charpos = m.end(0)
|
||||
if bytes(linebuf,-1) == b'\r':
|
||||
if bytesindex(linebuf,-1) == b'\r':
|
||||
eol = True
|
||||
else:
|
||||
break
|
||||
else:
|
||||
linebuf += bytes(self.buf,self.charpos,-1)
|
||||
linebuf += bytesindex(self.buf,self.charpos,-1)
|
||||
self.charpos = len(self.buf)
|
||||
logging.debug('nextline: %r, %r' % (linepos, linebuf))
|
||||
logging.debug('nextline: %r, %r', linepos, linebuf)
|
||||
|
||||
return (linepos, linebuf)
|
||||
|
||||
|
@ -295,8 +288,8 @@ class PSBaseParser(object):
|
|||
if n == -1:
|
||||
buf = s + buf
|
||||
break
|
||||
yield bytes(s,n,-1)+buf
|
||||
s = bytes(s,0,n)
|
||||
yield bytesindex(s,n,-1)+buf
|
||||
s = bytesindex(s,0,n)
|
||||
buf = b''
|
||||
return
|
||||
|
||||
|
@ -305,7 +298,7 @@ class PSBaseParser(object):
|
|||
if not m:
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
c = bytes(s,j)
|
||||
c = bytesindex(s,j)
|
||||
self._curtokenpos = self.bufpos+j
|
||||
if c == b'%':
|
||||
self._curtoken = b'%'
|
||||
|
@ -351,10 +344,10 @@ class PSBaseParser(object):
|
|||
def _parse_comment(self, s, i):
|
||||
m = EOL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return (self._parse_comment, len(s))
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
self._parse1 = self._parse_main
|
||||
# We ignore comments.
|
||||
#self._tokens.append(self._curtoken)
|
||||
|
@ -363,11 +356,11 @@ class PSBaseParser(object):
|
|||
def _parse_literal(self, s, i):
|
||||
m = END_LITERAL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
c = bytesindex(s,j)
|
||||
if c == b'#':
|
||||
self.hex = b''
|
||||
self._parse1 = self._parse_literal_hex
|
||||
|
@ -381,7 +374,7 @@ class PSBaseParser(object):
|
|||
return j
|
||||
|
||||
def _parse_literal_hex(self, s, i):
|
||||
c = bytes(s,i)
|
||||
c = bytesindex(s,i)
|
||||
if HEX.match(c) and len(self.hex) < 2:
|
||||
self.hex += c
|
||||
return i+1
|
||||
|
@ -393,11 +386,11 @@ class PSBaseParser(object):
|
|||
def _parse_number(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
c = bytesindex(s,j)
|
||||
if c == b'.':
|
||||
self._curtoken += c
|
||||
self._parse1 = self._parse_float
|
||||
|
@ -412,10 +405,10 @@ class PSBaseParser(object):
|
|||
def _parse_float(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
try:
|
||||
self._add_token(float(self._curtoken))
|
||||
except ValueError:
|
||||
|
@ -426,10 +419,10 @@ class PSBaseParser(object):
|
|||
def _parse_keyword(self, s, i):
|
||||
m = END_KEYWORD.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
if self._curtoken == b'true':
|
||||
token = True
|
||||
elif self._curtoken == b'false':
|
||||
|
@ -443,11 +436,11 @@ class PSBaseParser(object):
|
|||
def _parse_string(self, s, i):
|
||||
m = END_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
c = bytesindex(s,j)
|
||||
if c == b'\\':
|
||||
self.oct = b''
|
||||
self._parse1 = self._parse_string_1
|
||||
|
@ -466,7 +459,7 @@ class PSBaseParser(object):
|
|||
return j+1
|
||||
|
||||
def _parse_string_1(self, s, i):
|
||||
c = bytes(s,i)
|
||||
c = bytesindex(s,i)
|
||||
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||
self.oct += c
|
||||
return i+1
|
||||
|
@ -480,7 +473,7 @@ class PSBaseParser(object):
|
|||
return i+1
|
||||
|
||||
def _parse_wopen(self, s, i):
|
||||
c = bytes(s,i)
|
||||
c = bytesindex(s,i)
|
||||
if c == b'<':
|
||||
self._add_token(KEYWORD_DICT_BEGIN)
|
||||
self._parse1 = self._parse_main
|
||||
|
@ -490,7 +483,7 @@ class PSBaseParser(object):
|
|||
return i
|
||||
|
||||
def _parse_wclose(self, s, i):
|
||||
c = bytes(s,i)
|
||||
c = bytesindex(s,i)
|
||||
if c == b'>':
|
||||
self._add_token(KEYWORD_DICT_END)
|
||||
i += 1
|
||||
|
@ -500,10 +493,10 @@ class PSBaseParser(object):
|
|||
def _parse_hexstring(self, s, i):
|
||||
m = END_HEX_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
self._curtoken += bytesindex(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += bytes(s,i,j)
|
||||
self._curtoken += bytesindex(s,i,j)
|
||||
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
|
@ -514,7 +507,7 @@ class PSBaseParser(object):
|
|||
self.fillbuf()
|
||||
self.charpos = self._parse1(self.buf, self.charpos)
|
||||
token = self._tokens.pop(0)
|
||||
logging.debug('nexttoken: (%r:%r)' % token)
|
||||
logging.debug('nexttoken: %r', token)
|
||||
return token
|
||||
|
||||
|
||||
|
@ -555,7 +548,7 @@ class PSStackParser(PSBaseParser):
|
|||
|
||||
def add_results(self, *objs):
|
||||
try:
|
||||
logging.debug('add_results: %s' % repr(objs))
|
||||
logging.debug('add_results: %r', objs)
|
||||
except:
|
||||
logging.debug('add_results: (unprintable object)')
|
||||
self.results.extend(objs)
|
||||
|
@ -564,7 +557,7 @@ class PSStackParser(PSBaseParser):
|
|||
def start_type(self, pos, type):
|
||||
self.context.append((pos, self.curtype, self.curstack))
|
||||
(self.curtype, self.curstack) = (type, [])
|
||||
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
|
||||
logging.debug('start_type: pos=%r, type=%r', pos, type)
|
||||
return
|
||||
|
||||
def end_type(self, type):
|
||||
|
@ -572,7 +565,7 @@ class PSStackParser(PSBaseParser):
|
|||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||
objs = [obj for (_, obj) in self.curstack]
|
||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
|
||||
logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
|
||||
return (pos, objs)
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
|
@ -626,10 +619,10 @@ class PSStackParser(PSBaseParser):
|
|||
if STRICT:
|
||||
raise
|
||||
elif isinstance(token,PSKeyword):
|
||||
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||
logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
|
||||
self.do_keyword(pos, token)
|
||||
else:
|
||||
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||
logging.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
|
||||
self.do_keyword(pos, token)
|
||||
raise
|
||||
if self.context:
|
||||
|
@ -638,7 +631,7 @@ class PSStackParser(PSBaseParser):
|
|||
self.flush()
|
||||
obj = self.results.pop(0)
|
||||
try:
|
||||
logging.debug('nextobject: %s' % repr(obj))
|
||||
logging.debug('nextobject: %r', obj)
|
||||
except:
|
||||
logging.debug('nextobject: (unprintable object)')
|
||||
return obj
|
||||
|
|
|
@ -4,6 +4,8 @@ try:
|
|||
import cPickle as pickle
|
||||
except ImportError:
|
||||
import pickle as pickle
|
||||
import codecs
|
||||
import six
|
||||
|
||||
|
||||
## CMapConverter
|
||||
|
@ -56,14 +58,17 @@ class CMapConverter(object):
|
|||
|
||||
def put(dmap, code, cid, force=False):
|
||||
for b in code[:-1]:
|
||||
b = ord(b)
|
||||
if six.PY2:
|
||||
b = ord(b)
|
||||
if b in dmap:
|
||||
dmap = dmap[b]
|
||||
else:
|
||||
d = {}
|
||||
dmap[b] = d
|
||||
dmap = d
|
||||
b = ord(code[-1])
|
||||
b = code[-1]
|
||||
if six.PY2:
|
||||
b = ord(b)
|
||||
if force or ((b not in dmap) or dmap[b] == cid):
|
||||
dmap[b] = cid
|
||||
return
|
||||
|
@ -83,8 +88,8 @@ class CMapConverter(object):
|
|||
return
|
||||
|
||||
def pick(unimap):
|
||||
chars = unimap.items()
|
||||
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
|
||||
chars = list(unimap.items())
|
||||
chars.sort(key=(lambda x:(x[1],-ord(x[0]))), reverse=True)
|
||||
(c,_) = chars[0]
|
||||
return c
|
||||
|
||||
|
@ -103,7 +108,7 @@ class CMapConverter(object):
|
|||
if vertical:
|
||||
code = code[:-1]
|
||||
try:
|
||||
code = code.decode('hex')
|
||||
code = codecs.decode(code, 'hex_codec')
|
||||
except:
|
||||
code = chr(int(code, 16))
|
||||
if vertical:
|
||||
|
@ -138,7 +143,7 @@ class CMapConverter(object):
|
|||
IS_VERTICAL=self.is_vertical.get(enc, False),
|
||||
CODE2CID=self.code2cid.get(enc),
|
||||
)
|
||||
fp.write(pickle.dumps(data))
|
||||
fp.write(pickle.dumps(data, 2))
|
||||
return
|
||||
|
||||
def dump_unicodemap(self, fp):
|
||||
|
@ -146,7 +151,7 @@ class CMapConverter(object):
|
|||
CID2UNICHR_H=self.cid2unichr_h,
|
||||
CID2UNICHR_V=self.cid2unichr_v,
|
||||
)
|
||||
fp.write(pickle.dumps(data))
|
||||
fp.write(pickle.dumps(data, 2))
|
||||
return
|
||||
|
||||
# main
|
||||
|
@ -175,7 +180,7 @@ def main(argv):
|
|||
converter = CMapConverter(enc2codec)
|
||||
for path in args:
|
||||
print ('reading: %r...' % path)
|
||||
fp = file(path)
|
||||
fp = open(path)
|
||||
converter.load(fp)
|
||||
fp.close()
|
||||
|
||||
|
|
|
@ -247,7 +247,7 @@ def main(argv):
|
|||
outfp = sys.stdout
|
||||
extractdir = None
|
||||
for (k, v) in opts:
|
||||
if k == '-d': logging.getLogger().setlevel(logging.DEBUG)
|
||||
if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
|
||||
elif k == '-o': outfp = open(v, 'w')
|
||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
|
|
|
@ -9,6 +9,7 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
|||
from pdfminer.cmapdb import CMapDB
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.image import ImageWriter
|
||||
import logging
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
|
@ -25,8 +26,6 @@ def main(argv):
|
|||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
# debug option
|
||||
debug = 0
|
||||
# input option
|
||||
password = b''
|
||||
pagenos = set()
|
||||
|
@ -45,7 +44,7 @@ def main(argv):
|
|||
showpageno = True
|
||||
laparams = LAParams()
|
||||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
elif k == '-m': maxpages = int(v)
|
||||
elif k == '-P': password = v
|
||||
|
@ -66,11 +65,6 @@ def main(argv):
|
|||
elif k == '-c': codec = v
|
||||
elif k == '-s': scale = float(v)
|
||||
#
|
||||
PDFDocument.debug = debug
|
||||
PDFParser.debug = debug
|
||||
CMapDB.debug = debug
|
||||
PDFPageInterpreter.debug = debug
|
||||
#
|
||||
rsrcmgr = PDFResourceManager(caching=caching)
|
||||
if not outtype:
|
||||
outtype = 'text'
|
||||
|
@ -97,7 +91,7 @@ def main(argv):
|
|||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||
layoutmode=layoutmode, laparams=laparams,
|
||||
imagewriter=imagewriter, debug=debug)
|
||||
imagewriter=imagewriter)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue