From 9b0a3ee53edfe85c8e334f66b8dd6e173f497728 Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:30:02 +0200 Subject: [PATCH 1/8] decode cmap font name --- pdfminer/pdffont.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index a4bca61..b2a9df8 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont): raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) - self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), - self.cidsysteminfo.get('Ordering', 'unknown')) + self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"), + self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1")) try: name = literal_name(spec['Encoding']) except KeyError: @@ -728,7 +728,7 @@ class PDFCIDFont(PDFFont): # main def main(argv): for fname in argv[1:]: - fp = file(fname, 'rb') + fp = open(fname, 'rb') #font = TrueTypeFont(fname, fp) font = CFFFont(fname, fp) print (font) From 6357e2da80da14f76e6fcfaadc4c4d03e27e1e8a Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:32:43 +0200 Subject: [PATCH 2/8] code2cid uses int, not byte --- pdfminer/cmapdb.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index e964b8f..fdcc5de 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -94,9 +94,8 @@ class CMap(CMapBase): logging.debug('decode: %r, %r' % (self, code)) d = self.code2cid for i in six.iterbytes(code): - c = six.int2byte(i) - if c in d: - d = d[c] + if i in d: + d = d[i] if isinstance(d, int): yield d d = self.code2cid From cba5a42ba87c5af0021694f03a6f921904433e9e Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:33:14 +0200 Subject: [PATCH 3/8] decipher_all bytes --- pdfminer/pdftypes.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 18236a2..229c2be 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -101,7 +101,7 @@ def resolve_all(x, default=None): def decipher_all(decipher, objid, genno, x): """Recursively deciphers the given object. """ - if isinstance(x, str): + if isinstance(x, bytes): return decipher(objid, genno, x) if isinstance(x, list): x = [decipher_all(decipher, objid, genno, v) for v in x] From ed13f7c47d045ab6553cbc8cf793c69da8a591e6 Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:34:09 +0200 Subject: [PATCH 4/8] conv_cmap py3 compat --- Makefile | 2 +- tools/conv_cmap.py | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index d671b9a..2b6a4eb 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ PACKAGE=pdfminer -PYTHON=python2 +PYTHON=python GIT=git RM=rm -f CP=cp -f diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index 88cab57..7e45a89 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -4,6 +4,8 @@ try: import cPickle as pickle except ImportError: import pickle as pickle +import codecs +import six ## CMapConverter @@ -56,14 +58,17 @@ class CMapConverter(object): def put(dmap, code, cid, force=False): for b in code[:-1]: - b = ord(b) + if six.PY2: + b = ord(b) if b in dmap: dmap = dmap[b] else: d = {} dmap[b] = d dmap = d - b = ord(code[-1]) + b = code[-1] + if six.PY2: + b = ord(b) if force or ((b not in dmap) or dmap[b] == cid): dmap[b] = cid return @@ -83,8 +88,8 @@ class CMapConverter(object): return def pick(unimap): - chars = unimap.items() - chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True) + chars = list(unimap.items()) + chars.sort(key=(lambda x:(x[1],-ord(x[0]))), reverse=True) (c,_) = chars[0] return c @@ -103,7 +108,7 @@ class CMapConverter(object): if vertical: code = code[:-1] try: - code = code.decode('hex') + code = codecs.decode(code, 'hex_codec') except: code = chr(int(code, 16)) 
if vertical: @@ -138,7 +143,7 @@ class CMapConverter(object): IS_VERTICAL=self.is_vertical.get(enc, False), CODE2CID=self.code2cid.get(enc), ) - fp.write(pickle.dumps(data)) + fp.write(pickle.dumps(data, 2)) return def dump_unicodemap(self, fp): @@ -146,7 +151,7 @@ class CMapConverter(object): CID2UNICHR_H=self.cid2unichr_h, CID2UNICHR_V=self.cid2unichr_v, ) - fp.write(pickle.dumps(data)) + fp.write(pickle.dumps(data, 2)) return # main @@ -175,7 +180,7 @@ def main(argv): converter = CMapConverter(enc2codec) for path in args: print ('reading: %r...' % path) - fp = file(path) + fp = open(path) converter.load(fp) fp.close() From 31e6afc7cf55214bc135c1e8c85614d061044ae3 Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:35:26 +0200 Subject: [PATCH 5/8] faster and simpler bytes implementation --- pdfminer/psparser.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 880d4aa..98935c9 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -8,17 +8,10 @@ import six # Python 2+3 compatibility def bytes(s,i,j=None): """implements s[i], s[i:], s[i:j] for Python2 and Python3""" - if six.PY2: - if j is None: - return s[i] - if j<0: - return s[i:] - return s[i:j] - else: # six.PY3 - if i<0 : i=len(s)+i - if j is None: j=i+1 - if j<0 : j=len(s) - return b''.join(six.int2byte(s[_]) for _ in range(i,j)) + if i<0 : i=len(s)+i + if j is None: j=i+1 + if j<0 : j=len(s) + return s[i:j] from .utils import choplist From 01821c7d1e962e7ad5b17998b3d0b2572e4d71dd Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:38:05 +0200 Subject: [PATCH 6/8] rename bytes to avoid built-in collision --- pdfminer/psparser.py | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 98935c9..e9093fe 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -6,7 +6,7 @@ import 
logging import six # Python 2+3 compatibility -def bytes(s,i,j=None): +def bytesindex(s,i,j=None): """implements s[i], s[i:], s[i:j] for Python2 and Python3""" if i<0 : i=len(s)+i if j is None: j=i+1 @@ -247,7 +247,7 @@ class PSBaseParser(object): while 1: self.fillbuf() if eol: - c = bytes(self.buf,self.charpos) + c = bytesindex(self.buf,self.charpos) # handle b'\r\n' if c == b'\n': linebuf += c @@ -255,14 +255,14 @@ class PSBaseParser(object): break m = EOL.search(self.buf, self.charpos) if m: - linebuf += bytes(self.buf,self.charpos,m.end(0)) + linebuf += bytesindex(self.buf,self.charpos,m.end(0)) self.charpos = m.end(0) - if bytes(linebuf,-1) == b'\r': + if bytesindex(linebuf,-1) == b'\r': eol = True else: break else: - linebuf += bytes(self.buf,self.charpos,-1) + linebuf += bytesindex(self.buf,self.charpos,-1) self.charpos = len(self.buf) logging.debug('nextline: %r, %r' % (linepos, linebuf)) @@ -288,8 +288,8 @@ class PSBaseParser(object): if n == -1: buf = s + buf break - yield bytes(s,n,-1)+buf - s = bytes(s,0,n) + yield bytesindex(s,n,-1)+buf + s = bytesindex(s,0,n) buf = b'' return @@ -298,7 +298,7 @@ class PSBaseParser(object): if not m: return len(s) j = m.start(0) - c = bytes(s,j) + c = bytesindex(s,j) self._curtokenpos = self.bufpos+j if c == b'%': self._curtoken = b'%' @@ -344,10 +344,10 @@ class PSBaseParser(object): def _parse_comment(self, s, i): m = EOL.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return (self._parse_comment, len(s)) j = m.start(0) - self._curtoken += bytes(s,i,j) + self._curtoken += bytesindex(s,i,j) self._parse1 = self._parse_main # We ignore comments. 
#self._tokens.append(self._curtoken) @@ -356,11 +356,11 @@ class PSBaseParser(object): def _parse_literal(self, s, i): m = END_LITERAL.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += bytes(s,i,j) - c = bytes(s,j) + self._curtoken += bytesindex(s,i,j) + c = bytesindex(s,j) if c == b'#': self.hex = b'' self._parse1 = self._parse_literal_hex @@ -374,7 +374,7 @@ class PSBaseParser(object): return j def _parse_literal_hex(self, s, i): - c = bytes(s,i) + c = bytesindex(s,i) if HEX.match(c) and len(self.hex) < 2: self.hex += c return i+1 @@ -386,11 +386,11 @@ class PSBaseParser(object): def _parse_number(self, s, i): m = END_NUMBER.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += bytes(s,i,j) - c = bytes(s,j) + self._curtoken += bytesindex(s,i,j) + c = bytesindex(s,j) if c == b'.': self._curtoken += c self._parse1 = self._parse_float @@ -405,10 +405,10 @@ class PSBaseParser(object): def _parse_float(self, s, i): m = END_NUMBER.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += bytes(s,i,j) + self._curtoken += bytesindex(s,i,j) try: self._add_token(float(self._curtoken)) except ValueError: @@ -419,10 +419,10 @@ class PSBaseParser(object): def _parse_keyword(self, s, i): m = END_KEYWORD.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += bytes(s,i,j) + self._curtoken += bytesindex(s,i,j) if self._curtoken == b'true': token = True elif self._curtoken == b'false': @@ -436,11 +436,11 @@ class PSBaseParser(object): def _parse_string(self, s, i): m = END_STRING.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += 
bytes(s,i,j) - c = bytes(s,j) + self._curtoken += bytesindex(s,i,j) + c = bytesindex(s,j) if c == b'\\': self.oct = b'' self._parse1 = self._parse_string_1 @@ -459,7 +459,7 @@ class PSBaseParser(object): return j+1 def _parse_string_1(self, s, i): - c = bytes(s,i) + c = bytesindex(s,i) if OCT_STRING.match(c) and len(self.oct) < 3: self.oct += c return i+1 @@ -473,7 +473,7 @@ class PSBaseParser(object): return i+1 def _parse_wopen(self, s, i): - c = bytes(s,i) + c = bytesindex(s,i) if c == b'<': self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main @@ -483,7 +483,7 @@ class PSBaseParser(object): return i def _parse_wclose(self, s, i): - c = bytes(s,i) + c = bytesindex(s,i) if c == b'>': self._add_token(KEYWORD_DICT_END) i += 1 @@ -493,10 +493,10 @@ class PSBaseParser(object): def _parse_hexstring(self, s, i): m = END_HEX_STRING.search(s, i) if not m: - self._curtoken += bytes(s,i,-1) + self._curtoken += bytesindex(s,i,-1) return len(s) j = m.start(0) - self._curtoken += bytes(s,i,j) + self._curtoken += bytesindex(s,i,j) token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken)) self._add_token(token) self._parse1 = self._parse_main From 39942b6642fd3399fcd7ab71029afced9b0c8434 Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:40:18 +0200 Subject: [PATCH 7/8] avoid string formatting when not logging --- pdfminer/cmapdb.py | 6 +++--- pdfminer/converter.py | 2 +- pdfminer/pdfdocument.py | 24 ++++++++++++------------ pdfminer/pdfinterp.py | 16 ++++++++-------- pdfminer/pdfpage.py | 4 ++-- pdfminer/pdfparser.py | 2 +- pdfminer/pdftypes.py | 2 +- pdfminer/psparser.py | 20 ++++++++++---------- 8 files changed, 38 insertions(+), 38 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index fdcc5de..a1cc53c 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -91,7 +91,7 @@ class CMap(CMapBase): return def decode(self, code): - logging.debug('decode: %r, %r' % (self, code)) + 
logging.debug('decode: %r, %r', self, code) d = self.code2cid for i in six.iterbytes(code): if i in d: @@ -141,7 +141,7 @@ class UnicodeMap(CMapBase): return '' % self.attrs.get('CMapName') def get_unichr(self, cid): - logging.debug('get_unichr: %r, %r' % (self, cid)) + logging.debug('get_unichr: %r, %r', self, cid) return self.cid2unichr[cid] def dump(self, out=sys.stdout): @@ -228,7 +228,7 @@ class CMapDB(object): @classmethod def _load_data(klass, name): filename = '%s.pickle.gz' % name - logging.info('loading: %r' % name) + logging.info('loading: %r', name) cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), os.path.join(os.path.dirname(__file__), 'cmap'),) for directory in cmap_paths: diff --git a/pdfminer/converter.py b/pdfminer/converter.py index ccf1d2a..e93b055 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -117,7 +117,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): return item.adv def handle_undefined_char(self, font, cid): - logging.info('undefined: %r, %r' % (font, cid)) + logging.info('undefined: %r, %r', font, cid) return '(cid:%d)' % cid def receive_layout(self, ltpage): diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index a49ae0b..a63213a 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -127,7 +127,7 @@ class PDFXRef(PDFBaseXRef): if use != b'n': continue self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno)) - logging.info('xref objects: %r' % self.offsets) + logging.info('xref objects: %r', self.offsets) self.load_trailer(parser) return @@ -142,7 +142,7 @@ class PDFXRef(PDFBaseXRef): raise PDFNoValidXRef('Unexpected EOF - file corrupted') (_, dic) = x[0] self.trailer.update(dict_value(dic)) - logging.debug('trailer=%r'%self.trailer) + logging.debug('trailer=%r', self.trailer) return def get_trailer(self): @@ -177,7 +177,7 @@ class PDFXRefFallback(PDFXRef): if line.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) - logging.info('trailer: %r' 
% self.get_trailer()) + logging.info('trailer: %r', self.trailer) break if six.PY3: line=line.decode('latin-1') #default pdf encoding @@ -244,9 +244,9 @@ class PDFXRefStream(PDFBaseXRef): self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs - logging.info('xref stream: objid=%s, fields=%d,%d,%d' % - (', '.join(map(repr, self.ranges)), - self.fl1, self.fl2, self.fl3)) + logging.info('xref stream: objid=%s, fields=%d,%d,%d', + ', '.join(map(repr, self.ranges)), + self.fl1, self.fl2, self.fl3) return def get_trailer(self): @@ -655,7 +655,7 @@ class PDFDocument(object): assert objid != 0 if not self.xrefs: raise PDFException('PDFDocument is not initialized') - logging.debug('getobj: objid=%r' % objid) + logging.debug('getobj: objid=%r', objid) if objid in self._cached_objs: (obj, genno) = self._cached_objs[objid] else: @@ -680,7 +680,7 @@ class PDFDocument(object): continue else: raise PDFObjectNotFound(objid) - logging.debug('register: objid=%r: %r' % (objid, obj)) + logging.debug('register: objid=%r: %r', objid, obj) if self.caching: self._cached_objs[objid] = (obj, genno) return obj @@ -753,14 +753,14 @@ class PDFDocument(object): prev = None for line in parser.revreadlines(): line = line.strip() - logging.debug('find_xref: %r' % line) + logging.debug('find_xref: %r', line) if line == b'startxref': break if line: prev = line else: raise PDFNoValidXRef('Unexpected EOF') - logging.info('xref found: pos=%r' % prev) + logging.info('xref found: pos=%r', prev) return long(prev) if six.PY2 else int(prev) # read xref table @@ -772,7 +772,7 @@ class PDFDocument(object): (pos, token) = parser.nexttoken() except PSEOF: raise PDFNoValidXRef('Unexpected EOF') - logging.info('read_xref_from: start=%d, token=%r' % (start, token)) + logging.info('read_xref_from: start=%d, token=%r', start, token) if isinstance(token, int): # XRefStream: PDF-1.5 parser.seek(pos) @@ -786,7 +786,7 @@ class PDFDocument(object): xref.load(parser) 
xrefs.append(xref) trailer = xref.get_trailer() - logging.info('trailer: %r' % trailer) + logging.info('trailer: %r', trailer) if 'XRefStm' in trailer: pos = int_value(trailer['XRefStm']) self.read_xref_from(parser, pos, xrefs) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 5d73ca4..779c864 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -166,7 +166,7 @@ class PDFResourceManager(object): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: - logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) + logging.info('get_font: create: objid=%r, spec=%r', objid, spec) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') @@ -340,7 +340,7 @@ class PDFPageInterpreter(object): else: return PREDEFINED_COLORSPACE.get(name) for (k, v) in six.iteritems(dict_value(resources)): - logging.debug('Resource: %r: %r' % (k, v)) + logging.debug('Resource: %r: %r', k, v) if k == 'Font': for (fontid, spec) in six.iteritems(dict_value(v)): objid = None @@ -796,7 +796,7 @@ class PDFPageInterpreter(object): if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - logging.info('Processing xobj: %r' % xobj) + logging.info('Processing xobj: %r', xobj) subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() @@ -819,7 +819,7 @@ class PDFPageInterpreter(object): return def process_page(self, page): - logging.info('Processing page: %r' % page) + logging.info('Processing page: %r', page) (x0, y0, x1, y1) = page.mediabox if page.rotate == 90: ctm = (0, -1, 1, 0, -y0, x1) @@ -838,8 +838,8 @@ class PDFPageInterpreter(object): # Render the content streams. # This method may be called recursively. 
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): - logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % - (resources, streams, ctm)) + logging.info('render_contents: resources=%r, streams=%r, ctm=%r', + resources, streams, ctm) self.init_resources(resources) self.init_state(ctm) self.execute(list_value(streams)) @@ -864,11 +864,11 @@ class PDFPageInterpreter(object): nargs = six.get_function_code(func).co_argcount-1 if nargs: args = self.pop(nargs) - logging.debug('exec: %s %r' % (name, args)) + logging.debug('exec: %s %r', name, args) if len(args) == nargs: func(*args) else: - logging.debug('exec: %s' % name) + logging.debug('exec: %s', name) func() else: if STRICT: diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 5755229..25a351c 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -87,12 +87,12 @@ class PDFPage(object): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: - logging.info('Pages: Kids=%r' % tree['Kids']) + logging.info('Pages: Kids=%r', tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: - logging.info('Page: %r' % tree) + logging.info('Page: %r', tree) yield (objid, tree) pages = False if 'Pages' in document.catalog: diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 6daf1eb..7407ade 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -120,7 +120,7 @@ class PDFParser(PSStackParser): data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary - logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' 
% (pos, objlen, dic, data[:10])) + logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) self.push((pos, obj)) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 229c2be..64d84bb 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -162,7 +162,7 @@ def dict_value(x): if not isinstance(x, dict): if STRICT: import logging - logging.error('PDFTypeError : Dict required: %r' % x) + logging.error('PDFTypeError : Dict required: %r', x) raise PDFTypeError('Dict required: %r' % x) return {} return x diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index e9093fe..dbc7861 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -207,14 +207,14 @@ class PSBaseParser(object): if not pos: pos = self.bufpos+self.charpos self.fp.seek(pos) - logging.info('poll(%d): %r' % (pos, self.fp.read(n))) + logging.info('poll(%d): %r', pos, self.fp.read(n)) self.fp.seek(pos0) return def seek(self, pos): """Seeks the parser to the given position. 
""" - logging.debug('seek: %r' % pos) + logging.debug('seek: %r', pos) self.fp.seek(pos) # reset the status for nextline() self.bufpos = pos @@ -264,7 +264,7 @@ class PSBaseParser(object): else: linebuf += bytesindex(self.buf,self.charpos,-1) self.charpos = len(self.buf) - logging.debug('nextline: %r, %r' % (linepos, linebuf)) + logging.debug('nextline: %r, %r', linepos, linebuf) return (linepos, linebuf) @@ -507,7 +507,7 @@ class PSBaseParser(object): self.fillbuf() self.charpos = self._parse1(self.buf, self.charpos) token = self._tokens.pop(0) - logging.debug('nexttoken: (%r:%r)' % token) + logging.debug('nexttoken: %r', token) return token @@ -548,7 +548,7 @@ class PSStackParser(PSBaseParser): def add_results(self, *objs): try: - logging.debug('add_results: %s' % repr(objs)) + logging.debug('add_results: %r', objs) except: logging.debug('add_results: (unprintable object)') self.results.extend(objs) @@ -557,7 +557,7 @@ class PSStackParser(PSBaseParser): def start_type(self, pos, type): self.context.append((pos, self.curtype, self.curstack)) (self.curtype, self.curstack) = (type, []) - logging.debug('start_type: pos=%r, type=%r' % (pos, type)) + logging.debug('start_type: pos=%r, type=%r', pos, type) return def end_type(self, type): @@ -565,7 +565,7 @@ class PSStackParser(PSBaseParser): raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) objs = [obj for (_, obj) in self.curstack] (pos, self.curtype, self.curstack) = self.context.pop() - logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)) + logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs) return (pos, objs) def do_keyword(self, pos, token): @@ -619,10 +619,10 @@ class PSStackParser(PSBaseParser): if STRICT: raise elif isinstance(token,PSKeyword): - logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack)) + logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack) self.do_keyword(pos, token) else: - 
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack)) + logging.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack) self.do_keyword(pos, token) raise if self.context: @@ -631,7 +631,7 @@ class PSStackParser(PSBaseParser): self.flush() obj = self.results.pop(0) try: - logging.debug('nextobject: %s' % repr(obj)) + logging.debug('nextobject: %r', obj) except: logging.debug('nextobject: (unprintable object)') return obj From 714423883c85bd4a3055697bf81c9feb02d975b5 Mon Sep 17 00:00:00 2001 From: cybjit Date: Thu, 11 Sep 2014 23:41:01 +0200 Subject: [PATCH 8/8] setup logging for pdf2txt and fix dumppdf --- tools/dumppdf.py | 2 +- tools/pdf2txt.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 7a46e68..860da56 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -247,7 +247,7 @@ def main(argv): outfp = sys.stdout extractdir = None for (k, v) in opts: - if k == '-d': logging.getLogger().setlevel(logging.DEBUG) + if k == '-d': logging.getLogger().setLevel(logging.DEBUG) elif k == '-o': outfp = open(v, 'w') elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 2cf1572..40711d4 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -9,6 +9,7 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter from pdfminer.cmapdb import CMapDB from pdfminer.layout import LAParams from pdfminer.image import ImageWriter +import logging # main def main(argv): @@ -25,8 +26,6 @@ def main(argv): except getopt.GetoptError: return usage() if not args: return usage() - # debug option - debug = 0 # input option password = b'' pagenos = set() @@ -45,7 +44,7 @@ def main(argv): showpageno = True laparams = LAParams() for (k, v) in opts: - if k == '-d': debug += 1 + if k == '-d': 
logging.getLogger().setLevel(logging.DEBUG) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-m': maxpages = int(v) elif k == '-P': password = v @@ -66,11 +65,6 @@ def main(argv): elif k == '-c': codec = v elif k == '-s': scale = float(v) # - PDFDocument.debug = debug - PDFParser.debug = debug - CMapDB.debug = debug - PDFPageInterpreter.debug = debug - # rsrcmgr = PDFResourceManager(caching=caching) if not outtype: outtype = 'text' @@ -97,7 +91,7 @@ def main(argv): elif outtype == 'html': device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, - imagewriter=imagewriter, debug=debug) + imagewriter=imagewriter) elif outtype == 'tag': device = TagExtractor(rsrcmgr, outfp, codec=codec) else: