From ecc4d056754a528706f7bf43f8754c448ea2dd4b Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:34:33 -0500 Subject: [PATCH 1/3] Fix a unicode conversion bug. See https://github.com/euske/pdfminer/issues/75 --- pdfminer/psparser.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index c1ebe93..be715af 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -343,7 +343,15 @@ class PSBaseParser(object): self.hex = b'' self._parse1 = self._parse_literal_hex return j+1 - self._add_token(LIT(unicode(self._curtoken))) + + try: + # Try to interpret the token as a utf-8 string + utoken = self._curtoken.decode('utf-8') + except UnicodeDecodeError: + # We failed, there is possibly a corrupt PDF here. + if STRICT: raise + utoken = "" + self._add_token(LIT(utoken)) self._parse1 = self._parse_main return j From 1067cb9f9f433d6843ed8acffa08394bdbba3ad2 Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:36:26 -0500 Subject: [PATCH 2/3] Use a .gitignore file. --- .gitignore | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed8045c --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Intermediate documents +*.xps + +# Password and Key Files +*.pem +*.p12 + +# Compiled source # +################### +*.pyc +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Mercurial Files? +*.i +*.d +*.mo +*.hg/ + +# Python data files # +*.shelf +*.shelve + +# Don't track these files, they are output from scripts + +# Subversion files +*.svn-base +all-wcprops +entries + +# Logs and databases # +###################### +bulkloader-log-* +*.log +*.sql +*.sql3 +*.sql3-journal +*.sqlite + +# OS generated files # +###################### +.DS_Store? +ehthumbs.db +Icon? 
+Thumbs.db + +# tmp files # +############# +~$*.doc +~WRL*.tmp \ No newline at end of file From 36977fbe0802b994069c800f9c55d253137c0aef Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:36:58 -0500 Subject: [PATCH 3/3] Add debug flags for much of the debug output. --- pdfminer/pdfdocument.py | 20 ++++++++++++++------ pdfminer/pdfinterp.py | 12 ++++++++---- pdfminer/pdfpage.py | 6 ++++-- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 2c3c274..66b575a 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -65,6 +65,8 @@ LITERAL_CATALOG = LIT('Catalog') ## class PDFBaseXRef(object): + debug = False + def get_trailer(self): raise NotImplementedError @@ -122,7 +124,7 @@ class PDFXRef(PDFBaseXRef): if use != b'n': continue self.offsets[objid] = (None, long(pos), int(genno)) - logging.info('xref objects: %r' % self.offsets) + if self.debug: logging.info('xref objects: %r' % self.offsets) self.load_trailer(parser) return @@ -173,7 +175,7 @@ class PDFXRefFallback(PDFXRef): if line.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) - logging.info('trailer: %r' % self.get_trailer()) + if self.debug: logging.info('trailer: %r' % self.get_trailer()) break m = self.PDFOBJ_CUE.match(line) if not m: @@ -212,6 +214,8 @@ class PDFXRefFallback(PDFXRef): ## class PDFXRefStream(PDFBaseXRef): + debug = False + def __init__(self): self.data = None self.entlen = None @@ -238,7 +242,8 @@ class PDFXRefStream(PDFBaseXRef): self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs - logging.info('xref stream: objid=%s, fields=%d,%d,%d' % + if self.debug: + logging.info('xref stream: objid=%s, fields=%d,%d,%d' % (', '.join(map(repr, self.ranges)), self.fl1, self.fl2, self.fl3)) return @@ -761,7 +766,8 @@ class PDFDocument(object): prev = line else: raise PDFNoValidXRef('Unexpected EOF') - logging.info('xref found: pos=%r' % prev) + 
if self.debug: + logging.info('xref found: pos=%r' % prev) return long(prev) # read xref table @@ -773,7 +779,8 @@ class PDFDocument(object): (pos, token) = parser.nexttoken() except PSEOF: raise PDFNoValidXRef('Unexpected EOF') - logging.info('read_xref_from: start=%d, token=%r' % (start, token)) + if self.debug: + logging.info('read_xref_from: start=%d, token=%r' % (start, token)) if isinstance(token, int): # XRefStream: PDF-1.5 parser.seek(pos) @@ -787,7 +794,8 @@ class PDFDocument(object): xref.load(parser) xrefs.append(xref) trailer = xref.get_trailer() - logging.info('trailer: %r' % trailer) + if self.debug: + logging.info('trailer: %r' % trailer) if 'XRefStm' in trailer: pos = int_value(trailer['XRefStm']) self.read_xref_from(parser, pos, xrefs) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 3b368e0..3f3f393 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -139,6 +139,8 @@ class PDFResourceManager(object): allocated multiple times. """ + debug = False + def __init__(self, caching=True): self.caching = caching self._cached_fonts = {} @@ -167,7 +169,8 @@ class PDFResourceManager(object): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: - logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) + if self.debug: + logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') @@ -799,7 +802,7 @@ class PDFPageInterpreter(object): if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - logging.info('Processing xobj: %r' % xobj) + if self.debug: logging.info('Processing xobj: %r' % xobj) subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() @@ -822,7 +825,7 @@ class PDFPageInterpreter(object): return def process_page(self, page): - logging.info('Processing page: %r' % page) + if self.debug: logging.info('Processing page: 
%r' % page) (x0, y0, x1, y1) = page.mediabox if page.rotate == 90: ctm = (0, -1, 1, 0, -y0, x1) @@ -841,7 +844,8 @@ class PDFPageInterpreter(object): # Render the content streams. # This method may be called recursively. def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): - logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % + if self.debug: + logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % (resources, streams, ctm)) self.init_resources(resources) self.init_state(ctm) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index fcdf17b..a48767c 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -39,6 +39,8 @@ class PDFPage(object): beads: a chain that represents natural reading order. """ + debug = False + def __init__(self, doc, pageid, attrs): """Initialize a page object. @@ -86,12 +88,12 @@ class PDFPage(object): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: - logging.info('Pages: Kids=%r' % tree['Kids']) + if klass.debug: logging.info('Pages: Kids=%r' % tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: - logging.info('Page: %r' % tree) + if klass.debug: logging.info('Page: %r' % tree) yield (objid, tree) pages = False if 'Pages' in document.catalog: