From 87143cb36fb4234c521e80713aa0d71a9bac7a09 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Wed, 9 Oct 2013 22:08:16 +0900 Subject: [PATCH] Fallback when /Pages does not exist. --- pdfminer/pdfinterp.py | 9 +++++---- pdfminer/pdfparser.py | 27 ++++++++++++++------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 3b0c789..f74e501 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -18,7 +18,7 @@ from pdffont import PDFFontError from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font from pdffont import PDFCIDFont from pdfparser import PDFDocument, PDFParser -from pdfparser import PDFPasswordIncorrect +from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound from pdfcolor import PDFColorSpace from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB @@ -335,7 +335,10 @@ class PDFPageInterpreter(object): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid - spec = dict_value(spec) + try: + spec = dict_value(spec) + except PDFObjectNotFound: + spec = {} self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid,spec) in dict_value(v).iteritems(): @@ -629,10 +632,8 @@ class PDFPageInterpreter(object): try: self.textstate.font = self.fontmap[literal_name(fontid)] except KeyError: - raise if STRICT: raise PDFInterpreterError('Undefined Font id: %r' % fontid) - return self.textstate.fontsize = fontsize return # setrendering diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index a4ab8be..3cddf83 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -505,12 +505,6 @@ class PDFDocument(object): obj = decipher_all(self.decipher, objid, genno, obj) return obj - def get_objects(self): - for xref in self.xrefs: - for objid in xref.get_objids(): - yield self.getobj(objid) - return - INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) def get_pages(self): if not self.xrefs: @@ -535,14 +529,21 @@ class PDFDocument(object): if 1 <= self.debug: print >>sys.stderr, 'Page: %r' % tree yield (objid, tree) - if 'Pages' in self.catalog: - for (pageid,tree) in search(self.catalog['Pages'], self.catalog): - yield PDFPage(self, pageid, tree) - else: + try: + if 'Pages' in self.catalog: + for (objid,tree) in search(self.catalog['Pages'], self.catalog): + yield PDFPage(self, objid, tree) + return + except PDFObjectNotFound: # fallback when /Pages is missing. - for obj in self.get_objects(): - if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGES: - yield PDFPage(self, pageid, obj) + for xref in self.xrefs: + for objid in xref.get_objids(): + try: + obj = self.getobj(objid) + if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE: + yield PDFPage(self, objid, obj) + except PDFObjectNotFound: + pass return def get_outlines(self):