diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index f74e501..849f825 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -18,7 +18,7 @@ from pdffont import PDFFontError from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font from pdffont import PDFCIDFont from pdfparser import PDFDocument, PDFParser -from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound +from pdfparser import PDFPasswordIncorrect from pdfcolor import PDFColorSpace from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB @@ -335,10 +335,7 @@ class PDFPageInterpreter(object): objid = None if isinstance(spec, PDFObjRef): objid = spec.objid - try: - spec = dict_value(spec) - except PDFObjectNotFound: - spec = {} + spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid,spec) in dict_value(v).iteritems(): @@ -634,6 +631,7 @@ class PDFPageInterpreter(object): except KeyError: if STRICT: raise PDFInterpreterError('Undefined Font id: %r' % fontid) + self.textstate.font = self.rsrcmgr.get_font(None, {}) self.textstate.fontsize = fontsize return # setrendering diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 3cddf83..df5890f 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF from psparser import literal_name from psparser import LIT, KWD, STRICT from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError -from pdftypes import PDFStream, PDFObjRef +from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef from pdftypes import resolve1, decipher_all from pdftypes import int_value, float_value, num_value from pdftypes import str_value, list_value, dict_value, stream_value @@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass class PDFNoOutlines(PDFException): pass class PDFDestinationNotFound(PDFException): pass class PDFEncryptionError(PDFException): pass 
-class PDFObjectNotFound(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass # some predefined literals and keywords. @@ -330,23 +329,14 @@ class PDFDocument(object): # If there's an encryption info, remember it. if 'Encrypt' in trailer: #assert not self.encryption - try: - self.encryption = (list_value(trailer['ID']), - dict_value(trailer['Encrypt'])) - except PDFObjectNotFound, e: - pass + self.encryption = (list_value(trailer['ID']), + dict_value(trailer['Encrypt'])) if 'Info' in trailer: - try: - self.info.append(dict_value(trailer['Info'])) - except PDFObjectNotFound, e: - pass + self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: # Every PDF file must have exactly one /Root dictionary. - try: - self.catalog = dict_value(trailer['Root']) - break - except PDFObjectNotFound, e: - pass + self.catalog = dict_value(trailer['Root']) + break else: raise PDFSyntaxError('No /Root object! - Is this really a PDF?') if self.catalog.get('Type') is not LITERAL_CATALOG: @@ -529,12 +519,12 @@ class PDFDocument(object): if 1 <= self.debug: print >>sys.stderr, 'Page: %r' % tree yield (objid, tree) - try: - if 'Pages' in self.catalog: - for (objid,tree) in search(self.catalog['Pages'], self.catalog): - yield PDFPage(self, objid, tree) - return - except PDFObjectNotFound: + pages = False + if 'Pages' in self.catalog: + for (objid,tree) in search(self.catalog['Pages'], self.catalog): + yield PDFPage(self, objid, tree) + pages = True + if not pages: # fallback when /Pages is missing. 
for xref in self.xrefs: for objid in xref.get_objids(): diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 60717a0..0a163e2 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -28,7 +28,8 @@ class PDFObject(PSObject): pass class PDFException(PSException): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass -class PDFNotImplementedError(PSException): pass +class PDFObjectNotFound(PDFException): pass +class PDFNotImplementedError(PDFException): pass ## PDFObjRef @@ -47,34 +48,37 @@ class PDFObjRef(PDFObject): def __repr__(self): return '<PDFObjRef:%d>' % (self.objid) - def resolve(self): - return self.doc.getobj(self.objid) + def resolve(self, default=None): + try: + return self.doc.getobj(self.objid) + except PDFObjectNotFound: + return default # resolve -def resolve1(x): +def resolve1(x, default=None): """Resolves an object. If this is an array or dictionary, it may still contains some indirect objects inside. """ while isinstance(x, PDFObjRef): - x = x.resolve() + x = x.resolve(default=default) return x -def resolve_all(x): +def resolve_all(x, default=None): """Recursively resolves the given object and all the internals. Make sure there is no indirect reference within the nested object. This procedure might be slow. 
""" while isinstance(x, PDFObjRef): - x = x.resolve() + x = x.resolve(default=default) if isinstance(x, list): - x = [ resolve_all(v) for v in x ] + x = [ resolve_all(v, default=default) for v in x ] elif isinstance(x, dict): for (k,v) in x.iteritems(): - x[k] = resolve_all(v) + x[k] = resolve_all(v, default=default) return x def decipher_all(decipher, objid, genno, x): diff --git a/tools/dumppdf.py b/tools/dumppdf.py index c3a9beb..0612124 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -9,7 +9,7 @@ import sys, re from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines -from pdfminer.pdfparser import PDFObjectNotFound +from pdfminer.pdftypes import PDFObjectNotFound from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value