Added fallback for broken PDFs.

2013-10-09 22:45:54 +09:00 · 2013-10-09 22:45:54 +09:00 · 1467fc674c
parent eabe72ee63
commit 1467fc674c
4 changed files with 29 additions and 37 deletions
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -18,7 +18,7 @@ from pdffont import PDFFontError
 from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
 from pdffont import PDFCIDFont
 from pdfparser import PDFDocument, PDFParser
-from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound
+from pdfparser import PDFPasswordIncorrect
 from pdfcolor import PDFColorSpace
 from pdfcolor import PREDEFINED_COLORSPACE
 from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
@@ -335,10 +335,7 @@ class PDFPageInterpreter(object):
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    try:
                    spec = dict_value(spec)
                    except PDFObjectNotFound:
                        spec = {}
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
            elif k == 'ColorSpace':
                for (csid,spec) in dict_value(v).iteritems():
@@ -634,6 +631,7 @@ class PDFPageInterpreter(object):
        except KeyError:
            if STRICT:
                raise PDFInterpreterError('Undefined Font id: %r' % fontid)
            self.textstate.font = self.rsrcmgr.get_font(None, {})
        self.textstate.fontsize = fontsize
        return
    # setrendering
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF
 from psparser import literal_name
 from psparser import LIT, KWD, STRICT
 from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
-from pdftypes import PDFStream, PDFObjRef
+from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
 from pdftypes import resolve1, decipher_all
 from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
@@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass
 class PDFNoOutlines(PDFException): pass
 class PDFDestinationNotFound(PDFException): pass
 class PDFEncryptionError(PDFException): pass
-class PDFObjectNotFound(PDFException): pass
 class PDFPasswordIncorrect(PDFEncryptionError): pass
 # some predefined literals and keywords.
@@ -330,23 +329,14 @@ class PDFDocument(object):
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                #assert not self.encryption
                try:
                self.encryption = (list_value(trailer['ID']),
                                   dict_value(trailer['Encrypt']))
                except PDFObjectNotFound, e:
                    pass
            if 'Info' in trailer:
                try:
                self.info.append(dict_value(trailer['Info']))
                except PDFObjectNotFound, e:
                    pass
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                try:
                self.catalog = dict_value(trailer['Root'])
                break
                except PDFObjectNotFound, e:
                    pass
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
@@ -529,12 +519,12 @@ class PDFDocument(object):
                if 1 <= self.debug:
                    print >>sys.stderr, 'Page: %r' % tree
                yield (objid, tree)
-        try:
+        pages = False
        if 'Pages' in self.catalog:
            for (objid,tree) in search(self.catalog['Pages'], self.catalog):
                yield PDFPage(self, objid, tree)
-                return
+                pages = True
-        except PDFObjectNotFound:
+        if not pages:
            # fallback when /Pages is missing.
            for xref in self.xrefs:
                for objid in xref.get_objids():
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -28,7 +28,8 @@ class PDFObject(PSObject): pass
 class PDFException(PSException): pass
 class PDFTypeError(PDFException): pass
 class PDFValueError(PDFException): pass
-class PDFNotImplementedError(PSException): pass
+class PDFObjectNotFound(PDFException): pass
+class PDFNotImplementedError(PDFException): pass
 ##  PDFObjRef
@@ -47,34 +48,37 @@ class PDFObjRef(PDFObject):
    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)
-    def resolve(self):
+    def resolve(self, default=None):
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
 # resolve
-def resolve1(x):
+def resolve1(x, default=None):
    """Resolves an object.
    If this is an array or dictionary, it may still contains
    some indirect objects inside.
    """
    while isinstance(x, PDFObjRef):
-        x = x.resolve()
+        x = x.resolve(default=default)
    return x
-def resolve_all(x):
+def resolve_all(x, default=None):
    """Recursively resolves the given object and all the internals.
    Make sure there is no indirect reference within the nested object.
    This procedure might be slow.
    """
    while isinstance(x, PDFObjRef):
-        x = x.resolve()
+        x = x.resolve(default=default)
    if isinstance(x, list):
-        x = [ resolve_all(v) for v in x ]
+        x = [ resolve_all(v, default=default) for v in x ]
    elif isinstance(x, dict):
        for (k,v) in x.iteritems():
-            x[k] = resolve_all(v)
+            x[k] = resolve_all(v, default=default)
    return x
 def decipher_all(decipher, objid, genno, x):
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -9,7 +9,7 @@
 import sys, re
 from pdfminer.psparser import PSKeyword, PSLiteral
 from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
-from pdfminer.pdfparser import PDFObjectNotFound
+from pdfminer.pdftypes import PDFObjectNotFound
 from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value