Added fallback for broken PDFs.

pull/1/head
Yusuke Shinyama 2013-10-09 22:45:54 +09:00
parent eabe72ee63
commit 1467fc674c
4 changed files with 29 additions and 37 deletions

View File

@@ -18,7 +18,7 @@ from pdffont import PDFFontError
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFCIDFont from pdffont import PDFCIDFont
from pdfparser import PDFDocument, PDFParser from pdfparser import PDFDocument, PDFParser
from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound from pdfparser import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
@@ -335,10 +335,7 @@ class PDFPageInterpreter(object):
objid = None objid = None
if isinstance(spec, PDFObjRef): if isinstance(spec, PDFObjRef):
objid = spec.objid objid = spec.objid
try:
spec = dict_value(spec) spec = dict_value(spec)
except PDFObjectNotFound:
spec = {}
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace': elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems(): for (csid,spec) in dict_value(v).iteritems():
@@ -634,6 +631,7 @@ class PDFPageInterpreter(object):
except KeyError: except KeyError:
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid) raise PDFInterpreterError('Undefined Font id: %r' % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize self.textstate.fontsize = fontsize
return return
# setrendering # setrendering

View File

@@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF
from psparser import literal_name from psparser import literal_name
from psparser import LIT, KWD, STRICT from psparser import LIT, KWD, STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFStream, PDFObjRef from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
from pdftypes import resolve1, decipher_all from pdftypes import resolve1, decipher_all
from pdftypes import int_value, float_value, num_value from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value from pdftypes import str_value, list_value, dict_value, stream_value
@@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass class PDFEncryptionError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords. # some predefined literals and keywords.
@@ -330,23 +329,14 @@ class PDFDocument(object):
# If there's an encryption info, remember it. # If there's an encryption info, remember it.
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
#assert not self.encryption #assert not self.encryption
try:
self.encryption = (list_value(trailer['ID']), self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt'])) dict_value(trailer['Encrypt']))
except PDFObjectNotFound, e:
pass
if 'Info' in trailer: if 'Info' in trailer:
try:
self.info.append(dict_value(trailer['Info'])) self.info.append(dict_value(trailer['Info']))
except PDFObjectNotFound, e:
pass
if 'Root' in trailer: if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary. # Every PDF file must have exactly one /Root dictionary.
try:
self.catalog = dict_value(trailer['Root']) self.catalog = dict_value(trailer['Root'])
break break
except PDFObjectNotFound, e:
pass
else: else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?') raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG: if self.catalog.get('Type') is not LITERAL_CATALOG:
@@ -529,12 +519,12 @@ class PDFDocument(object):
if 1 <= self.debug: if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree) yield (objid, tree)
try: pages = False
if 'Pages' in self.catalog: if 'Pages' in self.catalog:
for (objid,tree) in search(self.catalog['Pages'], self.catalog): for (objid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, objid, tree) yield PDFPage(self, objid, tree)
return pages = True
except PDFObjectNotFound: if not pages:
# fallback when /Pages is missing. # fallback when /Pages is missing.
for xref in self.xrefs: for xref in self.xrefs:
for objid in xref.get_objids(): for objid in xref.get_objids():

View File

@@ -28,7 +28,8 @@ class PDFObject(PSObject): pass
class PDFException(PSException): pass class PDFException(PSException): pass
class PDFTypeError(PDFException): pass class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
## PDFObjRef ## PDFObjRef
@@ -47,34 +48,37 @@ class PDFObjRef(PDFObject):
def __repr__(self): def __repr__(self):
return '<PDFObjRef:%d>' % (self.objid) return '<PDFObjRef:%d>' % (self.objid)
def resolve(self): def resolve(self, default=None):
try:
return self.doc.getobj(self.objid) return self.doc.getobj(self.objid)
except PDFObjectNotFound:
return default
# resolve # resolve
def resolve1(x): def resolve1(x, default=None):
"""Resolves an object. """Resolves an object.
If this is an array or dictionary, it may still contains If this is an array or dictionary, it may still contains
some indirect objects inside. some indirect objects inside.
""" """
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve(default=default)
return x return x
def resolve_all(x): def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals. """Recursively resolves the given object and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
This procedure might be slow. This procedure might be slow.
""" """
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve(default=default)
if isinstance(x, list): if isinstance(x, list):
x = [ resolve_all(v) for v in x ] x = [ resolve_all(v, default=default) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = resolve_all(v) x[k] = resolve_all(v, default=default)
return x return x
def decipher_all(decipher, objid, genno, x): def decipher_all(decipher, objid, genno, x):

View File

@@ -9,7 +9,7 @@
import sys, re import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdfparser import PDFObjectNotFound from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value