Added fallback for broken PDFs.
parent
eabe72ee63
commit
1467fc674c
|
@ -18,7 +18,7 @@ from pdffont import PDFFontError
|
|||
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
|
||||
from pdffont import PDFCIDFont
|
||||
from pdfparser import PDFDocument, PDFParser
|
||||
from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound
|
||||
from pdfparser import PDFPasswordIncorrect
|
||||
from pdfcolor import PDFColorSpace
|
||||
from pdfcolor import PREDEFINED_COLORSPACE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||
|
@ -335,10 +335,7 @@ class PDFPageInterpreter(object):
|
|||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
try:
|
||||
spec = dict_value(spec)
|
||||
except PDFObjectNotFound:
|
||||
spec = {}
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,spec) in dict_value(v).iteritems():
|
||||
|
@ -634,6 +631,7 @@ class PDFPageInterpreter(object):
|
|||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
||||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||
self.textstate.fontsize = fontsize
|
||||
return
|
||||
# setrendering
|
||||
|
|
|
@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF
|
|||
from psparser import literal_name
|
||||
from psparser import LIT, KWD, STRICT
|
||||
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
||||
from pdftypes import PDFStream, PDFObjRef
|
||||
from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
|
||||
from pdftypes import resolve1, decipher_all
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
|
@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass
|
|||
class PDFNoOutlines(PDFException): pass
|
||||
class PDFDestinationNotFound(PDFException): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFObjectNotFound(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
|
||||
# some predefined literals and keywords.
|
||||
|
@ -330,23 +329,14 @@ class PDFDocument(object):
|
|||
# If there's an encryption info, remember it.
|
||||
if 'Encrypt' in trailer:
|
||||
#assert not self.encryption
|
||||
try:
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
except PDFObjectNotFound, e:
|
||||
pass
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
if 'Info' in trailer:
|
||||
try:
|
||||
self.info.append(dict_value(trailer['Info']))
|
||||
except PDFObjectNotFound, e:
|
||||
pass
|
||||
self.info.append(dict_value(trailer['Info']))
|
||||
if 'Root' in trailer:
|
||||
# Every PDF file must have exactly one /Root dictionary.
|
||||
try:
|
||||
self.catalog = dict_value(trailer['Root'])
|
||||
break
|
||||
except PDFObjectNotFound, e:
|
||||
pass
|
||||
self.catalog = dict_value(trailer['Root'])
|
||||
break
|
||||
else:
|
||||
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
||||
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||
|
@ -529,12 +519,12 @@ class PDFDocument(object):
|
|||
if 1 <= self.debug:
|
||||
print >>sys.stderr, 'Page: %r' % tree
|
||||
yield (objid, tree)
|
||||
try:
|
||||
if 'Pages' in self.catalog:
|
||||
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||
yield PDFPage(self, objid, tree)
|
||||
return
|
||||
except PDFObjectNotFound:
|
||||
pages = False
|
||||
if 'Pages' in self.catalog:
|
||||
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||
yield PDFPage(self, objid, tree)
|
||||
pages = True
|
||||
if not pages:
|
||||
# fallback when /Pages is missing.
|
||||
for xref in self.xrefs:
|
||||
for objid in xref.get_objids():
|
||||
|
|
|
@ -28,7 +28,8 @@ class PDFObject(PSObject): pass
|
|||
class PDFException(PSException): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFNotImplementedError(PSException): pass
|
||||
class PDFObjectNotFound(PDFException): pass
|
||||
class PDFNotImplementedError(PDFException): pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
|
@ -47,34 +48,37 @@ class PDFObjRef(PDFObject):
|
|||
def __repr__(self):
|
||||
return '<PDFObjRef:%d>' % (self.objid)
|
||||
|
||||
def resolve(self):
|
||||
return self.doc.getobj(self.objid)
|
||||
def resolve(self, default=None):
|
||||
try:
|
||||
return self.doc.getobj(self.objid)
|
||||
except PDFObjectNotFound:
|
||||
return default
|
||||
|
||||
|
||||
# resolve
|
||||
def resolve1(x):
|
||||
def resolve1(x, default=None):
|
||||
"""Resolves an object.
|
||||
|
||||
If this is an array or dictionary, it may still contains
|
||||
some indirect objects inside.
|
||||
"""
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
x = x.resolve(default=default)
|
||||
return x
|
||||
|
||||
def resolve_all(x):
|
||||
def resolve_all(x, default=None):
|
||||
"""Recursively resolves the given object and all the internals.
|
||||
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
This procedure might be slow.
|
||||
"""
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
x = x.resolve(default=default)
|
||||
if isinstance(x, list):
|
||||
x = [ resolve_all(v) for v in x ]
|
||||
x = [ resolve_all(v, default=default) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = resolve_all(v)
|
||||
x[k] = resolve_all(v, default=default)
|
||||
return x
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
import sys, re
|
||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
||||
from pdfminer.pdfparser import PDFObjectNotFound
|
||||
from pdfminer.pdftypes import PDFObjectNotFound
|
||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue