Added fallback for broken PDFs.
parent
eabe72ee63
commit
1467fc674c
|
@ -18,7 +18,7 @@ from pdffont import PDFFontError
|
||||||
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
|
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
|
||||||
from pdffont import PDFCIDFont
|
from pdffont import PDFCIDFont
|
||||||
from pdfparser import PDFDocument, PDFParser
|
from pdfparser import PDFDocument, PDFParser
|
||||||
from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound
|
from pdfparser import PDFPasswordIncorrect
|
||||||
from pdfcolor import PDFColorSpace
|
from pdfcolor import PDFColorSpace
|
||||||
from pdfcolor import PREDEFINED_COLORSPACE
|
from pdfcolor import PREDEFINED_COLORSPACE
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||||
|
@ -335,10 +335,7 @@ class PDFPageInterpreter(object):
|
||||||
objid = None
|
objid = None
|
||||||
if isinstance(spec, PDFObjRef):
|
if isinstance(spec, PDFObjRef):
|
||||||
objid = spec.objid
|
objid = spec.objid
|
||||||
try:
|
|
||||||
spec = dict_value(spec)
|
spec = dict_value(spec)
|
||||||
except PDFObjectNotFound:
|
|
||||||
spec = {}
|
|
||||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||||
elif k == 'ColorSpace':
|
elif k == 'ColorSpace':
|
||||||
for (csid,spec) in dict_value(v).iteritems():
|
for (csid,spec) in dict_value(v).iteritems():
|
||||||
|
@ -634,6 +631,7 @@ class PDFPageInterpreter(object):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
||||||
|
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||||
self.textstate.fontsize = fontsize
|
self.textstate.fontsize = fontsize
|
||||||
return
|
return
|
||||||
# setrendering
|
# setrendering
|
||||||
|
|
|
@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF
|
||||||
from psparser import literal_name
|
from psparser import literal_name
|
||||||
from psparser import LIT, KWD, STRICT
|
from psparser import LIT, KWD, STRICT
|
||||||
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
||||||
from pdftypes import PDFStream, PDFObjRef
|
from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
|
||||||
from pdftypes import resolve1, decipher_all
|
from pdftypes import resolve1, decipher_all
|
||||||
from pdftypes import int_value, float_value, num_value
|
from pdftypes import int_value, float_value, num_value
|
||||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
|
@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass
|
||||||
class PDFNoOutlines(PDFException): pass
|
class PDFNoOutlines(PDFException): pass
|
||||||
class PDFDestinationNotFound(PDFException): pass
|
class PDFDestinationNotFound(PDFException): pass
|
||||||
class PDFEncryptionError(PDFException): pass
|
class PDFEncryptionError(PDFException): pass
|
||||||
class PDFObjectNotFound(PDFException): pass
|
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
|
@ -330,23 +329,14 @@ class PDFDocument(object):
|
||||||
# If there's an encryption info, remember it.
|
# If there's an encryption info, remember it.
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
#assert not self.encryption
|
#assert not self.encryption
|
||||||
try:
|
|
||||||
self.encryption = (list_value(trailer['ID']),
|
self.encryption = (list_value(trailer['ID']),
|
||||||
dict_value(trailer['Encrypt']))
|
dict_value(trailer['Encrypt']))
|
||||||
except PDFObjectNotFound, e:
|
|
||||||
pass
|
|
||||||
if 'Info' in trailer:
|
if 'Info' in trailer:
|
||||||
try:
|
|
||||||
self.info.append(dict_value(trailer['Info']))
|
self.info.append(dict_value(trailer['Info']))
|
||||||
except PDFObjectNotFound, e:
|
|
||||||
pass
|
|
||||||
if 'Root' in trailer:
|
if 'Root' in trailer:
|
||||||
# Every PDF file must have exactly one /Root dictionary.
|
# Every PDF file must have exactly one /Root dictionary.
|
||||||
try:
|
|
||||||
self.catalog = dict_value(trailer['Root'])
|
self.catalog = dict_value(trailer['Root'])
|
||||||
break
|
break
|
||||||
except PDFObjectNotFound, e:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
||||||
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||||
|
@ -529,12 +519,12 @@ class PDFDocument(object):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>sys.stderr, 'Page: %r' % tree
|
print >>sys.stderr, 'Page: %r' % tree
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
try:
|
pages = False
|
||||||
if 'Pages' in self.catalog:
|
if 'Pages' in self.catalog:
|
||||||
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
|
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||||
yield PDFPage(self, objid, tree)
|
yield PDFPage(self, objid, tree)
|
||||||
return
|
pages = True
|
||||||
except PDFObjectNotFound:
|
if not pages:
|
||||||
# fallback when /Pages is missing.
|
# fallback when /Pages is missing.
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
for objid in xref.get_objids():
|
for objid in xref.get_objids():
|
||||||
|
|
|
@ -28,7 +28,8 @@ class PDFObject(PSObject): pass
|
||||||
class PDFException(PSException): pass
|
class PDFException(PSException): pass
|
||||||
class PDFTypeError(PDFException): pass
|
class PDFTypeError(PDFException): pass
|
||||||
class PDFValueError(PDFException): pass
|
class PDFValueError(PDFException): pass
|
||||||
class PDFNotImplementedError(PSException): pass
|
class PDFObjectNotFound(PDFException): pass
|
||||||
|
class PDFNotImplementedError(PDFException): pass
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
|
@ -47,34 +48,37 @@ class PDFObjRef(PDFObject):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFObjRef:%d>' % (self.objid)
|
return '<PDFObjRef:%d>' % (self.objid)
|
||||||
|
|
||||||
def resolve(self):
|
def resolve(self, default=None):
|
||||||
|
try:
|
||||||
return self.doc.getobj(self.objid)
|
return self.doc.getobj(self.objid)
|
||||||
|
except PDFObjectNotFound:
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
# resolve
|
# resolve
|
||||||
def resolve1(x):
|
def resolve1(x, default=None):
|
||||||
"""Resolves an object.
|
"""Resolves an object.
|
||||||
|
|
||||||
If this is an array or dictionary, it may still contain
|
If this is an array or dictionary, it may still contain
|
||||||
some indirect objects inside.
|
some indirect objects inside.
|
||||||
"""
|
"""
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve(default=default)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def resolve_all(x):
|
def resolve_all(x, default=None):
|
||||||
"""Recursively resolves the given object and all the internals.
|
"""Recursively resolves the given object and all the internals.
|
||||||
|
|
||||||
Make sure there is no indirect reference within the nested object.
|
Make sure there is no indirect reference within the nested object.
|
||||||
This procedure might be slow.
|
This procedure might be slow.
|
||||||
"""
|
"""
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve(default=default)
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ resolve_all(v) for v in x ]
|
x = [ resolve_all(v, default=default) for v in x ]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k,v) in x.iteritems():
|
||||||
x[k] = resolve_all(v)
|
x[k] = resolve_all(v, default=default)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def decipher_all(decipher, objid, genno, x):
|
def decipher_all(decipher, objid, genno, x):
|
||||||
|
|
|
@ -9,7 +9,7 @@
|
||||||
import sys, re
|
import sys, re
|
||||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
||||||
from pdfminer.pdfparser import PDFObjectNotFound
|
from pdfminer.pdftypes import PDFObjectNotFound
|
||||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue