Added fallback for broken PDFs.

pull/1/head
Yusuke Shinyama 2013-10-09 22:45:54 +09:00
parent eabe72ee63
commit 1467fc674c
4 changed files with 29 additions and 37 deletions

View File

@ -18,7 +18,7 @@ from pdffont import PDFFontError
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFCIDFont
from pdfparser import PDFDocument, PDFParser
from pdfparser import PDFPasswordIncorrect, PDFObjectNotFound
from pdfparser import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
@ -335,10 +335,7 @@ class PDFPageInterpreter(object):
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
try:
spec = dict_value(spec)
except PDFObjectNotFound:
spec = {}
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
@ -634,6 +631,7 @@ class PDFPageInterpreter(object):
except KeyError:
if STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize
return
# setrendering

View File

@ -15,7 +15,7 @@ from psparser import PSSyntaxError, PSEOF
from psparser import literal_name
from psparser import LIT, KWD, STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFStream, PDFObjRef
from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
from pdftypes import resolve1, decipher_all
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
@ -31,7 +31,6 @@ class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords.
@ -330,23 +329,14 @@ class PDFDocument(object):
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
try:
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
except PDFObjectNotFound, e:
pass
if 'Info' in trailer:
try:
self.info.append(dict_value(trailer['Info']))
except PDFObjectNotFound, e:
pass
if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary.
try:
self.catalog = dict_value(trailer['Root'])
break
except PDFObjectNotFound, e:
pass
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG:
@ -529,12 +519,12 @@ class PDFDocument(object):
if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree)
try:
pages = False
if 'Pages' in self.catalog:
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, objid, tree)
return
except PDFObjectNotFound:
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in self.xrefs:
for objid in xref.get_objids():

View File

@ -28,7 +28,8 @@ class PDFObject(PSObject): pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass
class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
## PDFObjRef
@ -47,34 +48,37 @@ class PDFObjRef(PDFObject):
def __repr__(self):
return '<PDFObjRef:%d>' % (self.objid)
def resolve(self):
def resolve(self, default=None):
try:
return self.doc.getobj(self.objid)
except PDFObjectNotFound:
return default
# resolve
def resolve1(x):
def resolve1(x, default=None):
"""Resolves an object.
If this is an array or dictionary, it may still contain
some indirect objects inside.
"""
while isinstance(x, PDFObjRef):
x = x.resolve()
x = x.resolve(default=default)
return x
def resolve_all(x):
def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals.
Make sure there is no indirect reference within the nested object.
This procedure might be slow.
"""
while isinstance(x, PDFObjRef):
x = x.resolve()
x = x.resolve(default=default)
if isinstance(x, list):
x = [ resolve_all(v) for v in x ]
x = [ resolve_all(v, default=default) for v in x ]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
x[k] = resolve_all(v)
x[k] = resolve_all(v, default=default)
return x
def decipher_all(decipher, objid, genno, x):

View File

@ -9,7 +9,7 @@
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdfparser import PDFObjectNotFound
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value