Introducing PDFObjectNotFound

pull/1/head
Yusuke Shinyama 2013-10-09 21:39:23 +09:00
parent 3c3cba2ecc
commit 06425bba00
2 changed files with 70 additions and 47 deletions

View File

@ -31,6 +31,7 @@ class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass class PDFEncryptionError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords. # some predefined literals and keywords.
@ -64,6 +65,9 @@ class PDFXRef(PDFBaseXRef):
self.trailer = {} self.trailer = {}
return return
def __repr__(self):
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
def load(self, parser, debug=0): def load(self, parser, debug=0):
while 1: while 1:
try: try:
@ -308,27 +312,41 @@ class PDFDocument(object):
self._parsed_objs = {} self._parsed_objs = {}
return return
def set_parser(self, parser): def set_parser(self, parser, fallback=True):
"Set the document to use a given PDFParser object." "Set the document to use a given PDFParser object."
if self._parser: return if self._parser: return
self._parser = parser self._parser = parser
# Retrieve the information of each header that was appended # Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document. # (maybe multiple times) at the end of the document.
self.xrefs = parser.read_xref() try:
self.xrefs = parser.read_xref()
except PDFNoValidXRef:
fallback = True
if fallback:
self.xrefs.extend(parser.read_xref(fallback=True))
for xref in self.xrefs: for xref in self.xrefs:
trailer = xref.get_trailer() trailer = xref.get_trailer()
if not trailer: continue if not trailer: continue
# If there's an encryption info, remember it. # If there's an encryption info, remember it.
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
#assert not self.encryption #assert not self.encryption
self.encryption = (list_value(trailer['ID']), try:
dict_value(trailer['Encrypt'])) self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
except PDFObjectNotFound, e:
pass
if 'Info' in trailer: if 'Info' in trailer:
self.info.append(dict_value(trailer['Info'])) try:
self.info.append(dict_value(trailer['Info']))
except PDFObjectNotFound, e:
pass
if 'Root' in trailer: if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary. # Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root']) try:
break self.catalog = dict_value(trailer['Root'])
break
except PDFObjectNotFound, e:
pass
else: else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?') raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG: if self.catalog.get('Type') is not LITERAL_CATALOG:
@ -404,6 +422,7 @@ class PDFDocument(object):
return Arcfour(key).process(data) return Arcfour(key).process(data)
KEYWORD_OBJ = KWD('obj') KEYWORD_OBJ = KWD('obj')
# can raise PDFObjectNotFound
def getobj(self, objid): def getobj(self, objid):
if not self.xrefs: if not self.xrefs:
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
@ -420,10 +439,7 @@ class PDFDocument(object):
except KeyError: except KeyError:
pass pass
else: else:
if STRICT: raise PDFObjectNotFound(objid)
raise PDFSyntaxError('Cannot locate objid=%r' % objid)
# return null for a nonexistent reference.
return None
if strmid: if strmid:
stream = stream_value(self.getobj(strmid)) stream = stream_value(self.getobj(strmid))
if stream.get('Type') is not LITERAL_OBJSTM: if stream.get('Type') is not LITERAL_OBJSTM:
@ -454,36 +470,33 @@ class PDFDocument(object):
try: try:
obj = objs[i] obj = objs[i]
except IndexError: except IndexError:
if STRICT: raise PDFObjectNotFound(objid)
raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
# return None for an invalid object number
return None
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, 0) obj.set_objid(objid, 0)
else: else:
self._parser.seek(index) self._parser.seek(index)
(_,objid1) = self._parser.nexttoken() # objid
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
#assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
try: try:
(_,objid1) = self._parser.nexttoken() # objid
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
#assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self._parser.nextobject() (_,obj) = self._parser.nextobject()
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, genno) obj.set_objid(objid, genno)
except PSEOF: except PSEOF:
return None raise PDFObjectNotFound(objid)
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching: if self.caching:
@ -492,6 +505,12 @@ class PDFDocument(object):
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
def get_objects(self):
for xref in self.xrefs:
for objid in xref.get_objids():
yield self.getobj(objid)
return
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self): def get_pages(self):
if not self.xrefs: if not self.xrefs:
@ -516,9 +535,14 @@ class PDFDocument(object):
if 1 <= self.debug: if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree) yield (objid, tree)
if 'Pages' not in self.catalog: return if 'Pages' in self.catalog:
for (pageid,tree) in search(self.catalog['Pages'], self.catalog): for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, pageid, tree) yield PDFPage(self, pageid, tree)
else:
# fallback when /Pages is missing.
for obj in self.get_objects():
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGES:
yield PDFPage(self, pageid, obj)
return return
def get_outlines(self): def get_outlines(self):
@ -592,6 +616,7 @@ class PDFParser(PSStackParser):
Typical usage: Typical usage:
parser = PDFParser(fp) parser = PDFParser(fp)
parser.read_xref() parser.read_xref()
parser.read_xref(fallback=True) # optional
parser.set_document(doc) parser.set_document(doc)
parser.seek(offset) parser.seek(offset)
parser.nextobject() parser.nextobject()
@ -741,20 +766,17 @@ class PDFParser(PSStackParser):
return return
# read xref tables and trailers # read xref tables and trailers
def read_xref(self): def read_xref(self, fallback=False):
"""Reads all the XRefs in the PDF file and returns them.""" """Reads all the XRefs in the PDF file and returns them."""
xrefs = [] xrefs = []
try: self.fallback = fallback
pos = self.find_xref() if self.fallback:
self.read_xref_from(pos, xrefs)
except PDFNoValidXRef:
# fallback
if 1 <= self.debug:
print >>sys.stderr, 'no xref, fallback'
self.fallback = True
xref = PDFXRef() xref = PDFXRef()
xref.load_fallback(self) xref.load_fallback(self)
xrefs.append(xref) xrefs.append(xref)
else:
pos = self.find_xref()
self.read_xref_from(pos, xrefs)
return xrefs return xrefs

View File

@ -9,6 +9,7 @@
import sys, re import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdfparser import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
@ -97,8 +98,8 @@ def dumpallobjs(out, doc, codec=None):
out.write('<object id="%d">\n' % objid) out.write('<object id="%d">\n' % objid)
dumpxml(out, obj, codec=codec) dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except: except PDFObjectNotFound, e:
raise print >>sys.stderr, 'not found: %r' % e
dumptrailers(out, doc) dumptrailers(out, doc)
out.write('</pdf>') out.write('</pdf>')
return return