Introducing PDFObjectNotFound
parent
3c3cba2ecc
commit
06425bba00
|
@ -31,6 +31,7 @@ class PDFNoValidXRef(PDFSyntaxError): pass
|
||||||
class PDFNoOutlines(PDFException): pass
|
class PDFNoOutlines(PDFException): pass
|
||||||
class PDFDestinationNotFound(PDFException): pass
|
class PDFDestinationNotFound(PDFException): pass
|
||||||
class PDFEncryptionError(PDFException): pass
|
class PDFEncryptionError(PDFException): pass
|
||||||
|
class PDFObjectNotFound(PDFException): pass
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
|
@ -64,6 +65,9 @@ class PDFXRef(PDFBaseXRef):
|
||||||
self.trailer = {}
|
self.trailer = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
|
||||||
|
|
||||||
def load(self, parser, debug=0):
|
def load(self, parser, debug=0):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
|
@ -308,27 +312,41 @@ class PDFDocument(object):
|
||||||
self._parsed_objs = {}
|
self._parsed_objs = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser, fallback=True):
|
||||||
"Set the document to use a given PDFParser object."
|
"Set the document to use a given PDFParser object."
|
||||||
if self._parser: return
|
if self._parser: return
|
||||||
self._parser = parser
|
self._parser = parser
|
||||||
# Retrieve the information of each header that was appended
|
# Retrieve the information of each header that was appended
|
||||||
# (maybe multiple times) at the end of the document.
|
# (maybe multiple times) at the end of the document.
|
||||||
|
try:
|
||||||
self.xrefs = parser.read_xref()
|
self.xrefs = parser.read_xref()
|
||||||
|
except PDFNoValidXRef:
|
||||||
|
fallback = True
|
||||||
|
if fallback:
|
||||||
|
self.xrefs.extend(parser.read_xref(fallback=True))
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.get_trailer()
|
trailer = xref.get_trailer()
|
||||||
if not trailer: continue
|
if not trailer: continue
|
||||||
# If there's an encryption info, remember it.
|
# If there's an encryption info, remember it.
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
#assert not self.encryption
|
#assert not self.encryption
|
||||||
|
try:
|
||||||
self.encryption = (list_value(trailer['ID']),
|
self.encryption = (list_value(trailer['ID']),
|
||||||
dict_value(trailer['Encrypt']))
|
dict_value(trailer['Encrypt']))
|
||||||
|
except PDFObjectNotFound, e:
|
||||||
|
pass
|
||||||
if 'Info' in trailer:
|
if 'Info' in trailer:
|
||||||
|
try:
|
||||||
self.info.append(dict_value(trailer['Info']))
|
self.info.append(dict_value(trailer['Info']))
|
||||||
|
except PDFObjectNotFound, e:
|
||||||
|
pass
|
||||||
if 'Root' in trailer:
|
if 'Root' in trailer:
|
||||||
# Every PDF file must have exactly one /Root dictionary.
|
# Every PDF file must have exactly one /Root dictionary.
|
||||||
|
try:
|
||||||
self.catalog = dict_value(trailer['Root'])
|
self.catalog = dict_value(trailer['Root'])
|
||||||
break
|
break
|
||||||
|
except PDFObjectNotFound, e:
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
||||||
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||||
|
@ -404,6 +422,7 @@ class PDFDocument(object):
|
||||||
return Arcfour(key).process(data)
|
return Arcfour(key).process(data)
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD('obj')
|
KEYWORD_OBJ = KWD('obj')
|
||||||
|
# can raise PDFObjectNotFound
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
if not self.xrefs:
|
if not self.xrefs:
|
||||||
raise PDFException('PDFDocument is not initialized')
|
raise PDFException('PDFDocument is not initialized')
|
||||||
|
@ -420,10 +439,7 @@ class PDFDocument(object):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
raise PDFObjectNotFound(objid)
|
||||||
raise PDFSyntaxError('Cannot locate objid=%r' % objid)
|
|
||||||
# return null for a nonexistent reference.
|
|
||||||
return None
|
|
||||||
if strmid:
|
if strmid:
|
||||||
stream = stream_value(self.getobj(strmid))
|
stream = stream_value(self.getobj(strmid))
|
||||||
if stream.get('Type') is not LITERAL_OBJSTM:
|
if stream.get('Type') is not LITERAL_OBJSTM:
|
||||||
|
@ -454,14 +470,12 @@ class PDFDocument(object):
|
||||||
try:
|
try:
|
||||||
obj = objs[i]
|
obj = objs[i]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
if STRICT:
|
raise PDFObjectNotFound(objid)
|
||||||
raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
|
|
||||||
# return None for an invalid object number
|
|
||||||
return None
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
obj.set_objid(objid, 0)
|
obj.set_objid(objid, 0)
|
||||||
else:
|
else:
|
||||||
self._parser.seek(index)
|
self._parser.seek(index)
|
||||||
|
try:
|
||||||
(_,objid1) = self._parser.nexttoken() # objid
|
(_,objid1) = self._parser.nexttoken() # objid
|
||||||
(_,genno) = self._parser.nexttoken() # genno
|
(_,genno) = self._parser.nexttoken() # genno
|
||||||
(_,kwd) = self._parser.nexttoken()
|
(_,kwd) = self._parser.nexttoken()
|
||||||
|
@ -478,12 +492,11 @@ class PDFDocument(object):
|
||||||
# #### end hack around malformed pdf files
|
# #### end hack around malformed pdf files
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||||
try:
|
|
||||||
(_,obj) = self._parser.nextobject()
|
(_,obj) = self._parser.nextobject()
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
obj.set_objid(objid, genno)
|
obj.set_objid(objid, genno)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
return None
|
raise PDFObjectNotFound(objid)
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
if self.caching:
|
if self.caching:
|
||||||
|
@ -492,6 +505,12 @@ class PDFDocument(object):
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
def get_objects(self):
|
||||||
|
for xref in self.xrefs:
|
||||||
|
for objid in xref.get_objids():
|
||||||
|
yield self.getobj(objid)
|
||||||
|
return
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
def get_pages(self):
|
def get_pages(self):
|
||||||
if not self.xrefs:
|
if not self.xrefs:
|
||||||
|
@ -516,9 +535,14 @@ class PDFDocument(object):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>sys.stderr, 'Page: %r' % tree
|
print >>sys.stderr, 'Page: %r' % tree
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
if 'Pages' not in self.catalog: return
|
if 'Pages' in self.catalog:
|
||||||
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
|
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||||
yield PDFPage(self, pageid, tree)
|
yield PDFPage(self, pageid, tree)
|
||||||
|
else:
|
||||||
|
# fallback when /Pages is missing.
|
||||||
|
for obj in self.get_objects():
|
||||||
|
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGES:
|
||||||
|
yield PDFPage(self, pageid, obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_outlines(self):
|
def get_outlines(self):
|
||||||
|
@ -592,6 +616,7 @@ class PDFParser(PSStackParser):
|
||||||
Typical usage:
|
Typical usage:
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
parser.read_xref()
|
parser.read_xref()
|
||||||
|
parser.read_xref(fallback=True) # optional
|
||||||
parser.set_document(doc)
|
parser.set_document(doc)
|
||||||
parser.seek(offset)
|
parser.seek(offset)
|
||||||
parser.nextobject()
|
parser.nextobject()
|
||||||
|
@ -741,20 +766,17 @@ class PDFParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
# read xref tables and trailers
|
# read xref tables and trailers
|
||||||
def read_xref(self):
|
def read_xref(self, fallback=False):
|
||||||
"""Reads all the XRefs in the PDF file and returns them."""
|
"""Reads all the XRefs in the PDF file and returns them."""
|
||||||
xrefs = []
|
xrefs = []
|
||||||
try:
|
self.fallback = fallback
|
||||||
pos = self.find_xref()
|
if self.fallback:
|
||||||
self.read_xref_from(pos, xrefs)
|
|
||||||
except PDFNoValidXRef:
|
|
||||||
# fallback
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>sys.stderr, 'no xref, fallback'
|
|
||||||
self.fallback = True
|
|
||||||
xref = PDFXRef()
|
xref = PDFXRef()
|
||||||
xref.load_fallback(self)
|
xref.load_fallback(self)
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
|
else:
|
||||||
|
pos = self.find_xref()
|
||||||
|
self.read_xref_from(pos, xrefs)
|
||||||
return xrefs
|
return xrefs
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
import sys, re
|
import sys, re
|
||||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
||||||
|
from pdfminer.pdfparser import PDFObjectNotFound
|
||||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,8 +98,8 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
out.write('<object id="%d">\n' % objid)
|
out.write('<object id="%d">\n' % objid)
|
||||||
dumpxml(out, obj, codec=codec)
|
dumpxml(out, obj, codec=codec)
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except:
|
except PDFObjectNotFound, e:
|
||||||
raise
|
print >>sys.stderr, 'not found: %r' % e
|
||||||
dumptrailers(out, doc)
|
dumptrailers(out, doc)
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue