Code cleanup.
parent
cfd60eafbf
commit
e4bc4e43b1
|
@ -47,6 +47,9 @@ class PDFBaseXRef(object):
|
||||||
def get_objids(self):
|
def get_objids(self):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Must return
|
||||||
|
# (strmid, index, genno)
|
||||||
|
# or (None, pos, genno)
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid):
|
||||||
raise KeyError(objid)
|
raise KeyError(objid)
|
||||||
|
|
||||||
|
@ -92,7 +95,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
if use != 'n': continue
|
if use != 'n': continue
|
||||||
self.offsets[objid] = (int(genno), long(pos))
|
self.offsets[objid] = (None, long(pos), int(genno))
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>sys.stderr, 'xref objects:', self.offsets
|
print >>sys.stderr, 'xref objects:', self.offsets
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
|
@ -120,10 +123,9 @@ class PDFXRef(PDFBaseXRef):
|
||||||
|
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid):
|
||||||
try:
|
try:
|
||||||
(genno, pos) = self.offsets[objid]
|
return self.offsets[objid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise
|
raise
|
||||||
return (None, pos)
|
|
||||||
|
|
||||||
|
|
||||||
## PDFXRefFallback
|
## PDFXRefFallback
|
||||||
|
@ -147,7 +149,7 @@ class PDFXRefFallback(PDFXRef):
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m: continue
|
if not m: continue
|
||||||
(objid, genno) = m.groups()
|
(objid, genno) = m.groups()
|
||||||
self.offsets[int(objid)] = (0, pos)
|
self.offsets[int(objid)] = (None, pos, int(genno))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -163,7 +165,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3)
|
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
||||||
|
|
||||||
def load(self, parser, debug=0):
|
def load(self, parser, debug=0):
|
||||||
(_,objid) = parser.nexttoken() # ignored
|
(_,objid) = parser.nexttoken() # ignored
|
||||||
|
@ -208,14 +210,13 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
offset = self.entlen * index
|
offset = self.entlen * index
|
||||||
ent = self.data[offset:offset+self.entlen]
|
ent = self.data[offset:offset+self.entlen]
|
||||||
f1 = nunpack(ent[:self.fl1], 1)
|
f1 = nunpack(ent[:self.fl1], 1)
|
||||||
|
f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
|
||||||
|
f3 = nunpack(ent[self.fl1+self.fl2:])
|
||||||
if f1 == 1:
|
if f1 == 1:
|
||||||
pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
|
return (None, f2, f3)
|
||||||
genno = nunpack(ent[self.fl1+self.fl2:])
|
|
||||||
return (None, pos)
|
|
||||||
elif f1 == 2:
|
elif f1 == 2:
|
||||||
objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
|
return (f2, f3, 0)
|
||||||
index = nunpack(ent[self.fl1+self.fl2:])
|
else:
|
||||||
return (objid, index)
|
|
||||||
# this is a free object
|
# this is a free object
|
||||||
raise KeyError(objid)
|
raise KeyError(objid)
|
||||||
|
|
||||||
|
@ -409,27 +410,7 @@ class PDFDocument(object):
|
||||||
key = hash.digest()[:min(len(key),16)]
|
key = hash.digest()[:min(len(key),16)]
|
||||||
return Arcfour(key).process(data)
|
return Arcfour(key).process(data)
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD('obj')
|
def _getobj_objstm(self, stream, index, objid):
|
||||||
# can raise PDFObjectNotFound
|
|
||||||
def getobj(self, objid):
|
|
||||||
if not self.xrefs:
|
|
||||||
raise PDFException('PDFDocument is not initialized')
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>sys.stderr, 'getobj: objid=%r' % (objid)
|
|
||||||
if objid in self._cached_objs:
|
|
||||||
genno = 0
|
|
||||||
obj = self._cached_objs[objid]
|
|
||||||
else:
|
|
||||||
for xref in self.xrefs:
|
|
||||||
try:
|
|
||||||
(strmid, index) = xref.get_pos(objid)
|
|
||||||
break
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise PDFObjectNotFound(objid)
|
|
||||||
if strmid:
|
|
||||||
stream = stream_value(self.getobj(strmid))
|
|
||||||
if stream.get('Type') is not LITERAL_OBJSTM:
|
if stream.get('Type') is not LITERAL_OBJSTM:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||||
|
@ -439,8 +420,8 @@ class PDFDocument(object):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
n = 0
|
n = 0
|
||||||
if strmid in self._parsed_objs:
|
if stream.objid in self._parsed_objs:
|
||||||
objs = self._parsed_objs[strmid]
|
objs = self._parsed_objs[stream.objid]
|
||||||
else:
|
else:
|
||||||
parser = PDFStreamParser(stream.get_data())
|
parser = PDFStreamParser(stream.get_data())
|
||||||
parser.set_document(self)
|
parser.set_document(self)
|
||||||
|
@ -452,43 +433,58 @@ class PDFDocument(object):
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
if self.caching:
|
if self.caching:
|
||||||
self._parsed_objs[strmid] = objs
|
self._parsed_objs[stream.objid] = objs
|
||||||
genno = 0
|
|
||||||
i = n*2+index
|
i = n*2+index
|
||||||
try:
|
try:
|
||||||
obj = objs[i]
|
obj = objs[i]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
raise PDFObjectNotFound(objid)
|
raise PDFSyntaxError('index too big: %r' % index)
|
||||||
if isinstance(obj, PDFStream):
|
return obj
|
||||||
obj.set_objid(objid, 0)
|
|
||||||
else:
|
KEYWORD_OBJ = KWD('obj')
|
||||||
self._parser.seek(index)
|
def _getobj_parse(self, pos, objid):
|
||||||
try:
|
self._parser.seek(pos)
|
||||||
(_,objid1) = self._parser.nexttoken() # objid
|
(_,objid1) = self._parser.nexttoken() # objid
|
||||||
|
if objid1 != objid:
|
||||||
|
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||||
(_,genno) = self._parser.nexttoken() # genno
|
(_,genno) = self._parser.nexttoken() # genno
|
||||||
(_,kwd) = self._parser.nexttoken()
|
(_,kwd) = self._parser.nexttoken()
|
||||||
# #### hack around malformed pdf files
|
|
||||||
#assert objid1 == objid, (objid, objid1)
|
|
||||||
if objid1 != objid:
|
|
||||||
x = []
|
|
||||||
while kwd is not self.KEYWORD_OBJ:
|
|
||||||
(_,kwd) = self._parser.nexttoken()
|
|
||||||
x.append(kwd)
|
|
||||||
if x:
|
|
||||||
objid1 = x[-2]
|
|
||||||
genno = x[-1]
|
|
||||||
# #### end hack around malformed pdf files
|
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||||
(_,obj) = self._parser.nextobject()
|
(_,obj) = self._parser.nextobject()
|
||||||
|
return obj
|
||||||
|
|
||||||
|
# can raise PDFObjectNotFound
|
||||||
|
def getobj(self, objid):
|
||||||
|
if not self.xrefs:
|
||||||
|
raise PDFException('PDFDocument is not initialized')
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>sys.stderr, 'getobj: objid=%r' % (objid)
|
||||||
|
if objid in self._cached_objs:
|
||||||
|
(obj, genno) = self._cached_objs[objid]
|
||||||
|
else:
|
||||||
|
for xref in self.xrefs:
|
||||||
|
try:
|
||||||
|
(strmid, index, genno) = xref.get_pos(objid)
|
||||||
|
except KeyError:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
if strmid is not None:
|
||||||
|
stream = stream_value(self.getobj(strmid))
|
||||||
|
obj = self._getobj_objstm(stream, index, objid)
|
||||||
|
else:
|
||||||
|
obj = self._getobj_parse(index, objid)
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
obj.set_objid(objid, genno)
|
obj.set_objid(objid, genno)
|
||||||
except PSEOF:
|
break
|
||||||
|
except (PSEOF, PDFSyntaxError):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
raise PDFObjectNotFound(objid)
|
raise PDFObjectNotFound(objid)
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
if self.caching:
|
if self.caching:
|
||||||
self._cached_objs[objid] = obj
|
self._cached_objs[objid] = (obj, genno)
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
Loading…
Reference in New Issue