Code cleanup.

pull/1/head
Yusuke Shinyama 2013-10-10 19:17:58 +09:00
parent cfd60eafbf
commit e4bc4e43b1
1 changed files with 71 additions and 75 deletions

View File

@ -47,6 +47,9 @@ class PDFBaseXRef(object):
def get_objids(self): def get_objids(self):
return [] return []
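# get_pos() must return a 3-tuple, one of:
#   (strmid, index, genno) - the object is entry `index` of object stream `strmid`
#   (None, pos, genno)     - the object is parsed from file offset `pos`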
def get_pos(self, objid): def get_pos(self, objid):
raise KeyError(objid) raise KeyError(objid)
@ -92,7 +95,7 @@ class PDFXRef(PDFBaseXRef):
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f (pos, genno, use) = f
if use != 'n': continue if use != 'n': continue
self.offsets[objid] = (int(genno), long(pos)) self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug: if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets print >>sys.stderr, 'xref objects:', self.offsets
self.load_trailer(parser) self.load_trailer(parser)
@ -120,10 +123,9 @@ class PDFXRef(PDFBaseXRef):
def get_pos(self, objid): def get_pos(self, objid):
try: try:
(genno, pos) = self.offsets[objid] return self.offsets[objid]
except KeyError: except KeyError:
raise raise
return (None, pos)
## PDFXRefFallback ## PDFXRefFallback
@ -147,7 +149,7 @@ class PDFXRefFallback(PDFXRef):
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
if not m: continue if not m: continue
(objid, genno) = m.groups() (objid, genno) = m.groups()
self.offsets[int(objid)] = (0, pos) self.offsets[int(objid)] = (None, pos, int(genno))
return return
@ -163,7 +165,7 @@ class PDFXRefStream(PDFBaseXRef):
return return
def __repr__(self): def __repr__(self):
return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3) return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser, debug=0): def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored (_,objid) = parser.nexttoken() # ignored
@ -208,16 +210,15 @@ class PDFXRefStream(PDFBaseXRef):
offset = self.entlen * index offset = self.entlen * index
ent = self.data[offset:offset+self.entlen] ent = self.data[offset:offset+self.entlen]
f1 = nunpack(ent[:self.fl1], 1) f1 = nunpack(ent[:self.fl1], 1)
f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
f3 = nunpack(ent[self.fl1+self.fl2:])
if f1 == 1: if f1 == 1:
pos = nunpack(ent[self.fl1:self.fl1+self.fl2]) return (None, f2, f3)
genno = nunpack(ent[self.fl1+self.fl2:])
return (None, pos)
elif f1 == 2: elif f1 == 2:
objid = nunpack(ent[self.fl1:self.fl1+self.fl2]) return (f2, f3, 0)
index = nunpack(ent[self.fl1+self.fl2:]) else:
return (objid, index) # this is a free object
# this is a free object raise KeyError(objid)
raise KeyError(objid)
## PDFPage ## PDFPage
@ -409,7 +410,50 @@ class PDFDocument(object):
key = hash.digest()[:min(len(key),16)] key = hash.digest()[:min(len(key),16)]
return Arcfour(key).process(data) return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid):
    """Return entry `index` from the parsed contents of object stream `stream`.

    The stream data is parsed into a flat list of objects; the list is
    stored in self._parsed_objs (keyed by stream.objid) when self.caching
    is set, and reused on later calls.  The returned item is taken at
    position N*2+index, where N is the stream's 'N' entry (treated as 0
    if missing and STRICT is off).
    NOTE(review): the N*2 leading items are presumably the objid/offset
    header pairs of the object stream -- confirm against the PDF spec.

    `objid` is accepted but not used here.

    Raises PDFSyntaxError when the position is past the end of the parsed
    list, or (only if STRICT) when 'Type' is not LITERAL_OBJSTM or 'N' is
    missing.
    """
    # Validation failures are only fatal in STRICT mode.
    if stream.get('Type') is not LITERAL_OBJSTM and STRICT:
        raise PDFSyntaxError('Not a stream object: %r' % stream)
    try:
        count = stream['N']
    except KeyError:
        if STRICT:
            raise PDFSyntaxError('N is not defined: %r' % stream)
        count = 0
    if stream.objid not in self._parsed_objs:
        stream_parser = PDFStreamParser(stream.get_data())
        stream_parser.set_document(self)
        parsed = []
        try:
            # Read objects until the parser signals end of data.
            while True:
                (_pos, item) = stream_parser.nextobject()
                parsed.append(item)
        except PSEOF:
            pass
        if self.caching:
            self._parsed_objs[stream.objid] = parsed
    else:
        parsed = self._parsed_objs[stream.objid]
    slot = count*2+index
    try:
        return parsed[slot]
    except IndexError:
        raise PDFSyntaxError('index too big: %r' % index)
KEYWORD_OBJ = KWD('obj') KEYWORD_OBJ = KWD('obj')
def _getobj_parse(self, pos, objid):
    """Parse and return the object whose definition starts at offset `pos`.

    Seeks self._parser to `pos`, then expects three tokens: an object id
    equal to `objid`, a generation number (read and discarded), and the
    'obj' keyword (self.KEYWORD_OBJ).  The object that follows is returned.

    Raises PDFSyntaxError if the object id differs from `objid` or the
    third token is not the 'obj' keyword.
    """
    parser = self._parser
    parser.seek(pos)
    (_, found_objid) = parser.nexttoken()  # objid
    if found_objid != objid:
        raise PDFSyntaxError('objid mismatch: %r=%r' % (found_objid, objid))
    (_, _genno) = parser.nexttoken()  # genno (not used here)
    (_, keyword) = parser.nexttoken()
    if keyword is not self.KEYWORD_OBJ:
        raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
    (_, parsed_obj) = parser.nextobject()
    return parsed_obj
# can raise PDFObjectNotFound # can raise PDFObjectNotFound
def getobj(self, objid): def getobj(self, objid):
if not self.xrefs: if not self.xrefs:
@ -417,78 +461,30 @@ class PDFDocument(object):
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'getobj: objid=%r' % (objid) print >>sys.stderr, 'getobj: objid=%r' % (objid)
if objid in self._cached_objs: if objid in self._cached_objs:
genno = 0 (obj, genno) = self._cached_objs[objid]
obj = self._cached_objs[objid]
else: else:
for xref in self.xrefs: for xref in self.xrefs:
try: try:
(strmid, index) = xref.get_pos(objid) (strmid, index, genno) = xref.get_pos(objid)
break
except KeyError: except KeyError:
pass continue
else:
raise PDFObjectNotFound(objid)
if strmid:
stream = stream_value(self.getobj(strmid))
if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try: try:
n = stream['N'] if strmid is not None:
except KeyError: stream = stream_value(self.getobj(strmid))
if STRICT: obj = self._getobj_objstm(stream, index, objid)
raise PDFSyntaxError('N is not defined: %r' % stream) else:
n = 0 obj = self._getobj_parse(index, objid)
if strmid in self._parsed_objs:
objs = self._parsed_objs[strmid]
else:
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
if self.caching:
self._parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFObjectNotFound(objid)
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else:
self._parser.seek(index)
try:
(_,objid1) = self._parser.nexttoken() # objid
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
#assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self._parser.nextobject()
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, genno) obj.set_objid(objid, genno)
except PSEOF: break
raise PDFObjectNotFound(objid) except (PSEOF, PDFSyntaxError):
continue
else:
raise PDFObjectNotFound(objid)
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching: if self.caching:
self._cached_objs[objid] = obj self._cached_objs[objid] = (obj, genno)
if self.decipher: if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj