basic encryption support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@19 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
5c1aa960f5
commit
1300046181
|
@ -0,0 +1,9 @@
|
|||
TODO:
|
||||
- Code Documentation.
|
||||
- Error handling for invalid type.
|
||||
|
||||
- Outlines.
|
||||
- Named Objects. (pages)
|
||||
- Writers.
|
||||
- Linearized PDF.
|
||||
- Encryption?
|
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Arcfour implementation
|
||||
# * public domain *
|
||||
#
|
||||
|
||||
class Arcfour:
    """Arcfour (RC4) stream cipher.

    The key schedule runs once in the constructor.  process() XORs data
    with the keystream, so the same call both encrypts and decrypts.
    The keystream position is kept between calls: feeding the data in
    several pieces gives the same output as feeding it in one piece.
    """

    def __init__(self, key):
        """Build the initial 256-entry state table from key.

        key is a non-empty string; each character is used as a byte
        value via ord().  An empty key fails with ZeroDivisionError in
        the key schedule.
        """
        # list() guarantees a mutable table even where range() does not
        # return a list.
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + ord(key[i % klen])) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """Return data XORed with the next len(data) keystream bytes.

        The state table and both indices are advanced, so the next call
        continues the keystream where this one stopped.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect the output characters and join once at the end;
        # growing a string with += inside the loop is quadratic.
        out = []
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            out.append(chr(ord(c) ^ k))
        (self.i, self.j) = (i, j)
        return ''.join(out)
|
||||
|
||||
if __name__ == '__main__':
    # Self-test: encrypt known plaintexts and compare the upper-case
    # hex of the result with the expected ciphertext.
    def doit(key, data):
        """Encrypt data with key and return the ciphertext as hex."""
        encrypted = Arcfour(key).process(data)
        return ''.join([ '%02X' % ord(ch) for ch in encrypted ])
    vectors = (
        ("Key", "Plaintext", 'BBF316E8D940AF0AD3'),
        ("Wiki", "pedia", '1021BF0420'),
        ("Secret", "Attack at dawn", '45A01F645FC35B383552544B9BF5'),
        )
    for (key, plaintext, expected) in vectors:
        assert doit(key, plaintext) == expected
    print('test succeeded')
|
28
extent.py
28
extent.py
|
@ -20,6 +20,9 @@ class Rect:
|
|||
self.y1 = y0+h
|
||||
return
|
||||
|
||||
def __repr__(self):
    """Return '<Rect: (x0,y0)-(x1,y1)>' with the corners formatted as integers."""
    corners = (self.x0, self.y0, self.x1, self.y1)
    return '<Rect: (%d,%d)-(%d,%d)>' % corners
|
||||
|
||||
def overlap(self, rect):
    """Return True when this rectangle and rect overlap.

    Rectangles that only touch along an edge (one ends exactly where
    the other starts) are reported as not overlapping.
    """
    apart_x = (rect.x1 <= self.x0 or self.x1 <= rect.x0)
    apart_y = (rect.y1 <= self.y0 or self.y1 <= rect.y0)
    return not (apart_x or apart_y)
|
||||
|
@ -31,7 +34,7 @@ class ExtSet:
|
|||
|
||||
def __init__(self, gridsize):
|
||||
self.gridsize = gridsize
|
||||
self.grid = []
|
||||
self.grid = {}
|
||||
return
|
||||
|
||||
def cells(self, x0, x1):
|
||||
|
@ -45,12 +48,18 @@ class ExtSet:
|
|||
|
||||
def add(self, x0, x1, obj):
|
||||
for i in self.cells(x0, x1):
|
||||
self.grid[i].append(obj)
|
||||
if i not in self.grid:
|
||||
a = []
|
||||
self.grid[i] = a
|
||||
else:
|
||||
a = self.grid[i]
|
||||
a.append(obj)
|
||||
return
|
||||
|
||||
def get(self, x0, x1):
    """Return the set of objects stored in the grid cells for x0..x1.

    The cells are those yielded by self.cells(x0, x1); cells that have
    no entry in the grid contribute nothing.
    """
    found = set()
    occupied = [ cell for cell in self.cells(x0, x1) if cell in self.grid ]
    for cell in occupied:
        found.update(self.grid[cell])
    return found
|
||||
|
||||
|
@ -78,12 +87,13 @@ class ExtGrid:
|
|||
self.vext = ExtSet(gridsize)
|
||||
return
|
||||
|
||||
def add(self, rect):
|
||||
self.hext.add(rect.x0, rect.x1, rect)
|
||||
self.vext.add(rect.y0, rect.y1, rect)
|
||||
def add(self, rect, obj):
|
||||
self.hext.add(rect.x0, rect.x1, obj)
|
||||
self.vext.add(rect.y0, rect.y1, obj)
|
||||
return
|
||||
|
||||
def get(self, rect):
|
||||
rects = self.hext.get(rect.x0, rect.x1)
|
||||
rects.update_intersect(self.vext.get(rect.y0, rect.y1))
|
||||
return rects
|
||||
def get(self, rect, getrect):
|
||||
objs = self.hext.get(rect.x0, rect.x1)
|
||||
objs.intersection_update(self.vext.get(rect.y0, rect.y1))
|
||||
objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ]
|
||||
return objs
|
||||
|
|
171
pdf2txt.py
171
pdf2txt.py
|
@ -7,86 +7,183 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
|
|||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||
mult_matrix, apply_matrix
|
||||
from cmap import CMapDB
|
||||
from extent import Rect, ExtSet, ExtGrid
|
||||
|
||||
|
||||
## PageItem
|
||||
##
|
||||
class PageItem:
    """Container for the objects placed on one page.

    Keeps the page id, bounding box and rotation, the added objects in
    insertion order (self.objs), and an ExtGrid holding the same objects
    under their bounding boxes (self.grid).
    """

    # Grid size handed to the ExtGrid created for every item.
    GRID_SIZE = 20

    def __init__(self, id, bbox, rotate=0):
        """Create an empty item.

        bbox is a 4-element sequence (x0, y0, x1, y1).  It is stored as
        a Rect built from the (x0, y0) corner plus width and height.
        """
        # Unpack in the body rather than in the signature: tuple
        # parameters are not valid syntax in Python 3 (PEP 3113).
        (x0, y0, x1, y1) = bbox
        self.id = id
        self.bbox = Rect(x0, y0, x1-x0, y1-y0)
        self.rotate = rotate
        self.grid = ExtGrid(self.GRID_SIZE)
        self.objs = []
        return

    def __repr__(self):
        """Return the opening <page ...> tag describing this item."""
        bbox = self.bbox
        return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
                (self.id, bbox.x0, bbox.y0, bbox.x1, bbox.y1, self.rotate))

    def add(self, obj):
        """Append obj and register it in the grid under obj.bbox."""
        self.objs.append(obj)
        self.grid.add(obj.bbox, obj)
        return

    def dump(self, outfp, codec):
        """Write the opening tag, every contained object, then '</page>'."""
        outfp.write(repr(self)+'\n')
        for obj in self.objs:
            obj.dump(outfp, codec)
        outfp.write('</page>\n')
        return

    def fuse(self):
        """Look up the neighbors of every object via its search ranges.

        NOTE: the neighbor lists are computed but not used yet, so
        calling this does not change the item.
        """
        # The grid stores arbitrary objects; this tells it how to get
        # the rectangle of each one.  It does not depend on the loop.
        getbbox = (lambda obj: obj.bbox)
        for obj1 in self.objs:
            for rect in obj1.search_range():
                neighbors = [ obj2 for obj2 in self.grid.get(rect, getbbox)
                              if obj2 is not obj1 ]
        return
|
||||
|
||||
|
||||
## FigureItem
|
||||
##
|
||||
class FigureItem(PageItem):
    """A PageItem that is written out as a <figure> element."""

    def __repr__(self):
        """Return the opening <figure ...> tag describing this item."""
        box = self.bbox
        fields = (self.id, box.x0, box.y0, box.x1, box.y1)
        return '<figure id=%r bbox="%d,%d,%d,%d">' % fields

    def dump(self, outfp, codec):
        """Write the opening tag, every contained object, then '</figure>'."""
        opening = repr(self) + '\n'
        outfp.write(opening)
        for child in self.objs:
            child.dump(outfp, codec)
        outfp.write('</figure>\n')
        return

    def search_range(self):
        """Return no search rectangles for a figure."""
        return []
|
||||
|
||||
|
||||
## TextItem
|
||||
##
|
||||
class TextItem:
    """A run of text with its font, transformed size and bounding box.

    direction is 1 when the font is not vertical and 2 when it is.
    """

    def __init__(self, matrix, font, size, width, text):
        """Compute the transformed width/size and the bounding box.

        matrix is a 6-tuple (a, b, c, d, tx, ty); text is a sequence of
        (displacement, character) pairs whose characters are joined
        into self.text.
        """
        self.matrix = matrix
        self.font = font
        (a, b, c, d, tx, ty) = self.matrix
        # The matrix with its translation part zeroed, used to
        # transform lengths rather than positions.
        linear = (a, b, c, d, 0, 0)
        (w, self.size) = apply_matrix(linear, (width, size))
        self.width = abs(w)
        self.origin = (tx, ty)
        self.direction = 0
        if self.font.is_vertical():
            self.direction = 2
            mindisp = min( disp for (disp, _) in text )
            (mindisp, _) = apply_matrix(linear, (mindisp*size*0.001, 0))
            self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
        else:
            self.direction = 1
            # ascent is computed but not used for the box below.
            (_, ascent) = apply_matrix(linear, (0, font.ascent*size*0.001))
            (_, descent) = apply_matrix(linear, (0, font.descent*size*0.001))
            self.bbox = Rect(tx, ty+descent, self.width, self.size)
        self.text = ''.join( ch for (_, ch) in text )
        return

    def __repr__(self):
        """Return a debug representation listing the main attributes."""
        fields = (self.matrix, self.font, self.size, self.width, self.text)
        return '<text matrix=%r font=%r size=%r width=%r text=%r>' % fields

    def dump(self, outfp, codec):
        """Write this item as a <text> element, encoding the text with codec."""
        (_, _, _, _, tx, ty) = self.matrix
        attrs = (tx, ty, self.font.fontname, self.size, self.width)
        outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' % attrs)
        outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
        outfp.write('</text>\n')
        return

    def search_range(self):
        """Return a one-element list with the size-by-size Rect to search.

        For direction 1 the Rect starts at the right edge (x1) of the
        bounding box; otherwise it starts one size below y0.
        """
        box = self.bbox
        if self.direction != 1:
            return [ Rect(box.x0, box.y0-self.size, self.size, self.size) ]
        return [ Rect(box.x1, box.y0, self.size, self.size) ]
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
class TextConverter(PDFDevice):
|
||||
|
||||
def __init__(self, outfp, rsrc, codec, debug=0):
|
||||
def __init__(self, rsrc, debug=0):
|
||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
||||
def close(self):
|
||||
self.outfp.write('\n')
|
||||
self.pages = []
|
||||
self.stack = []
|
||||
return
|
||||
|
||||
def begin_page(self, page):
|
||||
(x0,y0,x1,y1) = page.mediabox
|
||||
self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
|
||||
(page.pageid, x0,y0,x1,y1, page.rotate))
|
||||
self.context = PageItem(str(page.pageid), page.mediabox, page.rotate)
|
||||
return
|
||||
def end_page(self, _):
|
||||
self.outfp.write('</page>\n')
|
||||
assert not self.stack
|
||||
self.pages.append(self.context)
|
||||
return
|
||||
|
||||
def begin_figure(self, name, bbox):
|
||||
(x0,y0,x1,y1) = bbox
|
||||
self.outfp.write('<figure name="%s" bbox="%d,%d,%d,%d">\n' %
|
||||
(name, x0,y0,x1,y1))
|
||||
self.stack.append(self.context)
|
||||
self.context = FigureItem(name, bbox)
|
||||
return
|
||||
def end_figure(self, _):
|
||||
self.outfp.write('</figure>\n')
|
||||
fig = self.context
|
||||
self.context = self.stack.pop()
|
||||
self.context.add(fig)
|
||||
return
|
||||
|
||||
def handle_undefined_char(self, cidcoding, cid):
|
||||
if self.debug:
|
||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
#return unichr(cid)
|
||||
#return unichr(cid+32)
|
||||
return
|
||||
return None
|
||||
|
||||
def render_string(self, textstate, textmatrix, size, seq):
|
||||
font = textstate.font
|
||||
spwidth = int(-font.char_width(32) * 0.6) # space width
|
||||
buf = ''
|
||||
text = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
if not font.is_vertical() and x <= spwidth:
|
||||
buf += ' '
|
||||
text.append((0, ' '))
|
||||
else:
|
||||
chars = font.decode(x)
|
||||
for cid in chars:
|
||||
try:
|
||||
char = font.to_unicode(cid)
|
||||
buf += char
|
||||
text.append((font.char_disp(cid), char))
|
||||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
s = self.handle_undefined_char(cidcoding, cid)
|
||||
if s:
|
||||
buf += s
|
||||
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
|
||||
if font.is_vertical():
|
||||
size = -size
|
||||
tag = 'vtext'
|
||||
else:
|
||||
tag = 'htext'
|
||||
if (b != 0 or c != 0 or a <= 0 or d <= 0):
|
||||
tag += ' skewed'
|
||||
s = buf.encode(self.codec, 'xmlcharrefreplace')
|
||||
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
|
||||
def f(x): return '%.03f' % x
|
||||
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
|
||||
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
|
||||
text.append(s)
|
||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||
font, textstate.fontsize, size, text)
|
||||
self.context.add(item)
|
||||
return
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
outfp.write('<document>\n')
|
||||
for page in self.pages:
|
||||
#page.fuse()
|
||||
page.dump(outfp, codec)
|
||||
outfp.write('</document>\n')
|
||||
return
|
||||
|
||||
|
||||
# pdf2txt
|
||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||
device = TextConverter(outfp, rsrc, codec, debug=debug)
|
||||
outfp.write('<document>\n')
|
||||
device = TextConverter(rsrc, debug=debug)
|
||||
doc = PDFDocument(debug=debug)
|
||||
fp = file(fname)
|
||||
parser = PDFParser(doc, fp, debug=debug)
|
||||
|
@ -95,7 +192,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
|||
if pages and (i not in pages): continue
|
||||
interpreter.process_page(page)
|
||||
fp.close()
|
||||
outfp.write('</document>\n')
|
||||
device.dump(outfp, codec)
|
||||
device.close()
|
||||
return
|
||||
|
||||
|
|
129
pdfparser.py
129
pdfparser.py
|
@ -4,30 +4,30 @@
|
|||
# ver 0.1, Dec 24 2004-
|
||||
# ver 0.2, Dec 24 2007
|
||||
|
||||
# TODO:
|
||||
# - Code Documentation.
|
||||
# - Error handling for invalid type.
|
||||
|
||||
# - Outlines.
|
||||
# - Named Objects. (pages)
|
||||
# - Writers.
|
||||
# - Linearized PDF.
|
||||
# - Encryption?
|
||||
|
||||
import sys
|
||||
import md5, struct
|
||||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack
|
||||
from arcfour import Arcfour
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, \
|
||||
PSStackParser, STRICT
|
||||
|
||||
|
||||
def decrypt_rc4(key, objid, genno, data):
    '''
    Decrypt DATA belonging to the object (OBJID, GENNO) with RC4.
    The per-object key is the MD5 digest of KEY followed by the low
    three bytes of OBJID and the low two bytes of GENNO, cut to the
    length of that salted key but to no more than 16 bytes.
    '''
    objid_bytes = struct.pack('<L', objid)[:3]
    genno_bytes = struct.pack('<L', genno)[:2]
    salted = key + objid_bytes + genno_bytes
    digest = md5.md5(salted).digest()
    objkey = digest[:min(len(salted), 16)]
    return Arcfour(objkey).process(data)
|
||||
|
||||
|
||||
## PDF Exceptions
|
||||
##
|
||||
class PDFException(PSException): pass
|
||||
class PDFSyntaxError(PDFException): pass
|
||||
class PDFEncrypted(PDFException): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
|
||||
|
@ -38,6 +38,7 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
|
|||
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
||||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
|
@ -45,6 +46,7 @@ KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
|||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
|
@ -77,7 +79,7 @@ def resolve1(x):
|
|||
x = x.resolve()
|
||||
return x
|
||||
|
||||
def resolveall(x):
|
||||
def resolve_all(x):
|
||||
'''
|
||||
Recursively resolve X and all the internals.
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
|
@ -86,10 +88,23 @@ def resolveall(x):
|
|||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
if isinstance(x, list):
|
||||
x = [ resolveall(v) for v in x ]
|
||||
x = [ resolve_all(v) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = resolveall(v)
|
||||
x[k] = resolve_all(v)
|
||||
return x
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.
    A string is replaced by decipher(objid, genno, string).
    A list is rebuilt as a new list of deciphered elements.
    A dict has its values deciphered in place and is returned itself.
    Anything else is returned unchanged.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [ decipher_all(decipher, objid, genno, item) for item in x ]
    if isinstance(x, dict):
        # Only existing keys are reassigned, so the key set is stable.
        for key in list(x):
            x[key] = decipher_all(decipher, objid, genno, x[key])
    return x
|
||||
|
||||
# Type checking
|
||||
|
@ -159,6 +174,13 @@ class PDFStream:
|
|||
self.rawdata = rawdata
|
||||
self.decipher = decipher
|
||||
self.data = None
|
||||
self.objid = None
|
||||
self.genno = None
|
||||
return
|
||||
|
||||
def set_objid(self, objid, genno):
    """Remember the object id and generation number of this stream."""
    (self.objid, self.genno) = (objid, genno)
    return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -168,7 +190,7 @@ class PDFStream:
|
|||
assert self.data == None and self.rawdata != None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
data = self.decipher(data)
|
||||
data = self.decipher(self.objid, self.genno, data)
|
||||
if 'Filter' not in self.dic:
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
|
@ -201,6 +223,8 @@ class PDFStream:
|
|||
buf += ent1
|
||||
ent0 = ent1
|
||||
data = buf
|
||||
if f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError
|
||||
else:
|
||||
if STRICT:
|
||||
raise PDFValueError('Invalid filter spec: %r' % f)
|
||||
|
@ -338,10 +362,11 @@ class PDFDocument:
|
|||
self.xrefs = []
|
||||
self.objs = {}
|
||||
self.parsed_objs = {}
|
||||
self.decipher = None
|
||||
self.root = None
|
||||
self.catalog = None
|
||||
self.parser = None
|
||||
self.encryption = None
|
||||
self.decipher = None
|
||||
return
|
||||
|
||||
def set_parser(self, parser):
|
||||
|
@ -351,20 +376,74 @@ class PDFDocument:
|
|||
for xref in self.xrefs:
|
||||
trailer = xref.trailer
|
||||
if 'Encrypt' in trailer:
|
||||
raise PDFEncrypted
|
||||
param = dict_value(trailer['Encrypt'])
|
||||
self.decipher = DECRYPTOR(param)
|
||||
self.parser.strfilter = self.decipher
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
if 'Root' in trailer:
|
||||
self.set_root(dict_value(trailer['Root']))
|
||||
break
|
||||
else:
|
||||
raise PDFValueError('no /Root object!')
|
||||
if self.encryption:
|
||||
self.prepare_cipher()
|
||||
return
|
||||
|
||||
def prepare_cipher(self, password=''):
    """Check password and install the RC4 decipher callback.

    Reads the (document ID array, /Encrypt dictionary) pair kept in
    self.encryption, derives the encryption key (PDF Reference,
    Algorithm 3.2) and verifies it against the /U entry (Algorithm 3.4
    for revision 2, Algorithm 3.5 for revision 3).  On success
    self.decipher is set to a callable taking (objid, genno, data).

    Raises PDFEncryptionError for an unsupported filter, algorithm
    version or revision, NotImplementedError for revision 4, and
    PDFPasswordIncorrect when the password does not match.
    """
    (docid, param) = self.encryption
    if literal_name(param['Filter']) != 'Standard':
        raise PDFEncryptionError('unknown filter: param=%r' % param)
    V = int_value(param.get('V', 0))
    if not (V == 1 or V == 2):
        raise PDFEncryptionError('unknown algorithm: param=%r' % param)
    length = int_value(param.get('Length', 40)) # Key length (bits)
    O = str_value(param['O'])
    R = int_value(param['R']) # Revision
    if 5 <= R:
        raise PDFEncryptionError('unknown revision: %r' % R)
    U = str_value(param['U'])
    # P carries the permission flags (values 4, 8 and 16 are the
    # print, modify and extract bits); they are not enforced here.
    P = int_value(param['P'])
    keylen = length // 8 # Key length (bytes)
    # Algorithm 3.2
    padded = (password+PASSWORD_PADDING)[:32] # 1
    md = md5.md5(padded) # 2
    md.update(O) # 3
    # P is a signed value and usually negative; mask it so that it is
    # packed as the unsigned 32-bit integer the algorithm asks for.
    md.update(struct.pack('<L', P & 0xffffffff)) # 4
    md.update(docid[0]) # 5
    if 4 <= R:
        raise NotImplementedError # 6
    if 3 <= R:
        # 8
        for _ in range(50):
            md = md5.md5(md.digest()[:keylen])
    key = md.digest()[:keylen]
    if R == 2:
        # Algorithm 3.4: encrypt the fixed padding string itself, not
        # the padded password.
        u1 = Arcfour(key).process(PASSWORD_PADDING)
        is_authenticated = (u1 == U)
    elif R == 3:
        # Algorithm 3.5
        md = md5.md5(PASSWORD_PADDING) # 2
        md.update(docid[0]) # 3
        x = Arcfour(key).process(md.digest()[:16]) # 4
        for i in range(1, 19+1):
            # key is a string: convert each character to its byte
            # value before XORing with the round number.
            k = ''.join( chr(ord(c) ^ i) for c in key )
            x = Arcfour(k).process(x)
        # Only the first 16 bytes of /U are significant for revision 3.
        is_authenticated = (x[:16] == U[:16])
    else:
        raise PDFEncryptionError('unknown revision: %r' % R)
    if not is_authenticated:
        raise PDFPasswordIncorrect
    self.decipher = (lambda objid,genno,data: decrypt_rc4(key, objid, genno, data))
    return
|
||||
|
||||
def getobj(self, objid):
|
||||
#assert self.xrefs
|
||||
if objid in self.objs:
|
||||
genno = 0
|
||||
obj = self.objs[objid]
|
||||
else:
|
||||
for xref in self.xrefs:
|
||||
|
@ -400,18 +479,26 @@ class PDFDocument:
|
|||
except PSEOF:
|
||||
pass
|
||||
self.parsed_objs[stream] = objs
|
||||
genno = 0
|
||||
obj = objs[stream.dic['N']*2+index]
|
||||
if isinstance(obj, PDFStream):
|
||||
obj.set_objid(objid, 0)
|
||||
else:
|
||||
self.parser.seek(index)
|
||||
(_,objid1) = self.parser.nextobject() # objid
|
||||
(_,genno1) = self.parser.nextobject() # genno
|
||||
(_,genno) = self.parser.nextobject() # genno
|
||||
assert objid1 == objid
|
||||
(_,kwd) = self.parser.nextobject()
|
||||
if kwd != KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||
(_,obj) = self.parser.nextobject()
|
||||
if isinstance(obj, PDFStream):
|
||||
obj.set_objid(objid, genno)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||
self.objs[objid] = obj
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
return obj
|
||||
|
||||
def get_pages(self, debug=0):
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue