bug fixes
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@20 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
1300046181
commit
b1163b69bb
51
pdf2txt.py
51
pdf2txt.py
|
@ -7,31 +7,26 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||||
mult_matrix, apply_matrix
|
mult_matrix, apply_matrix
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
from extent import Rect, ExtSet, ExtGrid
|
|
||||||
|
|
||||||
|
|
||||||
## PageItem
|
## PageItem
|
||||||
##
|
##
|
||||||
class PageItem:
|
class PageItem:
|
||||||
|
|
||||||
GRID_SIZE = 20
|
|
||||||
|
|
||||||
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
|
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.bbox = Rect(x0, y0, x1-x0, y1-y0)
|
self.bbox = (x0, y0, x1-x0, y1-y0)
|
||||||
self.rotate = rotate
|
self.rotate = rotate
|
||||||
self.grid = ExtGrid(self.GRID_SIZE)
|
|
||||||
self.objs = []
|
self.objs = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
bbox = self.bbox
|
bbox = '%d,%d,%d,%d' % self.bbox
|
||||||
return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
|
return ('<page id=%r bbox="%s" rotate="%d">' %
|
||||||
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate))
|
(self.id, bbox, self.rotate))
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
self.objs.append(obj)
|
self.objs.append(obj)
|
||||||
self.grid.add(obj.bbox, obj)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
|
@ -41,23 +36,14 @@ class PageItem:
|
||||||
outfp.write('</page>\n')
|
outfp.write('</page>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def fuse(self):
|
|
||||||
for obj1 in self.objs:
|
|
||||||
f = (lambda obj: obj.bbox)
|
|
||||||
for rect in obj1.search_range():
|
|
||||||
neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ]
|
|
||||||
#print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ]
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## FigureItem
|
## FigureItem
|
||||||
##
|
##
|
||||||
class FigureItem(PageItem):
|
class FigureItem(PageItem):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
bbox = self.bbox
|
bbox = '%d,%d,%d,%d' % self.bbox
|
||||||
return ('<figure id=%r bbox="%d,%d,%d,%d">' %
|
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
|
||||||
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1))
|
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
outfp.write(repr(self)+'\n')
|
outfp.write(repr(self)+'\n')
|
||||||
|
@ -66,9 +52,6 @@ class FigureItem(PageItem):
|
||||||
outfp.write('</figure>\n')
|
outfp.write('</figure>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def search_range(self):
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
## TextItem
|
## TextItem
|
||||||
##
|
##
|
||||||
|
@ -86,12 +69,12 @@ class TextItem:
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
|
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
|
||||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
|
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
|
||||||
self.bbox = Rect(tx, ty+descent, self.width, self.size)
|
self.bbox = (tx, ty+descent, self.width, self.size)
|
||||||
else:
|
else:
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
mindisp = min( d for (d,_) in text )
|
mindisp = min( d for (d,_) in text )
|
||||||
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
|
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
|
||||||
self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
|
self.bbox = (tx-mindisp, ty+self.width, self.size, self.width)
|
||||||
self.text = ''.join( c for (_,c) in text )
|
self.text = ''.join( c for (_,c) in text )
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -107,12 +90,6 @@ class TextItem:
|
||||||
outfp.write('</text>\n')
|
outfp.write('</text>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def search_range(self):
|
|
||||||
if self.direction == 1:
|
|
||||||
return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ]
|
|
||||||
else:
|
|
||||||
return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ]
|
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## TextConverter
|
||||||
##
|
##
|
||||||
|
@ -120,6 +97,10 @@ class TextConverter(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, debug=0):
|
def __init__(self, rsrc, debug=0):
|
||||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||||
|
self.reset()
|
||||||
|
return
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
self.pages = []
|
self.pages = []
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
@ -173,11 +154,8 @@ class TextConverter(PDFDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
outfp.write('<document>\n')
|
|
||||||
for page in self.pages:
|
for page in self.pages:
|
||||||
#page.fuse()
|
|
||||||
page.dump(outfp, codec)
|
page.dump(outfp, codec)
|
||||||
outfp.write('</document>\n')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -188,12 +166,15 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
||||||
|
outfp.write('<document>\n')
|
||||||
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
||||||
if pages and (i not in pages): continue
|
if pages and (i not in pages): continue
|
||||||
|
device.reset()
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
device.dump(outfp, codec)
|
||||||
fp.close()
|
fp.close()
|
||||||
device.dump(outfp, codec)
|
|
||||||
device.close()
|
device.close()
|
||||||
|
outfp.write('</document>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
25
pdfinterp.py
25
pdfinterp.py
|
@ -1,5 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys, re
|
import sys
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
try:
|
try:
|
||||||
|
@ -292,8 +292,18 @@ class PDFCIDFont(PDFFont):
|
||||||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||||
self.cidsysteminfo.get('Ordering', 'unknown'))
|
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||||
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
|
try:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('cmap is missing')
|
||||||
|
self.cmap = None
|
||||||
|
try:
|
||||||
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('FontDescriptor is missing')
|
||||||
|
descriptor = {}
|
||||||
ttf = None
|
ttf = None
|
||||||
if 'FontFile2' in descriptor:
|
if 'FontFile2' in descriptor:
|
||||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||||
|
@ -486,9 +496,6 @@ class PDFContentParser(PSStackParser):
|
||||||
PSStackParser.__init__(self, None, debug=debug)
|
PSStackParser.__init__(self, None, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFParser: linepos=%d>' % self.linepos
|
|
||||||
|
|
||||||
def fillfp(self):
|
def fillfp(self):
|
||||||
if not self.fp:
|
if not self.fp:
|
||||||
if self.istream < len(self.streams):
|
if self.istream < len(self.streams):
|
||||||
|
@ -611,9 +618,9 @@ class PDFPageInterpreter:
|
||||||
name = literal_name(spec[0])
|
name = literal_name(spec[0])
|
||||||
else:
|
else:
|
||||||
name = literal_name(spec)
|
name = literal_name(spec)
|
||||||
if name == 'ICCBased':
|
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||||
elif name == 'DeviceN':
|
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return ColorSpace(name, len(list_value(spec[1])))
|
return ColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
return PREDEFINED_COLORSPACE[name]
|
return PREDEFINED_COLORSPACE[name]
|
||||||
|
@ -935,7 +942,7 @@ class PDFPageInterpreter:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||||
return
|
return
|
||||||
if xobj.dic['Subtype'] == LITERAL_FORM:
|
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
||||||
|
|
120
pdfparser.py
120
pdfparser.py
|
@ -30,6 +30,7 @@ class PDFEncryptionError(PDFException): pass
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
class PDFTypeError(PDFException): pass
|
class PDFTypeError(PDFException): pass
|
||||||
class PDFValueError(PDFException): pass
|
class PDFValueError(PDFException): pass
|
||||||
|
class PDFNotImplementedError(PSException): pass
|
||||||
|
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
|
@ -40,11 +41,13 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||||
|
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
|
||||||
KEYWORD_R = PSKeywordTable.intern('R')
|
KEYWORD_R = PSKeywordTable.intern('R')
|
||||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||||
|
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
|
|
||||||
|
@ -184,12 +187,13 @@ class PDFStream:
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFStream: %r>' % (self.dic)
|
return '<PDFStream(%d): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||||
|
|
||||||
def decode(self):
|
def decode(self):
|
||||||
assert self.data == None and self.rawdata != None
|
assert self.data == None and self.rawdata != None
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
|
# Handle encryption
|
||||||
data = self.decipher(self.objid, self.genno, data)
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
if 'Filter' not in self.dic:
|
if 'Filter' not in self.dic:
|
||||||
self.data = data
|
self.data = data
|
||||||
|
@ -203,31 +207,32 @@ class PDFStream:
|
||||||
import zlib
|
import zlib
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
data = zlib.decompress(data)
|
data = zlib.decompress(data)
|
||||||
# apply predictors
|
elif f == LITERAL_LZW_DECODE:
|
||||||
params = self.dic.get('DecodeParms', {})
|
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
|
||||||
if 'Predictor' in params:
|
elif f == LITERAL_CRYPT:
|
||||||
pred = int_value(params['Predictor'])
|
|
||||||
if pred:
|
|
||||||
if pred != 12:
|
|
||||||
raise PDFValueError('Unsupported predictor: %r' % pred)
|
|
||||||
if 'Columns' not in params:
|
|
||||||
raise PDFValueError('Columns undefined for predictor=12')
|
|
||||||
columns = int_value(params['Columns'])
|
|
||||||
buf = ''
|
|
||||||
ent0 = '\x00' * columns
|
|
||||||
for i in xrange(0, len(data), columns+1):
|
|
||||||
pred = data[i]
|
|
||||||
ent1 = data[i+1:i+1+columns]
|
|
||||||
if pred == '\x02':
|
|
||||||
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
|
||||||
buf += ent1
|
|
||||||
ent0 = ent1
|
|
||||||
data = buf
|
|
||||||
if f == LITERAL_CRYPT:
|
|
||||||
raise PDFEncryptionError
|
raise PDFEncryptionError
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||||
raise PDFValueError('Invalid filter spec: %r' % f)
|
# apply predictors
|
||||||
|
params = self.dic.get('DecodeParms', {})
|
||||||
|
if 'Predictor' in params:
|
||||||
|
pred = int_value(params['Predictor'])
|
||||||
|
if pred:
|
||||||
|
if pred != 12:
|
||||||
|
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||||
|
if 'Columns' not in params:
|
||||||
|
raise PDFValueError('Columns undefined for predictor=12')
|
||||||
|
columns = int_value(params['Columns'])
|
||||||
|
buf = ''
|
||||||
|
ent0 = '\x00' * columns
|
||||||
|
for i in xrange(0, len(data), columns+1):
|
||||||
|
pred = data[i]
|
||||||
|
ent1 = data[i+1:i+1+columns]
|
||||||
|
if pred == '\x02':
|
||||||
|
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
||||||
|
buf += ent1
|
||||||
|
ent0 = ent1
|
||||||
|
data = buf
|
||||||
self.data = data
|
self.data = data
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
return
|
return
|
||||||
|
@ -274,18 +279,19 @@ class PDFXRef:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
(_, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||||
break
|
break
|
||||||
line = line.strip()
|
if line.startswith('trailer'):
|
||||||
f = line.split(' ')
|
parser.seek(pos)
|
||||||
if len(f) != 2:
|
|
||||||
if line != 'trailer':
|
|
||||||
if STRICT:
|
|
||||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
|
||||||
break
|
break
|
||||||
|
f = line.strip().split(' ')
|
||||||
|
if len(f) != 2:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
||||||
|
continue
|
||||||
(start, nobjs) = map(long, f)
|
(start, nobjs) = map(long, f)
|
||||||
self.objid0 = start
|
self.objid0 = start
|
||||||
self.objid1 = start+nobjs
|
self.objid1 = start+nobjs
|
||||||
|
@ -300,7 +306,9 @@ class PDFXRef:
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
self.offsets.append((int(genno), long(pos), use))
|
self.offsets.append((int(genno), long(pos), use))
|
||||||
# read trailer
|
# read trailer
|
||||||
(_, dic) = parser.nextobject()
|
(_,kwd) = parser.nexttoken()
|
||||||
|
assert kwd == KEYWORD_TRAILER
|
||||||
|
(_,dic) = parser.nextobject()
|
||||||
self.trailer = dict_value(dic)
|
self.trailer = dict_value(dic)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -319,9 +327,9 @@ class PDFXRef:
|
||||||
class PDFXRefStream:
|
class PDFXRefStream:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
(_,objid) = parser.nextobject()
|
(_,objid) = parser.nexttoken()
|
||||||
(_,genno) = parser.nextobject()
|
(_,genno) = parser.nexttoken()
|
||||||
parser.nextobject()
|
(_,kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_,stream) = parser.nextobject()
|
||||||
if STRICT:
|
if STRICT:
|
||||||
if stream.dic['Type'] != LITERAL_XREF:
|
if stream.dic['Type'] != LITERAL_XREF:
|
||||||
|
@ -367,6 +375,7 @@ class PDFDocument:
|
||||||
self.parser = None
|
self.parser = None
|
||||||
self.encryption = None
|
self.encryption = None
|
||||||
self.decipher = None
|
self.decipher = None
|
||||||
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser):
|
||||||
|
@ -401,9 +410,9 @@ class PDFDocument:
|
||||||
raise PDFEncryptionError('unknown revision: %r' % R)
|
raise PDFEncryptionError('unknown revision: %r' % R)
|
||||||
U = str_value(param['U'])
|
U = str_value(param['U'])
|
||||||
P = int_value(param['P'])
|
P = int_value(param['P'])
|
||||||
is_printable = bool(P & 4)
|
self.is_printable = bool(P & 4)
|
||||||
is_modifiable = bool(P & 8)
|
self.is_modifiable = bool(P & 8)
|
||||||
is_extractable = bool(P & 16)
|
self.is_extractable = bool(P & 16)
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
password = (password+PASSWORD_PADDING)[:32] # 1
|
password = (password+PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5.md5(password) # 2
|
hash = md5.md5(password) # 2
|
||||||
|
@ -411,7 +420,8 @@ class PDFDocument:
|
||||||
hash.update(struct.pack('<L', P)) # 4
|
hash.update(struct.pack('<L', P)) # 4
|
||||||
hash.update(docid[0]) # 5
|
hash.update(docid[0]) # 5
|
||||||
if 4 <= R:
|
if 4 <= R:
|
||||||
raise NotImplementedError # 6
|
# 6
|
||||||
|
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
||||||
if 3 <= R:
|
if 3 <= R:
|
||||||
# 8
|
# 8
|
||||||
for _ in xrange(50):
|
for _ in xrange(50):
|
||||||
|
@ -429,8 +439,6 @@ class PDFDocument:
|
||||||
k = ''.join( chr(c ^ i) for c in key )
|
k = ''.join( chr(c ^ i) for c in key )
|
||||||
x = Arcfour(k).process(x)
|
x = Arcfour(k).process(x)
|
||||||
u1 = x+x # 32bytes total
|
u1 = x+x # 32bytes total
|
||||||
else:
|
|
||||||
raise PDFEncryptionError('unknown revision: %r' % R)
|
|
||||||
if R == 2:
|
if R == 2:
|
||||||
is_authenticated = (u1 == U)
|
is_authenticated = (u1 == U)
|
||||||
else:
|
else:
|
||||||
|
@ -485,10 +493,10 @@ class PDFDocument:
|
||||||
obj.set_objid(objid, 0)
|
obj.set_objid(objid, 0)
|
||||||
else:
|
else:
|
||||||
self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
(_,objid1) = self.parser.nextobject() # objid
|
(_,objid1) = self.parser.nexttoken() # objid
|
||||||
(_,genno) = self.parser.nextobject() # genno
|
(_,genno) = self.parser.nexttoken() # genno
|
||||||
assert objid1 == objid
|
assert objid1 == objid, (objid, objid1)
|
||||||
(_,kwd) = self.parser.nextobject()
|
(_,kwd) = self.parser.nexttoken()
|
||||||
if kwd != KEYWORD_OBJ:
|
if kwd != KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||||
(_,obj) = self.parser.nextobject()
|
(_,obj) = self.parser.nextobject()
|
||||||
|
@ -582,7 +590,11 @@ class PDFParser(PSStackParser):
|
||||||
self.seek(pos+objlen)
|
self.seek(pos+objlen)
|
||||||
while 1:
|
while 1:
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if line.startswith('endstream'): break
|
if 'endstream' in line:
|
||||||
|
i = line.index('endstream')
|
||||||
|
objlen += i
|
||||||
|
data += line[:i]
|
||||||
|
break
|
||||||
objlen += len(line)
|
objlen += len(line)
|
||||||
data += line
|
data += line
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
|
@ -597,7 +609,7 @@ class PDFParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_xref(self):
|
def find_xref(self):
|
||||||
# find the first xref table
|
# search the last xref table by scanning the file backwards.
|
||||||
prev = None
|
prev = None
|
||||||
for line in self.revreadlines():
|
for line in self.revreadlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
@ -620,19 +632,19 @@ class PDFParser(PSStackParser):
|
||||||
self.find_xref()
|
self.find_xref()
|
||||||
while 1:
|
while 1:
|
||||||
# read xref table
|
# read xref table
|
||||||
(linepos, line) = self.nextline()
|
(pos, token) = self.nexttoken()
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'read_xref: %r' % line
|
print >>stderr, 'read_xref: %r' % token
|
||||||
if line[0].isdigit():
|
if isinstance(token, int):
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(linepos)
|
self.seek(pos)
|
||||||
self.reset()
|
self.reset()
|
||||||
xref = PDFXRefStream(self)
|
xref = PDFXRefStream(self)
|
||||||
else:
|
else:
|
||||||
if line.strip() != 'xref':
|
if token != KEYWORD_XREF:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
|
||||||
(linepos, line))
|
(pos, token))
|
||||||
xref = PDFXRef(self)
|
xref = PDFXRef(self)
|
||||||
yield xref
|
yield xref
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
|
|
|
@ -424,6 +424,11 @@ class PSStackParser(PSBaseParser):
|
||||||
self.results = []
|
self.results = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def seek(self, pos):
|
||||||
|
PSBaseParser.seek(self, pos)
|
||||||
|
self.reset()
|
||||||
|
return
|
||||||
|
|
||||||
def push(self, *objs):
|
def push(self, *objs):
|
||||||
self.curstack.extend(objs)
|
self.curstack.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue