basic encryption support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@19 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
5c1aa960f5
commit
1300046181
|
@ -0,0 +1,9 @@
|
||||||
|
TODO:
|
||||||
|
- Code Documentation.
|
||||||
|
- Error handling for invalid type.
|
||||||
|
|
||||||
|
- Outlines.
|
||||||
|
- Named Objects. (pages)
|
||||||
|
- Writers.
|
||||||
|
- Linearized PDF.
|
||||||
|
- Encryption?
|
|
@ -0,0 +1,40 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
#
|
||||||
|
# Arcfour implementation
|
||||||
|
# * public domain *
|
||||||
|
#
|
||||||
|
|
||||||
|
class Arcfour:
    """Arcfour stream cipher.

    The key schedule runs once in the constructor.  The keystream position
    (i, j) and the permutation table are kept on the instance, so a message
    may be fed to process() in several pieces.  Encryption and decryption
    are the same operation.
    """

    def __init__(self, key):
        """Build the initial permutation table from key.

        key is a non-empty string; an empty key raises ZeroDivisionError
        because the key index is taken modulo its length.
        """
        # list() keeps the table mutable even where range() is lazy.
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + ord(key[i % klen])) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """XOR data with the next len(data) keystream bytes.

        Returns the result as a string of the same length and advances
        the cipher state, so consecutive calls continue the same stream.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect the output characters and join once instead of growing
        # a string with += for every input character.
        out = []
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            out.append(chr(ord(c) ^ k))
        (self.i, self.j) = (i, j)
        return ''.join(out)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Self-test: encrypt fixed key/plaintext pairs and compare the
    # ciphertext, rendered as upper-case hex, with the expected values.
    def doit(key, data):
        # Encrypt data under key and return the ciphertext as hex digits.
        cipher = Arcfour(key)
        return ''.join( '%02X' % ord(c) for c in cipher.process(data) )
    assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3'
    assert doit("Wiki", "pedia") == '1021BF0420'
    assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5'
    # Parenthesised so the line is valid with or without the print statement.
    print('test succeeded')
|
28
extent.py
28
extent.py
|
@ -20,6 +20,9 @@ class Rect:
|
||||||
self.y1 = y0+h
|
self.y1 = y0+h
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<Rect: (%d,%d)-(%d,%d)>' % (self.x0, self.y0, self.x1, self.y1)
|
||||||
|
|
||||||
def overlap(self, rect):
|
def overlap(self, rect):
|
||||||
return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
|
return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
|
||||||
rect.y1 <= self.y0 or self.y1 <= rect.y0)
|
rect.y1 <= self.y0 or self.y1 <= rect.y0)
|
||||||
|
@ -31,7 +34,7 @@ class ExtSet:
|
||||||
|
|
||||||
def __init__(self, gridsize):
|
def __init__(self, gridsize):
|
||||||
self.gridsize = gridsize
|
self.gridsize = gridsize
|
||||||
self.grid = []
|
self.grid = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def cells(self, x0, x1):
|
def cells(self, x0, x1):
|
||||||
|
@ -45,12 +48,18 @@ class ExtSet:
|
||||||
|
|
||||||
def add(self, x0, x1, obj):
|
def add(self, x0, x1, obj):
|
||||||
for i in self.cells(x0, x1):
|
for i in self.cells(x0, x1):
|
||||||
self.grid[i].append(obj)
|
if i not in self.grid:
|
||||||
|
a = []
|
||||||
|
self.grid[i] = a
|
||||||
|
else:
|
||||||
|
a = self.grid[i]
|
||||||
|
a.append(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get(self, x0, x1):
|
def get(self, x0, x1):
|
||||||
objs = set()
|
objs = set()
|
||||||
for i in self.cells(x0, x1):
|
for i in self.cells(x0, x1):
|
||||||
|
if i in self.grid:
|
||||||
objs.update(self.grid[i])
|
objs.update(self.grid[i])
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
|
@ -78,12 +87,13 @@ class ExtGrid:
|
||||||
self.vext = ExtSet(gridsize)
|
self.vext = ExtSet(gridsize)
|
||||||
return
|
return
|
||||||
|
|
||||||
def add(self, rect):
|
def add(self, rect, obj):
|
||||||
self.hext.add(rect.x0, rect.x1, rect)
|
self.hext.add(rect.x0, rect.x1, obj)
|
||||||
self.vext.add(rect.y0, rect.y1, rect)
|
self.vext.add(rect.y0, rect.y1, obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get(self, rect):
|
def get(self, rect, getrect):
|
||||||
rects = self.hext.get(rect.x0, rect.x1)
|
objs = self.hext.get(rect.x0, rect.x1)
|
||||||
rects.update_intersect(self.vext.get(rect.y0, rect.y1))
|
objs.intersection_update(self.vext.get(rect.y0, rect.y1))
|
||||||
return rects
|
objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ]
|
||||||
|
return objs
|
||||||
|
|
171
pdf2txt.py
171
pdf2txt.py
|
@ -7,86 +7,183 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||||
mult_matrix, apply_matrix
|
mult_matrix, apply_matrix
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
|
from extent import Rect, ExtSet, ExtGrid
|
||||||
|
|
||||||
|
|
||||||
|
## PageItem
|
||||||
|
##
|
||||||
|
class PageItem:
    """A page: its bounding box, rotation and the layout objects on it.

    Every object added is kept in insertion order in self.objs and is
    also registered in an ExtGrid under its own bbox, so that objects
    near a given rectangle can be looked up.
    """

    # Cell size handed to ExtGrid.
    GRID_SIZE = 20

    def __init__(self, id, bbox, rotate=0):
        """Create an empty container.

        id     -- identifier shown in the dump output.
        bbox   -- sequence (x0, y0, x1, y1) of corner coordinates.
        rotate -- rotation value shown in the dump output.
        """
        # Unpacked here rather than in the signature: tuple parameters
        # are not accepted by newer Python versions.  Positional callers
        # are unaffected.
        (x0, y0, x1, y1) = bbox
        self.id = id
        # Rect is built from an origin plus width and height.
        self.bbox = Rect(x0, y0, x1-x0, y1-y0)
        self.rotate = rotate
        self.grid = ExtGrid(self.GRID_SIZE)
        self.objs = []
        return

    def __repr__(self):
        bbox = self.bbox
        return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
                (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate))

    def add(self, obj):
        """Append obj (which must have a bbox attribute) to this container."""
        self.objs.append(obj)
        self.grid.add(obj.bbox, obj)
        return

    def dump(self, outfp, codec):
        """Write the opening tag, every contained object, and '</page>'."""
        outfp.write(repr(self)+'\n')
        for obj in self.objs:
            obj.dump(outfp, codec)
        outfp.write('</page>\n')
        return

    def fuse(self):
        """Look up the neighbours of every object.

        For each rectangle returned by an object's search_range(), the
        grid is queried for other objects overlapping it.  The result is
        currently discarded; no merging is performed yet.
        """
        # Loop-invariant accessor, created once.
        getbbox = (lambda obj: obj.bbox)
        for obj1 in self.objs:
            for rect in obj1.search_range():
                neighbors = [ obj2 for obj2 in self.grid.get(rect, getbbox) if obj2 is not obj1 ]
        return
|
||||||
|
|
||||||
|
|
||||||
|
## FigureItem
|
||||||
|
##
|
||||||
|
class FigureItem(PageItem):
    """A figure: a nested container that is dumped as a <figure> element."""

    def __repr__(self):
        box = self.bbox
        fields = (self.id, box.x0, box.y0, box.x1, box.y1)
        return '<figure id=%r bbox="%d,%d,%d,%d">' % fields

    def dump(self, outfp, codec):
        """Write the opening tag, each child in order, then '</figure>'."""
        outfp.write('%s\n' % repr(self))
        for child in self.objs:
            child.dump(outfp, codec)
        outfp.write('</figure>\n')
        return

    def search_range(self):
        """A figure has no rectangles to search for neighbours."""
        return list()
|
||||||
|
|
||||||
|
|
||||||
|
## TextItem
|
||||||
|
##
|
||||||
|
class TextItem:
    """A run of text with its transform, font, size and bounding box.

    direction is 1 when the font is not vertical and 2 when it is.
    text is consumed as a sequence of (displacement, character) pairs.
    """

    def __init__(self, matrix, font, size, width, text):
        self.matrix = matrix
        self.font = font
        (a, b, c, d, tx, ty) = self.matrix
        # The matrix without its translation part, used to scale lengths.
        linear = (a, b, c, d, 0, 0)
        (scaled_width, self.size) = apply_matrix(linear, (width, size))
        self.width = abs(scaled_width)
        self.origin = (tx, ty)
        self.direction = 0
        if self.font.is_vertical():
            self.direction = 2
            smallest = min( disp for (disp, _) in text )
            (offset, _) = apply_matrix(linear, (smallest*size*0.001, 0))
            self.bbox = Rect(tx-offset, ty+self.width, self.size, self.width)
        else:
            self.direction = 1
            # The ascent is computed but does not enter the bounding box.
            (_, ascent) = apply_matrix(linear, (0, font.ascent*size*0.001))
            (_, descent) = apply_matrix(linear, (0, font.descent*size*0.001))
            self.bbox = Rect(tx, ty+descent, self.width, self.size)
        chars = [ char for (_, char) in text ]
        self.text = ''.join(chars)
        return

    def __repr__(self):
        fields = (self.matrix, self.font, self.size, self.width, self.text)
        return '<text matrix=%r font=%r size=%r width=%r text=%r>' % fields

    def dump(self, outfp, codec):
        """Write this item as one <text> element, text encoded with codec."""
        (_, _, _, _, tx, ty) = self.matrix
        opening = ('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' %
                   (tx, ty, self.font.fontname, self.size, self.width))
        outfp.write(opening)
        # Characters the codec cannot represent become XML character references.
        outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
        outfp.write('</text>\n')
        return

    def search_range(self):
        """Return a one-element list with the size-by-size square to search.

        For direction 1 the square starts at the box's x1 edge; otherwise
        it starts one size below the box's y0.
        """
        box = self.bbox
        side = self.size
        if self.direction != 1:
            return [ Rect(box.x0, box.y0-side, side, side) ]
        return [ Rect(box.x1, box.y0, side, side) ]
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## TextConverter
|
||||||
##
|
##
|
||||||
class TextConverter(PDFDevice):
|
class TextConverter(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, outfp, rsrc, codec, debug=0):
|
def __init__(self, rsrc, debug=0):
|
||||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||||
self.outfp = outfp
|
self.pages = []
|
||||||
self.codec = codec
|
self.stack = []
|
||||||
return
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.outfp.write('\n')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page):
|
||||||
(x0,y0,x1,y1) = page.mediabox
|
self.context = PageItem(str(page.pageid), page.mediabox, page.rotate)
|
||||||
self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
|
|
||||||
(page.pageid, x0,y0,x1,y1, page.rotate))
|
|
||||||
return
|
return
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
self.outfp.write('</page>\n')
|
assert not self.stack
|
||||||
|
self.pages.append(self.context)
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, bbox):
|
def begin_figure(self, name, bbox):
|
||||||
(x0,y0,x1,y1) = bbox
|
self.stack.append(self.context)
|
||||||
self.outfp.write('<figure name="%s" bbox="%d,%d,%d,%d">\n' %
|
self.context = FigureItem(name, bbox)
|
||||||
(name, x0,y0,x1,y1))
|
|
||||||
return
|
return
|
||||||
def end_figure(self, _):
|
def end_figure(self, _):
|
||||||
self.outfp.write('</figure>\n')
|
fig = self.context
|
||||||
|
self.context = self.stack.pop()
|
||||||
|
self.context.add(fig)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
#return unichr(cid)
|
#return unichr(cid)
|
||||||
#return unichr(cid+32)
|
return None
|
||||||
return
|
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, size, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
spwidth = int(-font.char_width(32) * 0.6) # space width
|
spwidth = int(-font.char_width(32) * 0.6) # space width
|
||||||
buf = ''
|
text = []
|
||||||
for x in seq:
|
for x in seq:
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
if not font.is_vertical() and x <= spwidth:
|
if not font.is_vertical() and x <= spwidth:
|
||||||
buf += ' '
|
text.append((0, ' '))
|
||||||
else:
|
else:
|
||||||
chars = font.decode(x)
|
chars = font.decode(x)
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unicode(cid)
|
||||||
buf += char
|
text.append((font.char_disp(cid), char))
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
s = self.handle_undefined_char(cidcoding, cid)
|
s = self.handle_undefined_char(cidcoding, cid)
|
||||||
if s:
|
if s:
|
||||||
buf += s
|
text.append(s)
|
||||||
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
|
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||||
if font.is_vertical():
|
font, textstate.fontsize, size, text)
|
||||||
size = -size
|
self.context.add(item)
|
||||||
tag = 'vtext'
|
return
|
||||||
else:
|
|
||||||
tag = 'htext'
|
def dump(self, outfp, codec):
|
||||||
if (b != 0 or c != 0 or a <= 0 or d <= 0):
|
outfp.write('<document>\n')
|
||||||
tag += ' skewed'
|
for page in self.pages:
|
||||||
s = buf.encode(self.codec, 'xmlcharrefreplace')
|
#page.fuse()
|
||||||
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
|
page.dump(outfp, codec)
|
||||||
def f(x): return '%.03f' % x
|
outfp.write('</document>\n')
|
||||||
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
|
|
||||||
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
device = TextConverter(outfp, rsrc, codec, debug=debug)
|
device = TextConverter(rsrc, debug=debug)
|
||||||
outfp.write('<document>\n')
|
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
|
@ -95,7 +192,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
if pages and (i not in pages): continue
|
if pages and (i not in pages): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
fp.close()
|
fp.close()
|
||||||
outfp.write('</document>\n')
|
device.dump(outfp, codec)
|
||||||
device.close()
|
device.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
129
pdfparser.py
129
pdfparser.py
|
@ -4,30 +4,30 @@
|
||||||
# ver 0.1, Dec 24 2004-
|
# ver 0.1, Dec 24 2004-
|
||||||
# ver 0.2, Dec 24 2007
|
# ver 0.2, Dec 24 2007
|
||||||
|
|
||||||
# TODO:
|
|
||||||
# - Code Documentation.
|
|
||||||
# - Error handling for invalid type.
|
|
||||||
|
|
||||||
# - Outlines.
|
|
||||||
# - Named Objects. (pages)
|
|
||||||
# - Writers.
|
|
||||||
# - Linearized PDF.
|
|
||||||
# - Encryption?
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
import md5, struct
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
|
from arcfour import Arcfour
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
literal_name, keyword_name, \
|
literal_name, keyword_name, \
|
||||||
PSStackParser, STRICT
|
PSStackParser, STRICT
|
||||||
|
|
||||||
|
|
||||||
|
def decrypt_rc4(key, objid, genno, data):
    '''
    Decrypt DATA belonging to object (OBJID, GENNO) with the document KEY.

    The per-object key is the MD5 digest of KEY followed by the low three
    bytes of OBJID and the low two bytes of GENNO (both little-endian),
    cut to the length of that salted key but never more than 16 bytes.
    '''
    objid_bytes = struct.pack('<L', objid)[:3]
    genno_bytes = struct.pack('<L', genno)[:2]
    salted = key + objid_bytes + genno_bytes
    objkey = md5.md5(salted).digest()[:min(len(salted), 16)]
    return Arcfour(objkey).process(data)
|
||||||
|
|
||||||
|
|
||||||
## PDF Exceptions
|
## PDF Exceptions
|
||||||
##
|
##
|
||||||
class PDFException(PSException): pass
|
class PDFException(PSException): pass
|
||||||
class PDFSyntaxError(PDFException): pass
|
class PDFSyntaxError(PDFException): pass
|
||||||
class PDFEncrypted(PDFException): pass
|
class PDFEncryptionError(PDFException): pass
|
||||||
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
class PDFTypeError(PDFException): pass
|
class PDFTypeError(PDFException): pass
|
||||||
class PDFValueError(PDFException): pass
|
class PDFValueError(PDFException): pass
|
||||||
|
|
||||||
|
@ -38,6 +38,7 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
|
||||||
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
||||||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||||
|
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||||
KEYWORD_R = PSKeywordTable.intern('R')
|
KEYWORD_R = PSKeywordTable.intern('R')
|
||||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||||
|
@ -45,6 +46,7 @@ KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||||
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
|
@ -77,7 +79,7 @@ def resolve1(x):
|
||||||
x = x.resolve()
|
x = x.resolve()
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def resolveall(x):
|
def resolve_all(x):
|
||||||
'''
|
'''
|
||||||
Recursively resolve X and all the internals.
|
Recursively resolve X and all the internals.
|
||||||
Make sure there is no indirect reference within the nested object.
|
Make sure there is no indirect reference within the nested object.
|
||||||
|
@ -86,10 +88,23 @@ def resolveall(x):
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve()
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ resolveall(v) for v in x ]
|
x = [ resolve_all(v) for v in x ]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k,v) in x.iteritems():
|
||||||
x[k] = resolveall(v)
|
x[k] = resolve_all(v)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Strings are replaced by decipher(objid, genno, string).  Lists are
    rebuilt element by element, dictionaries have their values replaced
    in place, and any other value is returned unchanged.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [ decipher_all(decipher, objid, genno, item) for item in x ]
    if isinstance(x, dict):
        # Only existing keys are reassigned, so the dictionary keeps its
        # size while it is being iterated.
        for (name, value) in x.iteritems():
            x[name] = decipher_all(decipher, objid, genno, value)
    return x
|
||||||
|
|
||||||
# Type checking
|
# Type checking
|
||||||
|
@ -159,6 +174,13 @@ class PDFStream:
|
||||||
self.rawdata = rawdata
|
self.rawdata = rawdata
|
||||||
self.decipher = decipher
|
self.decipher = decipher
|
||||||
self.data = None
|
self.data = None
|
||||||
|
self.objid = None
|
||||||
|
self.genno = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def set_objid(self, objid, genno):
    # Remember which object this stream belongs to; both numbers are
    # handed to the decipher callback when the data is decoded.
    (self.objid, self.genno) = (objid, genno)
    return None
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -168,7 +190,7 @@ class PDFStream:
|
||||||
assert self.data == None and self.rawdata != None
|
assert self.data == None and self.rawdata != None
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
data = self.decipher(data)
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
if 'Filter' not in self.dic:
|
if 'Filter' not in self.dic:
|
||||||
self.data = data
|
self.data = data
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
|
@ -201,6 +223,8 @@ class PDFStream:
|
||||||
buf += ent1
|
buf += ent1
|
||||||
ent0 = ent1
|
ent0 = ent1
|
||||||
data = buf
|
data = buf
|
||||||
|
if f == LITERAL_CRYPT:
|
||||||
|
raise PDFEncryptionError
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Invalid filter spec: %r' % f)
|
raise PDFValueError('Invalid filter spec: %r' % f)
|
||||||
|
@ -338,10 +362,11 @@ class PDFDocument:
|
||||||
self.xrefs = []
|
self.xrefs = []
|
||||||
self.objs = {}
|
self.objs = {}
|
||||||
self.parsed_objs = {}
|
self.parsed_objs = {}
|
||||||
self.decipher = None
|
|
||||||
self.root = None
|
self.root = None
|
||||||
self.catalog = None
|
self.catalog = None
|
||||||
self.parser = None
|
self.parser = None
|
||||||
|
self.encryption = None
|
||||||
|
self.decipher = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser):
|
||||||
|
@ -351,20 +376,74 @@ class PDFDocument:
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
raise PDFEncrypted
|
self.encryption = (list_value(trailer['ID']),
|
||||||
param = dict_value(trailer['Encrypt'])
|
dict_value(trailer['Encrypt']))
|
||||||
self.decipher = DECRYPTOR(param)
|
|
||||||
self.parser.strfilter = self.decipher
|
|
||||||
if 'Root' in trailer:
|
if 'Root' in trailer:
|
||||||
self.set_root(dict_value(trailer['Root']))
|
self.set_root(dict_value(trailer['Root']))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
raise PDFValueError('no /Root object!')
|
raise PDFValueError('no /Root object!')
|
||||||
|
if self.encryption:
|
||||||
|
self.prepare_cipher()
|
||||||
|
return
|
||||||
|
|
||||||
|
def prepare_cipher(self, password=''):
    '''
    Derive the document key from PASSWORD and install self.decipher.

    Reads (docid, param) from self.encryption, computes the key
    (Algorithm 3.2), checks it against the /U entry (Algorithm 3.4 for
    revision 2, Algorithm 3.5 for revision 3) and, on success, sets
    self.decipher to a function (objid, genno, data) -> deciphered data.

    Raises PDFEncryptionError for an unknown filter, algorithm or
    revision, NotImplementedError for revision 4, and
    PDFPasswordIncorrect when the password does not match.
    '''
    (docid, param) = self.encryption
    if literal_name(param['Filter']) != 'Standard':
        raise PDFEncryptionError('unknown filter: param=%r' % param)
    V = int_value(param.get('V', 0))
    if not (V == 1 or V == 2):
        raise PDFEncryptionError('unknown algorithm: param=%r' % param)
    length = int_value(param.get('Length', 40)) # Key length (bits)
    keylen = length // 8 # Key length (bytes)
    O = str_value(param['O'])
    R = int_value(param['R']) # Revision
    if 5 <= R:
        raise PDFEncryptionError('unknown revision: %r' % R)
    U = str_value(param['U'])
    # P also carries the permission bits (print = 4, modify = 8,
    # extract = 16); they are not enforced here.
    P = int_value(param['P'])
    # Algorithm 3.2
    password = (password+PASSWORD_PADDING)[:32] # 1
    digest = md5.md5(password) # 2
    digest.update(O) # 3
    # P is normally stored as a negative number, which the unsigned
    # format code does not accept, so reduce it to 32 bits first.
    digest.update(struct.pack('<L', P & 0xffffffff)) # 4
    digest.update(docid[0]) # 5
    if 4 <= R:
        raise NotImplementedError # 6
    if 3 <= R:
        # 8
        for _ in xrange(50):
            digest = md5.md5(digest.digest()[:keylen])
    key = digest.digest()[:keylen]
    if R == 2:
        # Algorithm 3.4
        u1 = Arcfour(key).process(password)
    elif R == 3:
        # Algorithm 3.5
        digest = md5.md5(PASSWORD_PADDING) # 2
        digest.update(docid[0]) # 3
        x = Arcfour(key).process(digest.digest()[:16]) # 4
        for i in xrange(1,19+1):
            # key is a string, so each character goes through ord()
            # before it can be XORed with the round number.
            k = ''.join( chr(ord(c) ^ i) for c in key )
            x = Arcfour(k).process(x)
        u1 = x+x # 32bytes total
    else:
        raise PDFEncryptionError('unknown revision: %r' % R)
    if R == 2:
        is_authenticated = (u1 == U)
    else:
        # Only the first 16 bytes of /U are compared for revision 3.
        is_authenticated = (u1[:16] == U[:16])
    if not is_authenticated:
        raise PDFPasswordIncorrect
    self.decipher = (lambda objid,genno,data: decrypt_rc4(key, objid, genno, data))
    return
|
||||||
|
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
if objid in self.objs:
|
if objid in self.objs:
|
||||||
|
genno = 0
|
||||||
obj = self.objs[objid]
|
obj = self.objs[objid]
|
||||||
else:
|
else:
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
|
@ -400,18 +479,26 @@ class PDFDocument:
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
self.parsed_objs[stream] = objs
|
self.parsed_objs[stream] = objs
|
||||||
|
genno = 0
|
||||||
obj = objs[stream.dic['N']*2+index]
|
obj = objs[stream.dic['N']*2+index]
|
||||||
|
if isinstance(obj, PDFStream):
|
||||||
|
obj.set_objid(objid, 0)
|
||||||
else:
|
else:
|
||||||
self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
(_,objid1) = self.parser.nextobject() # objid
|
(_,objid1) = self.parser.nextobject() # objid
|
||||||
(_,genno1) = self.parser.nextobject() # genno
|
(_,genno) = self.parser.nextobject() # genno
|
||||||
|
assert objid1 == objid
|
||||||
(_,kwd) = self.parser.nextobject()
|
(_,kwd) = self.parser.nextobject()
|
||||||
if kwd != KEYWORD_OBJ:
|
if kwd != KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||||
(_,obj) = self.parser.nextobject()
|
(_,obj) = self.parser.nextobject()
|
||||||
|
if isinstance(obj, PDFStream):
|
||||||
|
obj.set_objid(objid, genno)
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
self.objs[objid] = obj
|
self.objs[objid] = obj
|
||||||
|
if self.decipher:
|
||||||
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def get_pages(self, debug=0):
|
def get_pages(self, debug=0):
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue