basic encryption support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@19 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-26 06:47:56 +00:00
parent 5c1aa960f5
commit 1300046181
10 changed files with 311 additions and 68 deletions

9
TODO Normal file
View File

@ -0,0 +1,9 @@
TODO:
- Code Documentation.
- Error handling for invalid type.
- Outlines.
- Named Objects. (pages)
- Writers.
- Linearized PDF.
- Encryption?

40
arcfour.py Executable file
View File

@ -0,0 +1,40 @@
#!/usr/bin/env python
#
# Arcfour implementation
# * public domain *
#
class Arcfour:
    '''
    Arcfour stream cipher.

    The key schedule runs once in the constructor.  process() XORs its
    input with the keystream and stores the keystream position back on
    the instance, so consecutive process() calls continue the same
    stream.  Since the operation is a plain XOR, the same call both
    encrypts and decrypts.
    '''

    def __init__(self, key):
        '''
        Build the 256-entry permutation table from KEY.

        KEY is a string whose characters are converted with ord().
        An empty key fails with ZeroDivisionError (i % 0).
        '''
        # list(...) guarantees a mutable table even where range() is lazy.
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + ord(key[i % klen])) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        '''
        Return DATA XORed with the next len(DATA) keystream bytes,
        as a string of the same length.
        '''
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect the output characters and join once instead of
        # growing a string with += inside the loop.
        out = []
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            out.append(chr(ord(c) ^ k))
        # Remember the position so that the next call continues the stream.
        (self.i, self.j) = (i, j)
        return ''.join(out)
if __name__ == '__main__':
    # Self-test: encrypt known (key, plaintext) pairs and compare the
    # upper-case hex dump of the result with the expected ciphertext.
    def doit(key, data):
        encrypted = Arcfour(key).process(data)
        return ''.join( '%02X' % ord(ch) for ch in encrypted )
    for (key, plain, expected) in (
            ("Key", "Plaintext", 'BBF316E8D940AF0AD3'),
            ("Wiki", "pedia", '1021BF0420'),
            ("Secret", "Attack at dawn", '45A01F645FC35B383552544B9BF5'),
            ):
        assert doit(key, plain) == expected
    print('test succeeded')

View File

@ -20,6 +20,9 @@ class Rect:
self.y1 = y0+h self.y1 = y0+h
return return
def __repr__(self):
return '<Rect: (%d,%d)-(%d,%d)>' % (self.x0, self.y0, self.x1, self.y1)
def overlap(self, rect): def overlap(self, rect):
return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
rect.y1 <= self.y0 or self.y1 <= rect.y0) rect.y1 <= self.y0 or self.y1 <= rect.y0)
@ -31,7 +34,7 @@ class ExtSet:
def __init__(self, gridsize): def __init__(self, gridsize):
self.gridsize = gridsize self.gridsize = gridsize
self.grid = [] self.grid = {}
return return
def cells(self, x0, x1): def cells(self, x0, x1):
@ -45,12 +48,18 @@ class ExtSet:
def add(self, x0, x1, obj): def add(self, x0, x1, obj):
for i in self.cells(x0, x1): for i in self.cells(x0, x1):
self.grid[i].append(obj) if i not in self.grid:
a = []
self.grid[i] = a
else:
a = self.grid[i]
a.append(obj)
return return
def get(self, x0, x1): def get(self, x0, x1):
objs = set() objs = set()
for i in self.cells(x0, x1): for i in self.cells(x0, x1):
if i in self.grid:
objs.update(self.grid[i]) objs.update(self.grid[i])
return objs return objs
@ -78,12 +87,13 @@ class ExtGrid:
self.vext = ExtSet(gridsize) self.vext = ExtSet(gridsize)
return return
def add(self, rect): def add(self, rect, obj):
self.hext.add(rect.x0, rect.x1, rect) self.hext.add(rect.x0, rect.x1, obj)
self.vext.add(rect.y0, rect.y1, rect) self.vext.add(rect.y0, rect.y1, obj)
return return
def get(self, rect): def get(self, rect, getrect):
rects = self.hext.get(rect.x0, rect.x1) objs = self.hext.get(rect.x0, rect.x1)
rects.update_intersect(self.vext.get(rect.y0, rect.y1)) objs.intersection_update(self.vext.get(rect.y0, rect.y1))
return rects objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ]
return objs

View File

@ -7,86 +7,183 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \ PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix mult_matrix, apply_matrix
from cmap import CMapDB from cmap import CMapDB
from extent import Rect, ExtSet, ExtGrid
## PageItem
##
class PageItem:
    '''
    A page together with the layout objects placed on it.

    Objects are kept twice: in insertion order (self.objs) and in an
    ExtGrid indexed by each object's bbox (self.grid).
    '''

    GRID_SIZE = 20

    def __init__(self, id, bbox, rotate=0):
        '''
        ID is stored as given and shown with %r in the page tag.
        BBOX is an (x0,y0,x1,y1) sequence.
        ROTATE is stored as given and shown with %d in the page tag.
        '''
        # Unpacked here rather than in the signature: tuple parameters
        # are not available in newer Python versions.
        (x0, y0, x1, y1) = bbox
        self.id = id
        # Rect is constructed from the origin plus width and height.
        self.bbox = Rect(x0, y0, x1-x0, y1-y0)
        self.rotate = rotate
        self.grid = ExtGrid(self.GRID_SIZE)
        self.objs = []
        return

    def __repr__(self):
        bbox = self.bbox
        return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
                (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate))

    def add(self, obj):
        '''
        Append OBJ to the page and register it in the grid under obj.bbox.
        '''
        self.objs.append(obj)
        self.grid.add(obj.bbox, obj)
        return

    def dump(self, outfp, codec):
        '''
        Write the opening page tag, every contained object (via its own
        dump method) and the closing tag to OUTFP.
        '''
        outfp.write(repr(self)+'\n')
        for obj in self.objs:
            obj.dump(outfp, codec)
        outfp.write('</page>\n')
        return

    def fuse(self):
        '''
        For every object, query the grid with each of its search ranges
        and collect the other objects found there.

        The neighbours are only computed, not acted upon, so this method
        currently has no visible effect.
        '''
        # The accessor does not depend on the loop variables; build it once.
        getrect = (lambda obj: obj.bbox)
        for obj1 in self.objs:
            for rect in obj1.search_range():
                neighbors = [ obj2 for obj2 in self.grid.get(rect, getrect)
                              if obj2 is not obj1 ]
        return
## FigureItem
##
class FigureItem(PageItem):
    '''
    A figure: a PageItem container that is written out as a figure
    element and contributes no search ranges of its own.
    '''

    def __repr__(self):
        box = self.bbox
        fields = (self.id, box.x0, box.y0, box.x1, box.y1)
        return '<figure id=%r bbox="%d,%d,%d,%d">' % fields

    def dump(self, outfp, codec):
        # Opening tag, then every contained object, then the closing tag.
        outfp.write('%r\n' % (self,))
        for child in self.objs:
            child.dump(outfp, codec)
        outfp.write('</figure>\n')
        return

    def search_range(self):
        # A figure never looks for neighbours.
        return []
## TextItem
##
class TextItem:
    '''
    A run of text drawn with one font under one transformation matrix.

    TEXT is a sequence of (displacement, character) pairs.
    self.direction is 1 when the font is not vertical and 2 when it is.
    '''

    def __init__(self, matrix, font, size, width, text):
        self.matrix = matrix
        self.font = font
        (a,b,c,d,tx,ty) = self.matrix
        # Same matrix with the translation removed, used for extents.
        linear = (a,b,c,d,0,0)
        (w, self.size) = apply_matrix(linear, (width,size))
        self.width = abs(w)
        self.origin = (tx,ty)
        self.direction = 0
        if self.font.is_vertical():
            self.direction = 2
            mindisp = min( disp for (disp,_) in text )
            (mindisp,_) = apply_matrix(linear, (mindisp*size*0.001,0))
            self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
        else:
            self.direction = 1
            # ascent is transformed as well but only descent is used below.
            (_,ascent) = apply_matrix(linear, (0,font.ascent*size*0.001))
            (_,descent) = apply_matrix(linear, (0,font.descent*size*0.001))
            self.bbox = Rect(tx, ty+descent, self.width, self.size)
        self.text = ''.join( ch for (_,ch) in text )
        return

    def __repr__(self):
        fields = (self.matrix, self.font, self.size, self.width, self.text)
        return '<text matrix=%r font=%r size=%r width=%r text=%r>' % fields

    def dump(self, outfp, codec):
        # Only the translation part of the matrix appears in the tag.
        (_,_,_,_,tx,ty) = self.matrix
        attrs = (tx, ty, self.font.fontname, self.size, self.width)
        outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' % attrs)
        outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
        outfp.write('</text>\n')
        return

    def search_range(self):
        # One square of side self.size: starting at (x1, y0) of the
        # bbox for direction 1, otherwise at (x0, y0 - size).
        box = self.bbox
        if self.direction == 1:
            (x, y) = (box.x1, box.y0)
        else:
            (x, y) = (box.x0, box.y0-self.size)
        return [ Rect(x, y, self.size, self.size) ]
## TextConverter ## TextConverter
## ##
class TextConverter(PDFDevice): class TextConverter(PDFDevice):
def __init__(self, outfp, rsrc, codec, debug=0): def __init__(self, rsrc, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug) PDFDevice.__init__(self, rsrc, debug=debug)
self.outfp = outfp self.pages = []
self.codec = codec self.stack = []
return
def close(self):
self.outfp.write('\n')
return return
def begin_page(self, page): def begin_page(self, page):
(x0,y0,x1,y1) = page.mediabox self.context = PageItem(str(page.pageid), page.mediabox, page.rotate)
self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
(page.pageid, x0,y0,x1,y1, page.rotate))
return return
def end_page(self, _): def end_page(self, _):
self.outfp.write('</page>\n') assert not self.stack
self.pages.append(self.context)
return return
def begin_figure(self, name, bbox): def begin_figure(self, name, bbox):
(x0,y0,x1,y1) = bbox self.stack.append(self.context)
self.outfp.write('<figure name="%s" bbox="%d,%d,%d,%d">\n' % self.context = FigureItem(name, bbox)
(name, x0,y0,x1,y1))
return return
def end_figure(self, _): def end_figure(self, _):
self.outfp.write('</figure>\n') fig = self.context
self.context = self.stack.pop()
self.context.add(fig)
return return
def handle_undefined_char(self, cidcoding, cid): def handle_undefined_char(self, cidcoding, cid):
if self.debug: if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
#return unichr(cid) #return unichr(cid)
#return unichr(cid+32) return None
return
def render_string(self, textstate, textmatrix, size, seq): def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width spwidth = int(-font.char_width(32) * 0.6) # space width
buf = '' text = []
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth: if not font.is_vertical() and x <= spwidth:
buf += ' ' text.append((0, ' '))
else: else:
chars = font.decode(x) chars = font.decode(x)
for cid in chars: for cid in chars:
try: try:
char = font.to_unicode(cid) char = font.to_unicode(cid)
buf += char text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
s = self.handle_undefined_char(cidcoding, cid) s = self.handle_undefined_char(cidcoding, cid)
if s: if s:
buf += s text.append(s)
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) item = TextItem(mult_matrix(textmatrix, self.ctm),
if font.is_vertical(): font, textstate.fontsize, size, text)
size = -size self.context.add(item)
tag = 'vtext' return
else:
tag = 'htext' def dump(self, outfp, codec):
if (b != 0 or c != 0 or a <= 0 or d <= 0): outfp.write('<document>\n')
tag += ' skewed' for page in self.pages:
s = buf.encode(self.codec, 'xmlcharrefreplace') #page.fuse()
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) page.dump(outfp, codec)
def f(x): return '%.03f' % x outfp.write('</document>\n')
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
return return
# pdf2txt # pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
device = TextConverter(outfp, rsrc, codec, debug=debug) device = TextConverter(rsrc, debug=debug)
outfp.write('<document>\n')
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname) fp = file(fname)
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp, debug=debug)
@ -95,7 +192,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
if pages and (i not in pages): continue if pages and (i not in pages): continue
interpreter.process_page(page) interpreter.process_page(page)
fp.close() fp.close()
outfp.write('</document>\n') device.dump(outfp, codec)
device.close() device.close()
return return

View File

@ -4,30 +4,30 @@
# ver 0.1, Dec 24 2004- # ver 0.1, Dec 24 2004-
# ver 0.2, Dec 24 2007 # ver 0.2, Dec 24 2007
# TODO:
# - Code Documentation.
# - Error handling for invalid type.
# - Outlines.
# - Named Objects. (pages)
# - Writers.
# - Linearized PDF.
# - Encryption?
import sys import sys
import md5, struct
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from utils import choplist, nunpack
from arcfour import Arcfour
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \ literal_name, keyword_name, \
PSStackParser, STRICT PSStackParser, STRICT
def decrypt_rc4(key, objid, genno, data):
    '''
    Decipher DATA belonging to object (OBJID, GENNO) with an
    object-specific Arcfour key derived from the document KEY.
    '''
    # Document key + low 3 bytes of the object id + low 2 bytes of the
    # generation number, both little-endian.
    salted = key + struct.pack('<L',objid)[:3] + struct.pack('<L',genno)[:2]
    digest = md5.md5(salted).digest()
    # Use as many digest bytes as the salted key is long, at most 16.
    objkey = digest[:min(len(salted),16)]
    return Arcfour(objkey).process(data)
## PDF Exceptions ## PDF Exceptions
## ##
class PDFException(PSException): pass class PDFException(PSException): pass
class PDFSyntaxError(PDFException): pass class PDFSyntaxError(PDFException): pass
class PDFEncrypted(PDFException): pass class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFTypeError(PDFException): pass class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass class PDFValueError(PDFException): pass
@ -38,6 +38,7 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_OBJ = PSKeywordTable.intern('obj')
@ -45,6 +46,7 @@ KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream') KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref') KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
## PDFObjRef ## PDFObjRef
@ -77,7 +79,7 @@ def resolve1(x):
x = x.resolve() x = x.resolve()
return x return x
def resolveall(x): def resolve_all(x):
''' '''
Recursively resolve X and all the internals. Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
@ -86,10 +88,23 @@ def resolveall(x):
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
if isinstance(x, list): if isinstance(x, list):
x = [ resolveall(v) for v in x ] x = [ resolve_all(v) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = resolveall(v) x[k] = resolve_all(v)
return x
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every string found in X (directly, or nested in lists and dict
    values) is replaced by decipher(objid, genno, string).  Dict keys
    and values of any other type are passed through unchanged.

    X itself is never modified: lists and dicts are rebuilt.  The caller
    keeps the raw object in its cache before deciphering it, so changing
    a dict in place would also change the cached copy, and deciphering
    that copy again would run the cipher over already deciphered data.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [ decipher_all(decipher, objid, genno, v) for v in x ]
    if isinstance(x, dict):
        return dict( (k, decipher_all(decipher, objid, genno, v))
                     for (k,v) in x.items() )
    return x
# Type checking # Type checking
@ -159,6 +174,13 @@ class PDFStream:
self.rawdata = rawdata self.rawdata = rawdata
self.decipher = decipher self.decipher = decipher
self.data = None self.data = None
self.objid = None
self.genno = None
return
def set_objid(self, objid, genno):
    # Record the object id and generation number of this stream.
    (self.objid, self.genno) = (objid, genno)
    return
def __repr__(self): def __repr__(self):
@ -168,7 +190,7 @@ class PDFStream:
assert self.data == None and self.rawdata != None assert self.data == None and self.rawdata != None
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
data = self.decipher(data) data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic: if 'Filter' not in self.dic:
self.data = data self.data = data
self.rawdata = None self.rawdata = None
@ -201,6 +223,8 @@ class PDFStream:
buf += ent1 buf += ent1
ent0 = ent1 ent0 = ent1
data = buf data = buf
if f == LITERAL_CRYPT:
raise PDFEncryptionError
else: else:
if STRICT: if STRICT:
raise PDFValueError('Invalid filter spec: %r' % f) raise PDFValueError('Invalid filter spec: %r' % f)
@ -338,10 +362,11 @@ class PDFDocument:
self.xrefs = [] self.xrefs = []
self.objs = {} self.objs = {}
self.parsed_objs = {} self.parsed_objs = {}
self.decipher = None
self.root = None self.root = None
self.catalog = None self.catalog = None
self.parser = None self.parser = None
self.encryption = None
self.decipher = None
return return
def set_parser(self, parser): def set_parser(self, parser):
@ -351,20 +376,74 @@ class PDFDocument:
for xref in self.xrefs: for xref in self.xrefs:
trailer = xref.trailer trailer = xref.trailer
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
raise PDFEncrypted self.encryption = (list_value(trailer['ID']),
param = dict_value(trailer['Encrypt']) dict_value(trailer['Encrypt']))
self.decipher = DECRYPTOR(param)
self.parser.strfilter = self.decipher
if 'Root' in trailer: if 'Root' in trailer:
self.set_root(dict_value(trailer['Root'])) self.set_root(dict_value(trailer['Root']))
break break
else: else:
raise PDFValueError('no /Root object!') raise PDFValueError('no /Root object!')
if self.encryption:
self.prepare_cipher()
return
def prepare_cipher(self, password=''):
    '''
    Check PASSWORD against the document's encryption dictionary and
    install self.decipher.

    Reads self.encryption, a (document id list, encryption dict) pair.
    Only the 'Standard' filter with V of 1 or 2 and revision 2 or 3 is
    handled.

    Raises PDFEncryptionError for an unknown filter, algorithm or
    revision, NotImplementedError for revision 4, and
    PDFPasswordIncorrect when the password does not match the U entry.
    '''
    (docid, param) = self.encryption
    if literal_name(param['Filter']) != 'Standard':
        raise PDFEncryptionError('unknown filter: param=%r' % param)
    V = int_value(param.get('V', 0))
    if not (V == 1 or V == 2):
        raise PDFEncryptionError('unknown algorithm: param=%r' % param)
    length = int_value(param.get('Length', 40)) # Key length (bits)
    keylen = length // 8                        # Key length (bytes)
    O = str_value(param['O'])
    R = int_value(param['R']) # Revision
    if 5 <= R:
        raise PDFEncryptionError('unknown revision: %r' % R)
    U = str_value(param['U'])
    # Permission flags.  Masks 4, 8 and 16 stand for printing,
    # modification and extraction; they are not enforced here.
    P = int_value(param['P'])
    # Algorithm 3.2
    padded = (password+PASSWORD_PADDING)[:32] # 1
    digest = md5.md5(padded) # 2
    digest.update(O) # 3
    # P is normally a negative number; reduce it to its unsigned 32-bit
    # form so that it can be packed with the 'L' format.
    digest.update(struct.pack('<L', P & 0xffffffff)) # 4
    digest.update(docid[0]) # 5
    if 4 <= R:
        raise NotImplementedError # 6
    if 3 <= R:
        # 8
        for _ in range(50):
            digest = md5.md5(digest.digest()[:keylen])
    key = digest.digest()[:keylen]
    if R == 2:
        # Algorithm 3.4: the padding string itself is enciphered, not
        # the padded password (they only coincide for an empty password).
        u1 = Arcfour(key).process(PASSWORD_PADDING)
        is_authenticated = (u1 == U)
    elif R == 3:
        # Algorithm 3.5
        digest = md5.md5(PASSWORD_PADDING) # 2
        digest.update(docid[0]) # 3
        x = Arcfour(key).process(digest.digest()[:16]) # 4
        for i in range(1,19+1):
            # key is a string, so each character needs ord() before XOR.
            k = ''.join( chr(ord(c) ^ i) for c in key )
            x = Arcfour(k).process(x)
        u1 = x+x # 32bytes total
        # Only the first 16 bytes of U are significant for revision 3.
        is_authenticated = (u1[:16] == U[:16])
    else:
        raise PDFEncryptionError('unknown revision: %r' % R)
    if not is_authenticated:
        raise PDFPasswordIncorrect
    self.decipher = (lambda objid,genno,data: decrypt_rc4(key, objid, genno, data))
    return
def getobj(self, objid): def getobj(self, objid):
#assert self.xrefs #assert self.xrefs
if objid in self.objs: if objid in self.objs:
genno = 0
obj = self.objs[objid] obj = self.objs[objid]
else: else:
for xref in self.xrefs: for xref in self.xrefs:
@ -400,18 +479,26 @@ class PDFDocument:
except PSEOF: except PSEOF:
pass pass
self.parsed_objs[stream] = objs self.parsed_objs[stream] = objs
genno = 0
obj = objs[stream.dic['N']*2+index] obj = objs[stream.dic['N']*2+index]
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else: else:
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nextobject() # objid (_,objid1) = self.parser.nextobject() # objid
(_,genno1) = self.parser.nextobject() # genno (_,genno) = self.parser.nextobject() # genno
assert objid1 == objid
(_,kwd) = self.parser.nextobject() (_,kwd) = self.parser.nextobject()
if kwd != KEYWORD_OBJ: if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('invalid obj spec: offset=%r' % index) raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
(_,obj) = self.parser.nextobject() (_,obj) = self.parser.nextobject()
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj) print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj self.objs[objid] = obj
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
def get_pages(self, debug=0): def get_pages(self, debug=0):

BIN
samples/dmca.pdf Normal file

Binary file not shown.

BIN
samples/f1040nr.pdf Normal file

Binary file not shown.

BIN
samples/i1040nr.pdf Normal file

Binary file not shown.

BIN
samples/kampo.pdf Normal file

Binary file not shown.

BIN
samples/nlp2004slides.pdf Normal file

Binary file not shown.