Implement revision 4 and 5 encryption handler.

pull/1/head
numion 2014-05-19 16:27:43 +02:00
parent 29ebc2d618
commit a4997d6f10
15 changed files with 316 additions and 59 deletions

View File

@ -43,6 +43,10 @@ class Arcfour(object):
r += chr(ord(c) ^ k) r += chr(ord(c) ^ k)
(self.i, self.j) = (i, j) (self.i, self.j) = (i, j)
return r return r
encrypt = decrypt = process
new = Arcfour
# test # test
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -6,6 +6,14 @@ try:
import hashlib as md5 import hashlib as md5
except ImportError: except ImportError:
import md5 import md5
try:
from Crypto.Cipher import ARC4
from Crypto.Cipher import AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
import arcfour as ARC4
from psparser import PSEOF from psparser import PSEOF
from psparser import literal_name from psparser import literal_name
from psparser import LIT, KWD, STRICT from psparser import LIT, KWD, STRICT
@ -16,7 +24,6 @@ from pdftypes import int_value
from pdftypes import str_value, list_value, dict_value, stream_value from pdftypes import str_value, list_value, dict_value, stream_value
from pdfparser import PDFSyntaxError from pdfparser import PDFSyntaxError
from pdfparser import PDFStreamParser from pdfparser import PDFStreamParser
from arcfour import Arcfour
from utils import choplist, nunpack from utils import choplist, nunpack
from utils import decode_text from utils import decode_text
@ -269,6 +276,217 @@ class PDFXRefStream(PDFBaseXRef):
raise KeyError(objid) raise KeyError(objid)
## PDFSecurityHandler
##
class PDFStandardSecurityHandler(object):
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
supported_revisions = (2, 3)
def __init__(self, docid, param, password=''):
self.docid = docid
self.param = param
self.password = password
self.init()
def init(self):
self.init_params()
if self.r not in self.supported_revisions:
raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
self.init_key()
def init_params(self):
self.v = int_value(self.param.get('V', 0))
self.r = int_value(self.param['R'])
self.p = int_value(self.param['P'])
self.o = str_value(self.param['O'])
self.u = str_value(self.param['U'])
self.length = int_value(self.param.get('Length', 40))
def init_key(self):
self.key = self.authenticate(self.password)
if self.key is None:
raise PDFPasswordIncorrect
def is_printable(self):
return bool(self.p & 4)
def is_modifiable(self):
return bool(self.p & 8)
def is_extractable(self):
return bool(self.p & 16)
def compute_u(self, key):
if self.r == 2:
# Algorithm 3.4
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = ''.join(chr(ord(c) ^ i) for c in key)
result = ARC4.new(k).encrypt(result)
result += result # 6
return result
def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(self.o) # 3
hash.update(struct.pack('<l', self.p)) # 4
hash.update(self.docid[0]) # 5
if self.r >= 4:
if not self.encrypt_metadata:
hash.update('\xff\xff\xff\xff')
result = hash.digest()
n = 5
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
return result[:n]
def authenticate(self, password):
key = self.authenticate_user_password(password)
if key is None:
key = self.authenticate_owner_password(password)
return key
def authenticate_user_password(self, password):
key = self.compute_encryption_key(password)
if self.verify_encryption_key(key):
return key
def verify_encryption_key(self, key):
# Algorithm 3.6
u = self.compute_u(key)
if self.r == 2:
return u == self.u
return u[:16] == self.u[:16]
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
key = hash.digest()[:n]
if self.r == 2:
user_password = ARC4.new(key).decrypt(self.o)
else:
user_password = self.o
for i in range(19, -1, -1):
k = ''.join(chr(ord(c) ^ i) for c in key)
user_password = ARC4.new(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
def decrypt(self, objid, genno, data, attrs=None):
return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
supported_revisions = (4,)
def init_params(self):
super(PDFStandardSecurityHandlerV4, self).init_params()
self.length = 128
self.cf = dict_value(self.param.get('CF'))
self.stmf = literal_name(self.param['StmF'])
self.strf = literal_name(self.param['StrF'])
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
if self.stmf != self.strf:
raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
self.cfm = {}
for k, v in self.cf.items():
f = self.get_cfm(literal_name(v['CFM']))
if f is None:
raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
self.cfm[k] = f
self.cfm['Identity'] = self.decrypt_identity
if self.strf not in self.cfm:
raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
def get_cfm(self, name):
if name == 'V2':
return self.decrypt_rc4
elif name == 'AESV2':
return self.decrypt_aes128
def decrypt(self, objid, genno, data, attrs=None, name=None):
if not self.encrypt_metadata and attrs is not None:
t = attrs.get('Type')
if t is not None and literal_name(t) == 'Metadata':
return data
if name is None:
name = self.strf
return self.cfm[name](objid, genno, data)
def decrypt_identity(self, objid, genno, data):
return data
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + "sAlT"
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
supported_revisions = (5,)
def init_params(self):
super(PDFStandardSecurityHandlerV5, self).init_params()
self.length = 256
self.oe = str_value(self.param['OE'])
self.ue = str_value(self.param['UE'])
self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:]
self.u_hash = self.u[:32]
self.u_validation_salt = self.u[32:40]
self.u_key_salt = self.u[40:]
def get_cfm(self, name):
if name == 'AESV3':
return self.decrypt_aes256
def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.oe)
hash = SHA256.new(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.ue)
def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
## PDFDocument ## PDFDocument
## ##
class PDFDocument(object): class PDFDocument(object):
@ -285,8 +503,15 @@ class PDFDocument(object):
""" """
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5
debug = 0 debug = 0
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def __init__(self, parser, password='', caching=True, fallback=True): def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object." "Set the document to use a given PDFParser object."
@ -343,61 +568,18 @@ class PDFDocument(object):
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard': if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param) raise PDFEncryptionError('Unknown filter: param=%r' % param)
V = int_value(param.get('V', 0)) v = int_value(param.get('V', 0))
if not (V == 1 or V == 2): factory = self.security_handler_registry.get(v)
if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param) raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits) handler = factory(docid, param, password)
O = str_value(param['O']) self.decipher = handler.decrypt
R = int_value(param['R']) # Revision self.is_printable = handler.is_printable()
if 5 <= R: self.is_modifiable = handler.is_modifiable()
raise PDFEncryptionError('Unknown revision: %r' % R) self.is_extractable = handler.is_extractable()
U = str_value(param['U']) self._parser.fallback = False # need to read streams with exact length
P = int_value(param['P'])
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R:
# 8
for _ in xrange(50):
hash = md5.md5(hash.digest()[:length//8])
key = hash.digest()[:length//8]
if R == 2:
# Algorithm 3.4
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1, 19+1):
k = ''.join(chr(ord(c) ^ i) for c in key)
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
if R == 2:
is_authenticated = (u1 == U)
else:
is_authenticated = (u1[:16] == U[:16])
if not is_authenticated:
raise PDFPasswordIncorrect
self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES
return return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid): def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs: if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid] (objs, n) = self._parsed_objs[stream.objid]
@ -468,6 +650,9 @@ class PDFDocument(object):
obj = self._getobj_objstm(stream, index, objid) obj = self._getobj_objstm(stream, index, objid)
else: else:
obj = self._getobj_parse(index, objid) obj = self._getobj_parse(index, objid)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, genno) obj.set_objid(objid, genno)
break break
@ -479,8 +664,6 @@ class PDFDocument(object):
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching: if self.caching:
self._cached_objs[objid] = (obj, genno) self._cached_objs[objid] = (obj, genno)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
def get_outlines(self): def get_outlines(self):

View File

@ -112,10 +112,12 @@ class PDFParser(PSStackParser):
if 'endstream' in line: if 'endstream' in line:
i = line.index('endstream') i = line.index('endstream')
objlen += i objlen += i
data += line[:i] if self.fallback:
data += line[:i]
break break
objlen += len(line) objlen += len(line)
data += line if self.fallback:
data += line
self.seek(pos+objlen) self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary # XXX limit objlen not to exceed object boundary
if 2 <= self.debug: if 2 <= self.debug:

View File

@ -224,7 +224,7 @@ class PDFStream(PDFObject):
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data, self.attrs)
filters = self.get_filters() filters = self.get_filters()
if not filters: if not filters:
self.data = data self.data = data

View File

@ -52,11 +52,13 @@ all: htmls texts xmls
test: test:
$(MAKE) all CMP=cmp $(MAKE) all CMP=cmp
cd encryption && $(MAKE) test
clean: clean:
-$(RM) $(HTMLS) -$(RM) $(HTMLS)
-$(RM) $(TEXTS) -$(RM) $(TEXTS)
-$(RM) $(XMLS) -$(RM) $(XMLS)
cd encryption && $(MAKE) clean
htmls: $(HTMLS) htmls: $(HTMLS)
texts: $(TEXTS) texts: $(TEXTS)

View File

@ -47,3 +47,14 @@ nonfree/nlp2004slides.pdf:
nonfree/naacl06-shinyama.pdf: nonfree/naacl06-shinyama.pdf:
Yusuke Shinyama and Satoshi Sekine Yusuke Shinyama and Satoshi Sekine
"Preemptive Information Extraction using Unrestircted Relation Discovery" "Preemptive Information Extraction using Unrestircted Relation Discovery"
--
Files in the encryption folder have been generated with cpdf 1.7 [http://www.coherentpdf.com/]
from the base.pdf file generated with LibreOffice 4.1.1.2 as follows:
cpdf -encrypt 40bit foo baz base.pdf -o rc4-40.pdf
cpdf -encrypt 128bit foo baz base.pdf -o rc4-128.pdf
cpdf -encrypt AES foo baz base.pdf -o aes-128.pdf
cpdf -encrypt AES foo baz base.pdf -no-encrypt-metadata -o aes-128-m.pdf
cpdf -encrypt AES256 foo baz base.pdf -o aes-256.pdf
cpdf -encrypt AES256 foo baz base.pdf -no-encrypt-metadata -o aes-256-m.pdf

View File

@ -0,0 +1,32 @@
# GNUMakefile for test
RM=rm -f
CMP=:
PYTHON=python2
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
XMLS= \
rc4-40.xml \
rc4-128.xml \
aes-128.xml \
aes-128-m.xml \
aes-256.xml \
aes-256-m.xml \
all: xmls
test:
$(MAKE) all CMP=cmp
clean:
-$(RM) $(XMLS)
xmls: $(XMLS)
.SUFFIXES: .pdf .xml
.pdf.xml:
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
$(CMP) $@ base.xml
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
$(CMP) $@ base.xml

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
samples/encryption/base.pdf Normal file

Binary file not shown.

View File

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,595.000,842.000" rotate="0">
<textbox id="0" bbox="56.800,771.508,90.688,787.264">
<textline bbox="56.800,771.508,90.688,787.264">
<text font="BAAAAA+TimesNewRomanPSMT" bbox="56.800,771.508,63.472,787.264" size="15.756">S</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="63.484,771.508,68.800,787.264" size="15.756">e</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="68.788,771.508,74.104,787.264" size="15.756">c</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="74.092,771.508,78.088,787.264" size="15.756">r</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="78.088,771.508,83.404,787.264" size="15.756">e</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="83.392,771.508,86.716,787.264" size="15.756">t</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="86.692,771.508,90.688,787.264" size="15.756">!</text>
<text>
</text>
</textline>
</textbox>
<figure name="Tr4" bbox="-9.000,420.000,595.000,840.100">
</figure>
<layout>
<textbox id="0" bbox="56.800,771.508,90.688,787.264" />
</layout>
</page>
</pages>

Binary file not shown.

Binary file not shown.