Merge pull request #53 from numion/encryption

Support revision 4 and 5 encryption if PyCrypto library is available.
pull/1/head
Yusuke Shinyama 2014-05-28 11:42:43 +09:00
commit 18817d0e38
15 changed files with 316 additions and 59 deletions

View File

@ -44,6 +44,10 @@ class Arcfour(object):
(self.i, self.j) = (i, j)
return r
encrypt = decrypt = process
new = Arcfour
# test
if __name__ == '__main__':
import doctest

View File

@ -6,6 +6,14 @@ try:
import hashlib as md5
except ImportError:
import md5
try:
from Crypto.Cipher import ARC4
from Crypto.Cipher import AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
import arcfour as ARC4
from psparser import PSEOF
from psparser import literal_name
from psparser import LIT, KWD, STRICT
@ -16,7 +24,6 @@ from pdftypes import int_value
from pdftypes import str_value, list_value, dict_value, stream_value
from pdfparser import PDFSyntaxError
from pdfparser import PDFStreamParser
from arcfour import Arcfour
from utils import choplist, nunpack
from utils import decode_text
@ -269,6 +276,217 @@ class PDFXRefStream(PDFBaseXRef):
raise KeyError(objid)
## PDFSecurityHandler
##
class PDFStandardSecurityHandler(object):
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
supported_revisions = (2, 3)
def __init__(self, docid, param, password=''):
self.docid = docid
self.param = param
self.password = password
self.init()
def init(self):
self.init_params()
if self.r not in self.supported_revisions:
raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
self.init_key()
def init_params(self):
self.v = int_value(self.param.get('V', 0))
self.r = int_value(self.param['R'])
self.p = int_value(self.param['P'])
self.o = str_value(self.param['O'])
self.u = str_value(self.param['U'])
self.length = int_value(self.param.get('Length', 40))
def init_key(self):
self.key = self.authenticate(self.password)
if self.key is None:
raise PDFPasswordIncorrect
def is_printable(self):
return bool(self.p & 4)
def is_modifiable(self):
return bool(self.p & 8)
def is_extractable(self):
return bool(self.p & 16)
def compute_u(self, key):
if self.r == 2:
# Algorithm 3.4
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = ''.join(chr(ord(c) ^ i) for c in key)
result = ARC4.new(k).encrypt(result)
result += result # 6
return result
def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(self.o) # 3
hash.update(struct.pack('<l', self.p)) # 4
hash.update(self.docid[0]) # 5
if self.r >= 4:
if not self.encrypt_metadata:
hash.update('\xff\xff\xff\xff')
result = hash.digest()
n = 5
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
return result[:n]
def authenticate(self, password):
key = self.authenticate_user_password(password)
if key is None:
key = self.authenticate_owner_password(password)
return key
def authenticate_user_password(self, password):
key = self.compute_encryption_key(password)
if self.verify_encryption_key(key):
return key
def verify_encryption_key(self, key):
# Algorithm 3.6
u = self.compute_u(key)
if self.r == 2:
return u == self.u
return u[:16] == self.u[:16]
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
key = hash.digest()[:n]
if self.r == 2:
user_password = ARC4.new(key).decrypt(self.o)
else:
user_password = self.o
for i in range(19, -1, -1):
k = ''.join(chr(ord(c) ^ i) for c in key)
user_password = ARC4.new(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
def decrypt(self, objid, genno, data, attrs=None):
return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
supported_revisions = (4,)
def init_params(self):
super(PDFStandardSecurityHandlerV4, self).init_params()
self.length = 128
self.cf = dict_value(self.param.get('CF'))
self.stmf = literal_name(self.param['StmF'])
self.strf = literal_name(self.param['StrF'])
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
if self.stmf != self.strf:
raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
self.cfm = {}
for k, v in self.cf.items():
f = self.get_cfm(literal_name(v['CFM']))
if f is None:
raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
self.cfm[k] = f
self.cfm['Identity'] = self.decrypt_identity
if self.strf not in self.cfm:
raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
def get_cfm(self, name):
if name == 'V2':
return self.decrypt_rc4
elif name == 'AESV2':
return self.decrypt_aes128
def decrypt(self, objid, genno, data, attrs=None, name=None):
if not self.encrypt_metadata and attrs is not None:
t = attrs.get('Type')
if t is not None and literal_name(t) == 'Metadata':
return data
if name is None:
name = self.strf
return self.cfm[name](objid, genno, data)
def decrypt_identity(self, objid, genno, data):
return data
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + "sAlT"
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
supported_revisions = (5,)
def init_params(self):
super(PDFStandardSecurityHandlerV5, self).init_params()
self.length = 256
self.oe = str_value(self.param['OE'])
self.ue = str_value(self.param['UE'])
self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:]
self.u_hash = self.u[:32]
self.u_validation_salt = self.u[32:40]
self.u_key_salt = self.u[40:]
def get_cfm(self, name):
if name == 'AESV3':
return self.decrypt_aes256
def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.oe)
hash = SHA256.new(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.ue)
def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
## PDFDocument
##
class PDFDocument(object):
@ -285,8 +503,15 @@ class PDFDocument(object):
"""
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5
debug = 0
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
@ -343,61 +568,18 @@ class PDFDocument(object):
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
V = int_value(param.get('V', 0))
if not (V == 1 or V == 2):
v = int_value(param.get('V', 0))
factory = self.security_handler_registry.get(v)
if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits)
O = str_value(param['O'])
R = int_value(param['R']) # Revision
if 5 <= R:
raise PDFEncryptionError('Unknown revision: %r' % R)
U = str_value(param['U'])
P = int_value(param['P'])
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R:
# 8
for _ in xrange(50):
hash = md5.md5(hash.digest()[:length//8])
key = hash.digest()[:length//8]
if R == 2:
# Algorithm 3.4
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1, 19+1):
k = ''.join(chr(ord(c) ^ i) for c in key)
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
if R == 2:
is_authenticated = (u1 == U)
else:
is_authenticated = (u1[:16] == U[:16])
if not is_authenticated:
raise PDFPasswordIncorrect
self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES
handler = factory(docid, param, password)
self.decipher = handler.decrypt
self.is_printable = handler.is_printable()
self.is_modifiable = handler.is_modifiable()
self.is_extractable = handler.is_extractable()
self._parser.fallback = False # need to read streams with exact length
return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)]
return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
@ -468,6 +650,9 @@ class PDFDocument(object):
obj = self._getobj_objstm(stream, index, objid)
else:
obj = self._getobj_parse(index, objid)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
break
@ -479,8 +664,6 @@ class PDFDocument(object):
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching:
self._cached_objs[objid] = (obj, genno)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj
def get_outlines(self):

View File

@ -112,9 +112,11 @@ class PDFParser(PSStackParser):
if 'endstream' in line:
i = line.index('endstream')
objlen += i
if self.fallback:
data += line[:i]
break
objlen += len(line)
if self.fallback:
data += line
self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary

View File

@ -224,7 +224,7 @@ class PDFStream(PDFObject):
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
data = self.decipher(self.objid, self.genno, data, self.attrs)
filters = self.get_filters()
if not filters:
self.data = data

View File

@ -52,11 +52,13 @@ all: htmls texts xmls
test:
$(MAKE) all CMP=cmp
cd encryption && $(MAKE) test
clean:
-$(RM) $(HTMLS)
-$(RM) $(TEXTS)
-$(RM) $(XMLS)
cd encryption && $(MAKE) clean
htmls: $(HTMLS)
texts: $(TEXTS)

View File

@ -47,3 +47,14 @@ nonfree/nlp2004slides.pdf:
nonfree/naacl06-shinyama.pdf:
Yusuke Shinyama and Satoshi Sekine
"Preemptive Information Extraction using Unrestircted Relation Discovery"
--
Files in the encryption folder have been generated with cpdf 1.7 [http://www.coherentpdf.com/]
from the base.pdf file generated with LibreOffice 4.1.1.2 as follows:
cpdf -encrypt 40bit foo baz base.pdf -o rc4-40.pdf
cpdf -encrypt 128bit foo baz base.pdf -o rc4-128.pdf
cpdf -encrypt AES foo baz base.pdf -o aes-128.pdf
cpdf -encrypt AES foo baz base.pdf -no-encrypt-metadata -o aes-128-m.pdf
cpdf -encrypt AES256 foo baz base.pdf -o aes-256.pdf
cpdf -encrypt AES256 foo baz base.pdf -no-encrypt-metadata -o aes-256-m.pdf

View File

@ -0,0 +1,32 @@
# GNUMakefile for test
RM=rm -f
CMP=:
PYTHON=python2
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
XMLS= \
rc4-40.xml \
rc4-128.xml \
aes-128.xml \
aes-128-m.xml \
aes-256.xml \
aes-256-m.xml \
all: xmls
test:
$(MAKE) all CMP=cmp
clean:
-$(RM) $(XMLS)
xmls: $(XMLS)
.SUFFIXES: .pdf .xml
.pdf.xml:
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
$(CMP) $@ base.xml
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
$(CMP) $@ base.xml

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
samples/encryption/base.pdf Normal file

Binary file not shown.

View File

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,595.000,842.000" rotate="0">
<textbox id="0" bbox="56.800,771.508,90.688,787.264">
<textline bbox="56.800,771.508,90.688,787.264">
<text font="BAAAAA+TimesNewRomanPSMT" bbox="56.800,771.508,63.472,787.264" size="15.756">S</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="63.484,771.508,68.800,787.264" size="15.756">e</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="68.788,771.508,74.104,787.264" size="15.756">c</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="74.092,771.508,78.088,787.264" size="15.756">r</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="78.088,771.508,83.404,787.264" size="15.756">e</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="83.392,771.508,86.716,787.264" size="15.756">t</text>
<text font="BAAAAA+TimesNewRomanPSMT" bbox="86.692,771.508,90.688,787.264" size="15.756">!</text>
<text>
</text>
</textline>
</textbox>
<figure name="Tr4" bbox="-9.000,420.000,595.000,840.100">
</figure>
<layout>
<textbox id="0" bbox="56.800,771.508,90.688,787.264" />
</layout>
</page>
</pages>

Binary file not shown.

Binary file not shown.