Implement revision 4 and 5 encryption handler.
parent
29ebc2d618
commit
a4997d6f10
|
@ -43,6 +43,10 @@ class Arcfour(object):
|
|||
r += chr(ord(c) ^ k)
|
||||
(self.i, self.j) = (i, j)
|
||||
return r
|
||||
|
||||
encrypt = decrypt = process
|
||||
|
||||
new = Arcfour
|
||||
|
||||
# test
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -6,6 +6,14 @@ try:
|
|||
import hashlib as md5
|
||||
except ImportError:
|
||||
import md5
|
||||
try:
|
||||
from Crypto.Cipher import ARC4
|
||||
from Crypto.Cipher import AES
|
||||
from Crypto.Hash import SHA256
|
||||
except ImportError:
|
||||
AES = SHA256 = None
|
||||
import arcfour as ARC4
|
||||
|
||||
from psparser import PSEOF
|
||||
from psparser import literal_name
|
||||
from psparser import LIT, KWD, STRICT
|
||||
|
@ -16,7 +24,6 @@ from pdftypes import int_value
|
|||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
from pdfparser import PDFSyntaxError
|
||||
from pdfparser import PDFStreamParser
|
||||
from arcfour import Arcfour
|
||||
from utils import choplist, nunpack
|
||||
from utils import decode_text
|
||||
|
||||
|
@ -269,6 +276,217 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
raise KeyError(objid)
|
||||
|
||||
|
||||
## PDFSecurityHandler
|
||||
##
|
||||
class PDFStandardSecurityHandler(object):
|
||||
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
supported_revisions = (2, 3)
|
||||
|
||||
def __init__(self, docid, param, password=''):
|
||||
self.docid = docid
|
||||
self.param = param
|
||||
self.password = password
|
||||
self.init()
|
||||
|
||||
def init(self):
|
||||
self.init_params()
|
||||
if self.r not in self.supported_revisions:
|
||||
raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
|
||||
self.init_key()
|
||||
|
||||
def init_params(self):
|
||||
self.v = int_value(self.param.get('V', 0))
|
||||
self.r = int_value(self.param['R'])
|
||||
self.p = int_value(self.param['P'])
|
||||
self.o = str_value(self.param['O'])
|
||||
self.u = str_value(self.param['U'])
|
||||
self.length = int_value(self.param.get('Length', 40))
|
||||
|
||||
def init_key(self):
|
||||
self.key = self.authenticate(self.password)
|
||||
if self.key is None:
|
||||
raise PDFPasswordIncorrect
|
||||
|
||||
def is_printable(self):
|
||||
return bool(self.p & 4)
|
||||
|
||||
def is_modifiable(self):
|
||||
return bool(self.p & 8)
|
||||
|
||||
def is_extractable(self):
|
||||
return bool(self.p & 16)
|
||||
|
||||
def compute_u(self, key):
|
||||
if self.r == 2:
|
||||
# Algorithm 3.4
|
||||
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
|
||||
else:
|
||||
# Algorithm 3.5
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(self.docid[0]) # 3
|
||||
result = ARC4.new(key).encrypt(hash.digest()) # 4
|
||||
for i in range(1, 20): # 5
|
||||
k = ''.join(chr(ord(c) ^ i) for c in key)
|
||||
result = ARC4.new(k).encrypt(result)
|
||||
result += result # 6
|
||||
return result
|
||||
|
||||
def compute_encryption_key(self, password):
|
||||
# Algorithm 3.2
|
||||
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash.update(self.o) # 3
|
||||
hash.update(struct.pack('<l', self.p)) # 4
|
||||
hash.update(self.docid[0]) # 5
|
||||
if self.r >= 4:
|
||||
if not self.encrypt_metadata:
|
||||
hash.update('\xff\xff\xff\xff')
|
||||
result = hash.digest()
|
||||
n = 5
|
||||
if self.r >= 3:
|
||||
n = self.length // 8
|
||||
for _ in range(50):
|
||||
result = md5.md5(result[:n]).digest()
|
||||
return result[:n]
|
||||
|
||||
def authenticate(self, password):
|
||||
key = self.authenticate_user_password(password)
|
||||
if key is None:
|
||||
key = self.authenticate_owner_password(password)
|
||||
return key
|
||||
|
||||
def authenticate_user_password(self, password):
|
||||
key = self.compute_encryption_key(password)
|
||||
if self.verify_encryption_key(key):
|
||||
return key
|
||||
|
||||
def verify_encryption_key(self, key):
|
||||
# Algorithm 3.6
|
||||
u = self.compute_u(key)
|
||||
if self.r == 2:
|
||||
return u == self.u
|
||||
return u[:16] == self.u[:16]
|
||||
|
||||
def authenticate_owner_password(self, password):
|
||||
# Algorithm 3.7
|
||||
password = (password + self.PASSWORD_PADDING)[:32]
|
||||
hash = md5.md5(password)
|
||||
if self.r >= 3:
|
||||
for _ in range(50):
|
||||
hash = md5.md5(hash.digest())
|
||||
n = 5
|
||||
if self.r >= 3:
|
||||
n = self.length // 8
|
||||
key = hash.digest()[:n]
|
||||
if self.r == 2:
|
||||
user_password = ARC4.new(key).decrypt(self.o)
|
||||
else:
|
||||
user_password = self.o
|
||||
for i in range(19, -1, -1):
|
||||
k = ''.join(chr(ord(c) ^ i) for c in key)
|
||||
user_password = ARC4.new(k).decrypt(user_password)
|
||||
return self.authenticate_user_password(user_password)
|
||||
|
||||
def decrypt(self, objid, genno, data, attrs=None):
|
||||
return self.decrypt_rc4(objid, genno, data)
|
||||
|
||||
def decrypt_rc4(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return ARC4.new(key).decrypt(data)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||
|
||||
supported_revisions = (4,)
|
||||
|
||||
def init_params(self):
|
||||
super(PDFStandardSecurityHandlerV4, self).init_params()
|
||||
self.length = 128
|
||||
self.cf = dict_value(self.param.get('CF'))
|
||||
self.stmf = literal_name(self.param['StmF'])
|
||||
self.strf = literal_name(self.param['StrF'])
|
||||
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
|
||||
if self.stmf != self.strf:
|
||||
raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
|
||||
self.cfm = {}
|
||||
for k, v in self.cf.items():
|
||||
f = self.get_cfm(literal_name(v['CFM']))
|
||||
if f is None:
|
||||
raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
|
||||
self.cfm[k] = f
|
||||
self.cfm['Identity'] = self.decrypt_identity
|
||||
if self.strf not in self.cfm:
|
||||
raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
|
||||
|
||||
def get_cfm(self, name):
|
||||
if name == 'V2':
|
||||
return self.decrypt_rc4
|
||||
elif name == 'AESV2':
|
||||
return self.decrypt_aes128
|
||||
|
||||
def decrypt(self, objid, genno, data, attrs=None, name=None):
|
||||
if not self.encrypt_metadata and attrs is not None:
|
||||
t = attrs.get('Type')
|
||||
if t is not None and literal_name(t) == 'Metadata':
|
||||
return data
|
||||
if name is None:
|
||||
name = self.strf
|
||||
return self.cfm[name](objid, genno, data)
|
||||
|
||||
def decrypt_identity(self, objid, genno, data):
|
||||
return data
|
||||
|
||||
def decrypt_aes128(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + "sAlT"
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||
|
||||
supported_revisions = (5,)
|
||||
|
||||
def init_params(self):
|
||||
super(PDFStandardSecurityHandlerV5, self).init_params()
|
||||
self.length = 256
|
||||
self.oe = str_value(self.param['OE'])
|
||||
self.ue = str_value(self.param['UE'])
|
||||
self.o_hash = self.o[:32]
|
||||
self.o_validation_salt = self.o[32:40]
|
||||
self.o_key_salt = self.o[40:]
|
||||
self.u_hash = self.u[:32]
|
||||
self.u_validation_salt = self.u[32:40]
|
||||
self.u_key_salt = self.u[40:]
|
||||
|
||||
def get_cfm(self, name):
|
||||
if name == 'AESV3':
|
||||
return self.decrypt_aes256
|
||||
|
||||
def authenticate(self, password):
|
||||
password = password.encode('utf-8')[:127]
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.o_validation_salt)
|
||||
hash.update(self.u)
|
||||
if hash.digest() == self.o_hash:
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.o_key_salt)
|
||||
hash.update(self.u)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.oe)
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.u_validation_salt)
|
||||
if hash.digest() == self.u_hash:
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.u_key_salt)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.ue)
|
||||
|
||||
def decrypt_aes256(self, objid, genno, data):
|
||||
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
||||
|
||||
|
||||
## PDFDocument
|
||||
##
|
||||
class PDFDocument(object):
|
||||
|
@ -285,8 +503,15 @@ class PDFDocument(object):
|
|||
|
||||
"""
|
||||
|
||||
security_handler_registry = {
|
||||
1: PDFStandardSecurityHandler,
|
||||
2: PDFStandardSecurityHandler,
|
||||
}
|
||||
if AES is not None:
|
||||
security_handler_registry[4] = PDFStandardSecurityHandlerV4
|
||||
if SHA256 is not None:
|
||||
security_handler_registry[5] = PDFStandardSecurityHandlerV5
|
||||
debug = 0
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||
"Set the document to use a given PDFParser object."
|
||||
|
@ -343,61 +568,18 @@ class PDFDocument(object):
|
|||
(docid, param) = self.encryption
|
||||
if literal_name(param.get('Filter')) != 'Standard':
|
||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||
V = int_value(param.get('V', 0))
|
||||
if not (V == 1 or V == 2):
|
||||
v = int_value(param.get('V', 0))
|
||||
factory = self.security_handler_registry.get(v)
|
||||
if factory is None:
|
||||
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
|
||||
length = int_value(param.get('Length', 40)) # Key length (bits)
|
||||
O = str_value(param['O'])
|
||||
R = int_value(param['R']) # Revision
|
||||
if 5 <= R:
|
||||
raise PDFEncryptionError('Unknown revision: %r' % R)
|
||||
U = str_value(param['U'])
|
||||
P = int_value(param['P'])
|
||||
self.is_printable = bool(P & 4)
|
||||
self.is_modifiable = bool(P & 8)
|
||||
self.is_extractable = bool(P & 16)
|
||||
# Algorithm 3.2
|
||||
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash.update(O) # 3
|
||||
hash.update(struct.pack('<l', P)) # 4
|
||||
hash.update(docid[0]) # 5
|
||||
if 4 <= R:
|
||||
# 6
|
||||
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
||||
if 3 <= R:
|
||||
# 8
|
||||
for _ in xrange(50):
|
||||
hash = md5.md5(hash.digest()[:length//8])
|
||||
key = hash.digest()[:length//8]
|
||||
if R == 2:
|
||||
# Algorithm 3.4
|
||||
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
|
||||
elif R == 3:
|
||||
# Algorithm 3.5
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(docid[0]) # 3
|
||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||
for i in xrange(1, 19+1):
|
||||
k = ''.join(chr(ord(c) ^ i) for c in key)
|
||||
x = Arcfour(k).process(x)
|
||||
u1 = x+x # 32bytes total
|
||||
if R == 2:
|
||||
is_authenticated = (u1 == U)
|
||||
else:
|
||||
is_authenticated = (u1[:16] == U[:16])
|
||||
if not is_authenticated:
|
||||
raise PDFPasswordIncorrect
|
||||
self.decrypt_key = key
|
||||
self.decipher = self.decrypt_rc4 # XXX may be AES
|
||||
handler = factory(docid, param, password)
|
||||
self.decipher = handler.decrypt
|
||||
self.is_printable = handler.is_printable()
|
||||
self.is_modifiable = handler.is_modifiable()
|
||||
self.is_extractable = handler.is_extractable()
|
||||
self._parser.fallback = False # need to read streams with exact length
|
||||
return
|
||||
|
||||
def decrypt_rc4(self, objid, genno, data):
|
||||
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return Arcfour(key).process(data)
|
||||
|
||||
def _getobj_objstm(self, stream, index, objid):
|
||||
if stream.objid in self._parsed_objs:
|
||||
(objs, n) = self._parsed_objs[stream.objid]
|
||||
|
@ -468,6 +650,9 @@ class PDFDocument(object):
|
|||
obj = self._getobj_objstm(stream, index, objid)
|
||||
else:
|
||||
obj = self._getobj_parse(index, objid)
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
obj.set_objid(objid, genno)
|
||||
break
|
||||
|
@ -479,8 +664,6 @@ class PDFDocument(object):
|
|||
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||
if self.caching:
|
||||
self._cached_objs[objid] = (obj, genno)
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
return obj
|
||||
|
||||
def get_outlines(self):
|
||||
|
|
|
@ -112,10 +112,12 @@ class PDFParser(PSStackParser):
|
|||
if 'endstream' in line:
|
||||
i = line.index('endstream')
|
||||
objlen += i
|
||||
data += line[:i]
|
||||
if self.fallback:
|
||||
data += line[:i]
|
||||
break
|
||||
objlen += len(line)
|
||||
data += line
|
||||
if self.fallback:
|
||||
data += line
|
||||
self.seek(pos+objlen)
|
||||
# XXX limit objlen not to exceed object boundary
|
||||
if 2 <= self.debug:
|
||||
|
|
|
@ -224,7 +224,7 @@ class PDFStream(PDFObject):
|
|||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
data = self.decipher(self.objid, self.genno, data)
|
||||
data = self.decipher(self.objid, self.genno, data, self.attrs)
|
||||
filters = self.get_filters()
|
||||
if not filters:
|
||||
self.data = data
|
||||
|
|
|
@ -52,11 +52,13 @@ all: htmls texts xmls
|
|||
|
||||
test:
|
||||
$(MAKE) all CMP=cmp
|
||||
cd encryption && $(MAKE) test
|
||||
|
||||
clean:
|
||||
-$(RM) $(HTMLS)
|
||||
-$(RM) $(TEXTS)
|
||||
-$(RM) $(XMLS)
|
||||
cd encryption && $(MAKE) clean
|
||||
|
||||
htmls: $(HTMLS)
|
||||
texts: $(TEXTS)
|
||||
|
|
|
@ -47,3 +47,14 @@ nonfree/nlp2004slides.pdf:
|
|||
nonfree/naacl06-shinyama.pdf:
|
||||
Yusuke Shinyama and Satoshi Sekine
|
||||
"Preemptive Information Extraction using Unrestircted Relation Discovery"
|
||||
|
||||
--
|
||||
Files in the encryption folder have been generated with cpdf 1.7 [http://www.coherentpdf.com/]
|
||||
from the base.pdf file generated with LibreOffice 4.1.1.2 as follows:
|
||||
|
||||
cpdf -encrypt 40bit foo baz base.pdf -o rc4-40.pdf
|
||||
cpdf -encrypt 128bit foo baz base.pdf -o rc4-128.pdf
|
||||
cpdf -encrypt AES foo baz base.pdf -o aes-128.pdf
|
||||
cpdf -encrypt AES foo baz base.pdf -no-encrypt-metadata -o aes-128-m.pdf
|
||||
cpdf -encrypt AES256 foo baz base.pdf -o aes-256.pdf
|
||||
cpdf -encrypt AES256 foo baz base.pdf -no-encrypt-metadata -o aes-256-m.pdf
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
RM=rm -f
|
||||
CMP=:
|
||||
PYTHON=python2
|
||||
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
|
||||
|
||||
XMLS= \
|
||||
rc4-40.xml \
|
||||
rc4-128.xml \
|
||||
aes-128.xml \
|
||||
aes-128-m.xml \
|
||||
aes-256.xml \
|
||||
aes-256-m.xml \
|
||||
|
||||
all: xmls
|
||||
|
||||
test:
|
||||
$(MAKE) all CMP=cmp
|
||||
|
||||
clean:
|
||||
-$(RM) $(XMLS)
|
||||
|
||||
xmls: $(XMLS)
|
||||
|
||||
.SUFFIXES: .pdf .xml
|
||||
|
||||
.pdf.xml:
|
||||
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
|
||||
$(CMP) $@ base.xml
|
||||
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
|
||||
$(CMP) $@ base.xml
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,23 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<pages>
|
||||
<page id="1" bbox="0.000,0.000,595.000,842.000" rotate="0">
|
||||
<textbox id="0" bbox="56.800,771.508,90.688,787.264">
|
||||
<textline bbox="56.800,771.508,90.688,787.264">
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="56.800,771.508,63.472,787.264" size="15.756">S</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="63.484,771.508,68.800,787.264" size="15.756">e</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="68.788,771.508,74.104,787.264" size="15.756">c</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="74.092,771.508,78.088,787.264" size="15.756">r</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="78.088,771.508,83.404,787.264" size="15.756">e</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="83.392,771.508,86.716,787.264" size="15.756">t</text>
|
||||
<text font="BAAAAA+TimesNewRomanPSMT" bbox="86.692,771.508,90.688,787.264" size="15.756">!</text>
|
||||
<text>
|
||||
</text>
|
||||
</textline>
|
||||
</textbox>
|
||||
<figure name="Tr4" bbox="-9.000,420.000,595.000,840.100">
|
||||
</figure>
|
||||
<layout>
|
||||
<textbox id="0" bbox="56.800,771.508,90.688,787.264" />
|
||||
</layout>
|
||||
</page>
|
||||
</pages>
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue