diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index 7adcf60..d16cf89 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -43,6 +43,10 @@ class Arcfour(object): r += chr(ord(c) ^ k) (self.i, self.j) = (i, j) return r + + encrypt = decrypt = process + +new = Arcfour # test if __name__ == '__main__': diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 58b2e9c..73c4002 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -6,6 +6,14 @@ try: import hashlib as md5 except ImportError: import md5 +try: + from Crypto.Cipher import ARC4 + from Crypto.Cipher import AES + from Crypto.Hash import SHA256 +except ImportError: + AES = SHA256 = None + import arcfour as ARC4 + from psparser import PSEOF from psparser import literal_name from psparser import LIT, KWD, STRICT @@ -16,7 +24,6 @@ from pdftypes import int_value from pdftypes import str_value, list_value, dict_value, stream_value from pdfparser import PDFSyntaxError from pdfparser import PDFStreamParser -from arcfour import Arcfour from utils import choplist, nunpack from utils import decode_text @@ -269,6 +276,217 @@ class PDFXRefStream(PDFBaseXRef): raise KeyError(objid) +## PDFSecurityHandler +## +class PDFStandardSecurityHandler(object): + + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' + supported_revisions = (2, 3) + + def __init__(self, docid, param, password=''): + self.docid = docid + self.param = param + self.password = password + self.init() + + def init(self): + self.init_params() + if self.r not in self.supported_revisions: + raise PDFEncryptionError('Unsupported revision: param=%r' % self.param) + self.init_key() + + def init_params(self): + self.v = int_value(self.param.get('V', 0)) + self.r = int_value(self.param['R']) + self.p = int_value(self.param['P']) + self.o = str_value(self.param['O']) + self.u = str_value(self.param['U']) + self.length = int_value(self.param.get('Length', 40)) + + def init_key(self): + self.key = self.authenticate(self.password) + if self.key is None: + raise PDFPasswordIncorrect + + def is_printable(self): + return bool(self.p & 4) + + def is_modifiable(self): + return bool(self.p & 8) + + def is_extractable(self): + return bool(self.p & 16) + + def compute_u(self, key): + if self.r == 2: + # Algorithm 3.4 + return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2 + else: + # Algorithm 3.5 + hash = md5.md5(self.PASSWORD_PADDING) # 2 + hash.update(self.docid[0]) # 3 + result = ARC4.new(key).encrypt(hash.digest()) # 4 + for i in range(1, 20): # 5 + k = ''.join(chr(ord(c) ^ i) for c in key) + result = ARC4.new(k).encrypt(result) + result += result # 6 + return result + + def compute_encryption_key(self, password): + # Algorithm 3.2 + password = (password + self.PASSWORD_PADDING)[:32] # 1 + hash = md5.md5(password) # 2 + hash.update(self.o) # 3 + hash.update(struct.pack('= 4: + if not self.encrypt_metadata: + hash.update('\xff\xff\xff\xff') + result = hash.digest() + n = 5 + if self.r >= 3: + n = self.length // 8 + for _ in range(50): + result = md5.md5(result[:n]).digest() + return result[:n] + + def authenticate(self, password): + key = self.authenticate_user_password(password) + if key is None: + key = self.authenticate_owner_password(password) + return key + + def authenticate_user_password(self, password): + key = self.compute_encryption_key(password) + if self.verify_encryption_key(key): + return key + + def verify_encryption_key(self, key): + # Algorithm 3.6 + u = self.compute_u(key) + if self.r == 2: + return u == self.u + return u[:16] == self.u[:16] + + def authenticate_owner_password(self, password): + # Algorithm 3.7 + password = (password + self.PASSWORD_PADDING)[:32] + hash = md5.md5(password) + if self.r >= 3: + for _ in range(50): + hash = md5.md5(hash.digest()) + n = 5 + if self.r >= 3: + n = self.length // 8 + key = hash.digest()[:n] + if self.r == 2: + user_password = ARC4.new(key).decrypt(self.o) + else: + user_password = self.o + for i in range(19, -1, -1): + k = ''.join(chr(ord(c) ^ i) for c in key) + user_password = ARC4.new(k).decrypt(user_password) + return self.authenticate_user_password(user_password) + + def decrypt(self, objid, genno, data, attrs=None): + return self.decrypt_rc4(objid, genno, data) + + def decrypt_rc4(self, objid, genno, data): + key = self.key + struct.pack('>sys.stderr, 'register: objid=%r: %r' % (objid, obj) if self.caching: self._cached_objs[objid] = (obj, genno) - if self.decipher: - obj = decipher_all(self.decipher, objid, genno, obj) return obj def get_outlines(self): diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index f1b6c7f..32d1220 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -112,10 +112,12 @@ class PDFParser(PSStackParser): if 'endstream' in line: i = line.index('endstream') objlen += i - data += line[:i] + if self.fallback: + data += line[:i] break objlen += len(line) - data += line + if self.fallback: + data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary if 2 <= self.debug: diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 6851fbf..03c4a5b 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -224,7 +224,7 @@ class PDFStream(PDFObject): data = self.rawdata if self.decipher: # Handle encryption - data = self.decipher(self.objid, self.genno, data) + data = self.decipher(self.objid, self.genno, data, self.attrs) filters = self.get_filters() if not filters: self.data = data diff --git a/samples/Makefile b/samples/Makefile index 8a1befa..7aa3aa0 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -52,11 +52,13 @@ all: htmls texts xmls test: $(MAKE) all CMP=cmp + cd encryption && $(MAKE) test clean: -$(RM) $(HTMLS) -$(RM) $(TEXTS) -$(RM) $(XMLS) + cd encryption && $(MAKE) clean htmls: $(HTMLS) texts: $(TEXTS) diff --git a/samples/README b/samples/README index bece0db..e0fa424 100644 --- a/samples/README +++ b/samples/README @@ -47,3 +47,14 @@ nonfree/nlp2004slides.pdf: nonfree/naacl06-shinyama.pdf: Yusuke Shinyama and Satoshi Sekine "Preemptive Information Extraction using Unrestircted Relation Discovery" + +-- +Files in the encryption folder have been generated with cpdf 1.7 [http://www.coherentpdf.com/] +from the base.pdf file generated with LibreOffice 4.1.1.2 as follows: + +cpdf -encrypt 40bit foo baz base.pdf -o rc4-40.pdf +cpdf -encrypt 128bit foo baz base.pdf -o rc4-128.pdf +cpdf -encrypt AES foo baz base.pdf -o aes-128.pdf +cpdf -encrypt AES foo baz base.pdf -no-encrypt-metadata -o aes-128-m.pdf +cpdf -encrypt AES256 foo baz base.pdf -o aes-256.pdf +cpdf -encrypt AES256 foo baz base.pdf -no-encrypt-metadata -o aes-256-m.pdf diff --git a/samples/encryption/Makefile b/samples/encryption/Makefile new file mode 100644 index 0000000..c877f4e --- /dev/null +++ b/samples/encryption/Makefile @@ -0,0 +1,32 @@ +# GNUMakefile for test + +RM=rm -f +CMP=: +PYTHON=python2 +PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py + +XMLS= \ + rc4-40.xml \ + rc4-128.xml \ + aes-128.xml \ + aes-128-m.xml \ + aes-256.xml \ + aes-256-m.xml \ + +all: xmls + +test: + $(MAKE) all CMP=cmp + +clean: + -$(RM) $(XMLS) + +xmls: $(XMLS) + +.SUFFIXES: .pdf .xml + +.pdf.xml: + $(PDF2TXT) -p1 -V -t xml -P foo -o $@ $< + $(CMP) $@ base.xml + $(PDF2TXT) -p1 -V -t xml -P baz -o $@ $< + $(CMP) $@ base.xml diff --git a/samples/encryption/aes-128-m.pdf b/samples/encryption/aes-128-m.pdf new file mode 100644 index 0000000..30537a1 Binary files /dev/null and b/samples/encryption/aes-128-m.pdf differ diff --git a/samples/encryption/aes-128.pdf b/samples/encryption/aes-128.pdf new file mode 100644 index 0000000..2e3d2aa Binary files /dev/null and b/samples/encryption/aes-128.pdf differ diff --git a/samples/encryption/aes-256-m.pdf b/samples/encryption/aes-256-m.pdf new file mode 100644 index 0000000..81d0d8a Binary files /dev/null and b/samples/encryption/aes-256-m.pdf differ diff --git a/samples/encryption/aes-256.pdf b/samples/encryption/aes-256.pdf new file mode 100644 index 0000000..3a52dd4 Binary files /dev/null and b/samples/encryption/aes-256.pdf differ diff --git a/samples/encryption/base.pdf b/samples/encryption/base.pdf new file mode 100644 index 0000000..27a7cac Binary files /dev/null and b/samples/encryption/base.pdf differ diff --git a/samples/encryption/base.xml b/samples/encryption/base.xml new file mode 100644 index 0000000..4b6954e --- /dev/null +++ b/samples/encryption/base.xml @@ -0,0 +1,23 @@ + + + + + +S +e +c +r +e +t +! + + + + +
+
+ + + +
+
diff --git a/samples/encryption/rc4-128.pdf b/samples/encryption/rc4-128.pdf new file mode 100644 index 0000000..99beb75 Binary files /dev/null and b/samples/encryption/rc4-128.pdf differ diff --git a/samples/encryption/rc4-40.pdf b/samples/encryption/rc4-40.pdf new file mode 100644 index 0000000..2934a04 Binary files /dev/null and b/samples/encryption/rc4-40.pdf differ