Change pycryptodome dependency to the faster, smaller, and industry standard cryptography package (#456)

* swap pycryptodome to the faster, smaller, and industry standard cryptography io

* update changelog

* fix lint

* Update CHANGELOG.md

* from MR, unneeded ex and naming

* add samples to nosetests

* fix lint

* show mismatch

* fix lint

* typo and newline

* Revert "add samples to nosetests"

This reverts commit a49ca302

* Add tests for encrypted documents to nose test suite

* Optimize imports of pdfdocument.py

Co-authored-by: Oren Tysor <oren@atakama.com>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
lithiumFlower 2020-07-20 16:00:54 -04:00 committed by GitHub
parent 60863cfd55
commit c10cf3cdb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 69 additions and 154 deletions

View File

@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
- Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
## [20200517]

View File

@ -56,4 +56,3 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
test: cmap
nosetests
cd samples && $(MAKE) test

View File

@ -33,6 +33,3 @@ class Arcfour:
return r
encrypt = decrypt = process
new = Arcfour

View File

@ -1,20 +1,18 @@
import hashlib as md5
import logging
import re
import struct
from hashlib import sha256, md5
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
try:
from Crypto.Cipher import ARC4, AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
from . import arcfour as ARC4
from .psparser import PSEOF, literal_name, LIT, KWD
from . import settings
from .arcfour import Arcfour
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
dict_value, stream_value
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text
log = logging.getLogger(__name__)
@ -325,22 +323,22 @@ class PDFStandardSecurityHandler:
def compute_u(self, key):
if self.r == 2:
# Algorithm 3.4
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash = md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
result = Arcfour(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = b''.join(bytes((c ^ i,)) for c in iter(key))
result = ARC4.new(k).encrypt(result)
result = Arcfour(k).encrypt(result)
result += result # 6
return result
def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash = md5(password) # 2
hash.update(self.o) # 3
# See https://github.com/pdfminer/pdfminer.six/issues/186
hash.update(struct.pack('<L', self.p)) # 4
@ -353,7 +351,7 @@ class PDFStandardSecurityHandler:
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
result = md5(result[:n]).digest()
return result[:n]
def authenticate(self, password):
@ -380,21 +378,21 @@ class PDFStandardSecurityHandler:
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
hash = md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
hash = md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
key = hash.digest()[:n]
if self.r == 2:
user_password = ARC4.new(key).decrypt(self.o)
user_password = Arcfour(key).decrypt(self.o)
else:
user_password = self.o
for i in range(19, -1, -1):
k = b''.join(bytes((c ^ i,)) for c in iter(key))
user_password = ARC4.new(k).decrypt(user_password)
user_password = Arcfour(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
def decrypt(self, objid, genno, data, attrs=None):
@ -403,9 +401,9 @@ class PDFStandardSecurityHandler:
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2]
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)
return Arcfour(key).decrypt(data)
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
@ -459,9 +457,14 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2] + b'sAlT'
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
@ -489,27 +492,35 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.oe)
hash = SHA256.new(password)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.oe)
hash = sha256(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.ue)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.ue)
return None
def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
.decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(self.key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)
class PDFDocument:
@ -528,11 +539,9 @@ class PDFDocument:
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
4: PDFStandardSecurityHandlerV4,
5: PDFStandardSecurityHandlerV5,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5
def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."

View File

@ -1,80 +0,0 @@
# GNUMakefile for test
# Converts every sample PDF with tools/pdf2txt.py, once with $(PYTHON) and
# once with $(PYTHON3), and runs $(CMP) on each output and its reference file.
RM=rm -f
# CMP defaults to the shell no-op ":"; the `test` target re-invokes make
# with CMP=cmp.
CMP=:
ECHO=echo
PYTHON=python2
PYTHON3=python3
# Both converter commands set PYTHONPATH=.. and pass -p1 -V to pdf2txt.py.
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V
PDF2TXT3=PYTHONPATH=.. $(PYTHON3) ../tools/pdf2txt.py -p1 -V
# Sample names are listed without extension; the recipes append .pdf for the
# input and .html/.xml/.txt (plus .ref) for outputs and references.
FREE= \
simple1 \
simple2 \
simple3 \
jo
NONFREE= \
nonfree/dmca \
nonfree/f1040nr \
nonfree/i1040nr \
nonfree/kampo \
nonfree/naacl06-shinyama \
nonfree/nlp2004slides
TESTS=$(FREE) $(NONFREE)
# The two passwords handed to pdf2txt.py via -P in the `crypts` recipe.
CRYPT_PASS1=foo
CRYPT_PASS2=baz
# Every encrypted sample's XML output is compared against $(CRYPT_BASE).xml.
CRYPT_BASE=encryption/base
CRYPTS= \
encryption/rc4-40 \
encryption/rc4-128 \
encryption/aes-128 \
encryption/aes-128-m \
encryption/aes-256 \
encryption/aes-256-m
all: tests crypts
# For each entry of TESTS: produce html, xml and text output with both
# interpreters; any failing conversion or comparison aborts the loop via
# `exit 1`.
tests:
for i in $(TESTS); do \
$(ECHO) $$i; \
$(PDF2TXT) -t html -o $$i.html $$i.pdf || exit 1; \
$(CMP) $$i.html $$i.html.ref || exit 1; \
$(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
$(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
$(PDF2TXT3) -t html -o $$i.html $$i.pdf || exit 1; \
$(CMP) $$i.html $$i.html.ref || exit 1; \
$(PDF2TXT3) -t xml -o $$i.xml $$i.pdf || exit 1; \
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
$(PDF2TXT3) -t text -o $$i.txt $$i.pdf || exit 1; \
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
done
# For each entry of CRYPTS: produce xml output once per password and per
# interpreter (written to $$i.1.xml and $$i.2.xml) and compare it with
# $(CRYPT_BASE).xml.
crypts:
for i in $(CRYPTS); do \
$(ECHO) $$i; \
$(PDF2TXT) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT3) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT3) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
done
# Re-run the `all` target with CMP overridden to cmp.
test:
$(MAKE) all CMP=cmp
# Remove the generated outputs; the leading "-" tells make to ignore a
# failing command.
clean:
-for i in $(TESTS); do \
$(RM) $$i.html $$i.xml $$i.txt; \
done
-for i in $(CRYPTS); do \
$(RM) $$i.1.xml $$i.2.xml; \
done

View File

@ -1,32 +0,0 @@
# GNUMakefile for test
# Builds one XML file per encrypted sample PDF with tools/pdf2txt.py and
# runs $(CMP) on it and base.xml.
RM=rm -f
# CMP defaults to the shell no-op ":"; the `test` target re-invokes make
# with CMP=cmp.
CMP=:
PYTHON=python3
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
# NOTE(review): the last entry below ends with a continuation backslash, so
# whatever line follows it is folded into XMLS -- presumably a blank line
# followed it in the original file; confirm.
XMLS= \
rc4-40.xml \
rc4-128.xml \
aes-128.xml \
aes-128-m.xml \
aes-256.xml \
aes-256-m.xml \
all: xmls
test:
$(MAKE) all CMP=cmp
# The leading "-" tells make to ignore a failure of the rm command.
clean:
-$(RM) $(XMLS)
xmls: $(XMLS)
# Suffix rule building foo.xml from foo.pdf. The PDF is converted twice, with
# -P foo and then with -P baz, writing the same output file $@ each time;
# $(CMP) is run against base.xml after each conversion.
.SUFFIXES: .pdf .xml
.pdf.xml:
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
$(CMP) $@ base.xml
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
$(CMP) $@ base.xml

View File

@ -14,7 +14,7 @@ setup(
package_data={'pdfminer': ['cmap/*.pickle.gz']},
install_requires=[
'chardet ; python_version > "3.0"',
'pycryptodome',
'cryptography',
'sortedcontainers',
],
extras_require={

View File

@ -82,6 +82,27 @@ class TestPdf2Txt():
"""
run('contrib/issue-00369-excel.pdf', '-t html')
# Each test below calls run() on one sample under encryption/ with the
# argument string '-P foo'.
def test_encryption_aes128(self):
run('encryption/aes-128.pdf', '-P foo')
def test_encryption_aes128m(self):
run('encryption/aes-128-m.pdf', '-P foo')
def test_encryption_aes256(self):
run('encryption/aes-256.pdf', '-P foo')
def test_encryption_aes256m(self):
run('encryption/aes-256-m.pdf', '-P foo')
def test_encryption_base(self):
run('encryption/base.pdf', '-P foo')
def test_encryption_rc4_40(self):
run('encryption/rc4-40.pdf', '-P foo')
def test_encryption_rc4_128(self):
run('encryption/rc4-128.pdf', '-P foo')
class TestDumpImages: