Change pycryptodome dependency to the faster, smaller, and industry standard cryptography package (#456)
* swap pycryptodome for the faster, smaller, and industry-standard cryptography package (cryptography.io)
* update changelog
* fix lint
* Update CHANGELOG.md
* from MR, unneeded ex and naming
* add samples to nosetests
* fix lint
* show mismatch
* fix lint
* typo and newline
* Revert "add samples to nosetests"
This reverts commit a49ca302
* Add tests for encrypted documents to nose test suite
* Optimize imports of pdfdocument.py
Co-authored-by: Oren Tysor <oren@atakama.com>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
60863cfd55
commit
c10cf3cdb8
|
@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Changed
|
### Changed
|
||||||
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
||||||
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
|
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
|
||||||
|
- Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
|
||||||
|
|
||||||
## [20200517]
|
## [20200517]
|
||||||
|
|
||||||
|
|
1
Makefile
1
Makefile
|
@ -56,4 +56,3 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
|
||||||
|
|
||||||
test: cmap
|
test: cmap
|
||||||
nosetests
|
nosetests
|
||||||
cd samples && $(MAKE) test
|
|
||||||
|
|
|
@ -33,6 +33,3 @@ class Arcfour:
|
||||||
return r
|
return r
|
||||||
|
|
||||||
encrypt = decrypt = process
|
encrypt = decrypt = process
|
||||||
|
|
||||||
|
|
||||||
new = Arcfour
|
|
||||||
|
|
|
@ -1,20 +1,18 @@
|
||||||
import hashlib as md5
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
|
from hashlib import sha256, md5
|
||||||
|
|
||||||
|
from cryptography.hazmat.backends import default_backend
|
||||||
|
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||||
|
|
||||||
try:
|
|
||||||
from Crypto.Cipher import ARC4, AES
|
|
||||||
from Crypto.Hash import SHA256
|
|
||||||
except ImportError:
|
|
||||||
AES = SHA256 = None
|
|
||||||
from . import arcfour as ARC4
|
|
||||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
|
||||||
from . import settings
|
from . import settings
|
||||||
|
from .arcfour import Arcfour
|
||||||
|
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
||||||
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
||||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||||
dict_value, stream_value
|
dict_value, stream_value
|
||||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||||
from .utils import choplist, nunpack, decode_text
|
from .utils import choplist, nunpack, decode_text
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
@ -325,22 +323,22 @@ class PDFStandardSecurityHandler:
|
||||||
def compute_u(self, key):
|
def compute_u(self, key):
|
||||||
if self.r == 2:
|
if self.r == 2:
|
||||||
# Algorithm 3.4
|
# Algorithm 3.4
|
||||||
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
|
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
|
||||||
else:
|
else:
|
||||||
# Algorithm 3.5
|
# Algorithm 3.5
|
||||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
hash = md5(self.PASSWORD_PADDING) # 2
|
||||||
hash.update(self.docid[0]) # 3
|
hash.update(self.docid[0]) # 3
|
||||||
result = ARC4.new(key).encrypt(hash.digest()) # 4
|
result = Arcfour(key).encrypt(hash.digest()) # 4
|
||||||
for i in range(1, 20): # 5
|
for i in range(1, 20): # 5
|
||||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||||
result = ARC4.new(k).encrypt(result)
|
result = Arcfour(k).encrypt(result)
|
||||||
result += result # 6
|
result += result # 6
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def compute_encryption_key(self, password):
|
def compute_encryption_key(self, password):
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5.md5(password) # 2
|
hash = md5(password) # 2
|
||||||
hash.update(self.o) # 3
|
hash.update(self.o) # 3
|
||||||
# See https://github.com/pdfminer/pdfminer.six/issues/186
|
# See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||||
hash.update(struct.pack('<L', self.p)) # 4
|
hash.update(struct.pack('<L', self.p)) # 4
|
||||||
|
@ -353,7 +351,7 @@ class PDFStandardSecurityHandler:
|
||||||
if self.r >= 3:
|
if self.r >= 3:
|
||||||
n = self.length // 8
|
n = self.length // 8
|
||||||
for _ in range(50):
|
for _ in range(50):
|
||||||
result = md5.md5(result[:n]).digest()
|
result = md5(result[:n]).digest()
|
||||||
return result[:n]
|
return result[:n]
|
||||||
|
|
||||||
def authenticate(self, password):
|
def authenticate(self, password):
|
||||||
|
@ -380,21 +378,21 @@ class PDFStandardSecurityHandler:
|
||||||
def authenticate_owner_password(self, password):
|
def authenticate_owner_password(self, password):
|
||||||
# Algorithm 3.7
|
# Algorithm 3.7
|
||||||
password = (password + self.PASSWORD_PADDING)[:32]
|
password = (password + self.PASSWORD_PADDING)[:32]
|
||||||
hash = md5.md5(password)
|
hash = md5(password)
|
||||||
if self.r >= 3:
|
if self.r >= 3:
|
||||||
for _ in range(50):
|
for _ in range(50):
|
||||||
hash = md5.md5(hash.digest())
|
hash = md5(hash.digest())
|
||||||
n = 5
|
n = 5
|
||||||
if self.r >= 3:
|
if self.r >= 3:
|
||||||
n = self.length // 8
|
n = self.length // 8
|
||||||
key = hash.digest()[:n]
|
key = hash.digest()[:n]
|
||||||
if self.r == 2:
|
if self.r == 2:
|
||||||
user_password = ARC4.new(key).decrypt(self.o)
|
user_password = Arcfour(key).decrypt(self.o)
|
||||||
else:
|
else:
|
||||||
user_password = self.o
|
user_password = self.o
|
||||||
for i in range(19, -1, -1):
|
for i in range(19, -1, -1):
|
||||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||||
user_password = ARC4.new(k).decrypt(user_password)
|
user_password = Arcfour(k).decrypt(user_password)
|
||||||
return self.authenticate_user_password(user_password)
|
return self.authenticate_user_password(user_password)
|
||||||
|
|
||||||
def decrypt(self, objid, genno, data, attrs=None):
|
def decrypt(self, objid, genno, data, attrs=None):
|
||||||
|
@ -403,9 +401,9 @@ class PDFStandardSecurityHandler:
|
||||||
def decrypt_rc4(self, objid, genno, data):
|
def decrypt_rc4(self, objid, genno, data):
|
||||||
key = self.key + struct.pack('<L', objid)[:3] \
|
key = self.key + struct.pack('<L', objid)[:3] \
|
||||||
+ struct.pack('<L', genno)[:2]
|
+ struct.pack('<L', genno)[:2]
|
||||||
hash = md5.md5(key)
|
hash = md5(key)
|
||||||
key = hash.digest()[:min(len(key), 16)]
|
key = hash.digest()[:min(len(key), 16)]
|
||||||
return ARC4.new(key).decrypt(data)
|
return Arcfour(key).decrypt(data)
|
||||||
|
|
||||||
|
|
||||||
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
|
@ -459,9 +457,14 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
def decrypt_aes128(self, objid, genno, data):
|
def decrypt_aes128(self, objid, genno, data):
|
||||||
key = self.key + struct.pack('<L', objid)[:3] \
|
key = self.key + struct.pack('<L', objid)[:3] \
|
||||||
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
||||||
hash = md5.md5(key)
|
hash = md5(key)
|
||||||
key = hash.digest()[:min(len(key), 16)]
|
key = hash.digest()[:min(len(key), 16)]
|
||||||
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
initialization_vector = data[:16]
|
||||||
|
ciphertext = data[16:]
|
||||||
|
cipher = Cipher(algorithms.AES(key),
|
||||||
|
modes.CBC(initialization_vector),
|
||||||
|
backend=default_backend())
|
||||||
|
return cipher.decryptor().update(ciphertext)
|
||||||
|
|
||||||
|
|
||||||
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
|
@ -489,27 +492,35 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
|
|
||||||
def authenticate(self, password):
|
def authenticate(self, password):
|
||||||
password = password.encode('utf-8')[:127]
|
password = password.encode('utf-8')[:127]
|
||||||
hash = SHA256.new(password)
|
hash = sha256(password)
|
||||||
hash.update(self.o_validation_salt)
|
hash.update(self.o_validation_salt)
|
||||||
hash.update(self.u)
|
hash.update(self.u)
|
||||||
if hash.digest() == self.o_hash:
|
if hash.digest() == self.o_hash:
|
||||||
hash = SHA256.new(password)
|
hash = sha256(password)
|
||||||
hash.update(self.o_key_salt)
|
hash.update(self.o_key_salt)
|
||||||
hash.update(self.u)
|
hash.update(self.u)
|
||||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
cipher = Cipher(algorithms.AES(hash.digest()),
|
||||||
.decrypt(self.oe)
|
modes.CBC(b'\0' * 16),
|
||||||
hash = SHA256.new(password)
|
backend=default_backend())
|
||||||
|
return cipher.decryptor().update(self.oe)
|
||||||
|
hash = sha256(password)
|
||||||
hash.update(self.u_validation_salt)
|
hash.update(self.u_validation_salt)
|
||||||
if hash.digest() == self.u_hash:
|
if hash.digest() == self.u_hash:
|
||||||
hash = SHA256.new(password)
|
hash = sha256(password)
|
||||||
hash.update(self.u_key_salt)
|
hash.update(self.u_key_salt)
|
||||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
cipher = Cipher(algorithms.AES(hash.digest()),
|
||||||
.decrypt(self.ue)
|
modes.CBC(b'\0' * 16),
|
||||||
|
backend=default_backend())
|
||||||
|
return cipher.decryptor().update(self.ue)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def decrypt_aes256(self, objid, genno, data):
|
def decrypt_aes256(self, objid, genno, data):
|
||||||
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
|
initialization_vector = data[:16]
|
||||||
.decrypt(data[16:])
|
ciphertext = data[16:]
|
||||||
|
cipher = Cipher(algorithms.AES(self.key),
|
||||||
|
modes.CBC(initialization_vector),
|
||||||
|
backend=default_backend())
|
||||||
|
return cipher.decryptor().update(ciphertext)
|
||||||
|
|
||||||
|
|
||||||
class PDFDocument:
|
class PDFDocument:
|
||||||
|
@ -528,11 +539,9 @@ class PDFDocument:
|
||||||
security_handler_registry = {
|
security_handler_registry = {
|
||||||
1: PDFStandardSecurityHandler,
|
1: PDFStandardSecurityHandler,
|
||||||
2: PDFStandardSecurityHandler,
|
2: PDFStandardSecurityHandler,
|
||||||
|
4: PDFStandardSecurityHandlerV4,
|
||||||
|
5: PDFStandardSecurityHandlerV5,
|
||||||
}
|
}
|
||||||
if AES is not None:
|
|
||||||
security_handler_registry[4] = PDFStandardSecurityHandlerV4
|
|
||||||
if SHA256 is not None:
|
|
||||||
security_handler_registry[5] = PDFStandardSecurityHandlerV5
|
|
||||||
|
|
||||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||||
"Set the document to use a given PDFParser object."
|
"Set the document to use a given PDFParser object."
|
||||||
|
|
|
@ -1,80 +0,0 @@
|
||||||
# GNUMakefile for test
|
|
||||||
|
|
||||||
RM=rm -f
|
|
||||||
CMP=:
|
|
||||||
ECHO=echo
|
|
||||||
PYTHON=python2
|
|
||||||
PYTHON3=python3
|
|
||||||
|
|
||||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V
|
|
||||||
PDF2TXT3=PYTHONPATH=.. $(PYTHON3) ../tools/pdf2txt.py -p1 -V
|
|
||||||
|
|
||||||
FREE= \
|
|
||||||
simple1 \
|
|
||||||
simple2 \
|
|
||||||
simple3 \
|
|
||||||
jo
|
|
||||||
|
|
||||||
NONFREE= \
|
|
||||||
nonfree/dmca \
|
|
||||||
nonfree/f1040nr \
|
|
||||||
nonfree/i1040nr \
|
|
||||||
nonfree/kampo \
|
|
||||||
nonfree/naacl06-shinyama \
|
|
||||||
nonfree/nlp2004slides
|
|
||||||
|
|
||||||
TESTS=$(FREE) $(NONFREE)
|
|
||||||
|
|
||||||
CRYPT_PASS1=foo
|
|
||||||
CRYPT_PASS2=baz
|
|
||||||
CRYPT_BASE=encryption/base
|
|
||||||
CRYPTS= \
|
|
||||||
encryption/rc4-40 \
|
|
||||||
encryption/rc4-128 \
|
|
||||||
encryption/aes-128 \
|
|
||||||
encryption/aes-128-m \
|
|
||||||
encryption/aes-256 \
|
|
||||||
encryption/aes-256-m
|
|
||||||
|
|
||||||
all: tests crypts
|
|
||||||
|
|
||||||
tests:
|
|
||||||
for i in $(TESTS); do \
|
|
||||||
$(ECHO) $$i; \
|
|
||||||
$(PDF2TXT) -t html -o $$i.html $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.html $$i.html.ref || exit 1; \
|
|
||||||
$(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
|
||||||
$(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
|
||||||
$(PDF2TXT3) -t html -o $$i.html $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.html $$i.html.ref || exit 1; \
|
|
||||||
$(PDF2TXT3) -t xml -o $$i.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
|
||||||
$(PDF2TXT3) -t text -o $$i.txt $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
|
||||||
done
|
|
||||||
|
|
||||||
crypts:
|
|
||||||
for i in $(CRYPTS); do \
|
|
||||||
$(ECHO) $$i; \
|
|
||||||
$(PDF2TXT) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
|
||||||
$(PDF2TXT) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
|
||||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
|
||||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
|
||||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
|
||||||
done
|
|
||||||
|
|
||||||
test:
|
|
||||||
$(MAKE) all CMP=cmp
|
|
||||||
|
|
||||||
clean:
|
|
||||||
-for i in $(TESTS); do \
|
|
||||||
$(RM) $$i.html $$i.xml $$i.txt; \
|
|
||||||
done
|
|
||||||
-for i in $(CRYPTS); do \
|
|
||||||
$(RM) $$i.1.xml $$i.2.xml; \
|
|
||||||
done
|
|
|
@ -1,32 +0,0 @@
|
||||||
# GNUMakefile for test
|
|
||||||
|
|
||||||
RM=rm -f
|
|
||||||
CMP=:
|
|
||||||
PYTHON=python3
|
|
||||||
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
|
|
||||||
|
|
||||||
XMLS= \
|
|
||||||
rc4-40.xml \
|
|
||||||
rc4-128.xml \
|
|
||||||
aes-128.xml \
|
|
||||||
aes-128-m.xml \
|
|
||||||
aes-256.xml \
|
|
||||||
aes-256-m.xml \
|
|
||||||
|
|
||||||
all: xmls
|
|
||||||
|
|
||||||
test:
|
|
||||||
$(MAKE) all CMP=cmp
|
|
||||||
|
|
||||||
clean:
|
|
||||||
-$(RM) $(XMLS)
|
|
||||||
|
|
||||||
xmls: $(XMLS)
|
|
||||||
|
|
||||||
.SUFFIXES: .pdf .xml
|
|
||||||
|
|
||||||
.pdf.xml:
|
|
||||||
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
|
|
||||||
$(CMP) $@ base.xml
|
|
||||||
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
|
|
||||||
$(CMP) $@ base.xml
|
|
2
setup.py
2
setup.py
|
@ -14,7 +14,7 @@ setup(
|
||||||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'chardet ; python_version > "3.0"',
|
'chardet ; python_version > "3.0"',
|
||||||
'pycryptodome',
|
'cryptography',
|
||||||
'sortedcontainers',
|
'sortedcontainers',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
|
|
|
@ -82,6 +82,27 @@ class TestPdf2Txt():
|
||||||
"""
|
"""
|
||||||
run('contrib/issue-00369-excel.pdf', '-t html')
|
run('contrib/issue-00369-excel.pdf', '-t html')
|
||||||
|
|
||||||
|
def test_encryption_aes128(self):
|
||||||
|
run('encryption/aes-128.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_aes128m(self):
|
||||||
|
run('encryption/aes-128-m.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_aes256(self):
|
||||||
|
run('encryption/aes-256.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_aes256m(self):
|
||||||
|
run('encryption/aes-256-m.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_base(self):
|
||||||
|
run('encryption/base.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_rc4_40(self):
|
||||||
|
run('encryption/rc4-40.pdf', '-P foo')
|
||||||
|
|
||||||
|
def test_encryption_rc4_128(self):
|
||||||
|
run('encryption/rc4-128.pdf', '-P foo')
|
||||||
|
|
||||||
|
|
||||||
class TestDumpImages:
|
class TestDumpImages:
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue