Change pycryptodome dependency to the faster, smaller, and industry standard cryptography package (#456)
* swap pycryptodome to the faster, smaller, and industry standard cryptography.io
* update changelog
* fix lint
* Update CHANGELOG.md
* from MR, unneeded ex and naming
* add samples to nosetests
* fix lint
* show mismatch
* fix lint
* typo and newline
* Revert "add samples to nosetests"
This reverts commit a49ca302
* Add tests for encrypted documents to nose test suite
* Optimize imports of pdfdocument.py
Co-authored-by: Oren Tysor <oren@atakama.com>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
60863cfd55
commit
c10cf3cdb8
|
@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
### Changed
|
||||
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
||||
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
|
||||
- Switched from pycryptodome to the cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
|
||||
|
||||
## [20200517]
|
||||
|
||||
|
|
1
Makefile
1
Makefile
|
@ -56,4 +56,3 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
|
|||
|
||||
test: cmap
|
||||
nosetests
|
||||
cd samples && $(MAKE) test
|
||||
|
|
|
@ -33,6 +33,3 @@ class Arcfour:
|
|||
return r
|
||||
|
||||
encrypt = decrypt = process
|
||||
|
||||
|
||||
new = Arcfour
|
||||
|
|
|
@ -1,20 +1,18 @@
|
|||
import hashlib as md5
|
||||
import logging
|
||||
import re
|
||||
import struct
|
||||
from hashlib import sha256, md5
|
||||
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||
|
||||
try:
|
||||
from Crypto.Cipher import ARC4, AES
|
||||
from Crypto.Hash import SHA256
|
||||
except ImportError:
|
||||
AES = SHA256 = None
|
||||
from . import arcfour as ARC4
|
||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||
from . import settings
|
||||
from .arcfour import Arcfour
|
||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
||||
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||
dict_value, stream_value
|
||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||
from .utils import choplist, nunpack, decode_text
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
@ -325,22 +323,22 @@ class PDFStandardSecurityHandler:
|
|||
def compute_u(self, key):
|
||||
if self.r == 2:
|
||||
# Algorithm 3.4
|
||||
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
|
||||
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
|
||||
else:
|
||||
# Algorithm 3.5
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash = md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(self.docid[0]) # 3
|
||||
result = ARC4.new(key).encrypt(hash.digest()) # 4
|
||||
result = Arcfour(key).encrypt(hash.digest()) # 4
|
||||
for i in range(1, 20): # 5
|
||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||
result = ARC4.new(k).encrypt(result)
|
||||
result = Arcfour(k).encrypt(result)
|
||||
result += result # 6
|
||||
return result
|
||||
|
||||
def compute_encryption_key(self, password):
|
||||
# Algorithm 3.2
|
||||
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash = md5(password) # 2
|
||||
hash.update(self.o) # 3
|
||||
# See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||
hash.update(struct.pack('<L', self.p)) # 4
|
||||
|
@ -353,7 +351,7 @@ class PDFStandardSecurityHandler:
|
|||
if self.r >= 3:
|
||||
n = self.length // 8
|
||||
for _ in range(50):
|
||||
result = md5.md5(result[:n]).digest()
|
||||
result = md5(result[:n]).digest()
|
||||
return result[:n]
|
||||
|
||||
def authenticate(self, password):
|
||||
|
@ -380,21 +378,21 @@ class PDFStandardSecurityHandler:
|
|||
def authenticate_owner_password(self, password):
|
||||
# Algorithm 3.7
|
||||
password = (password + self.PASSWORD_PADDING)[:32]
|
||||
hash = md5.md5(password)
|
||||
hash = md5(password)
|
||||
if self.r >= 3:
|
||||
for _ in range(50):
|
||||
hash = md5.md5(hash.digest())
|
||||
hash = md5(hash.digest())
|
||||
n = 5
|
||||
if self.r >= 3:
|
||||
n = self.length // 8
|
||||
key = hash.digest()[:n]
|
||||
if self.r == 2:
|
||||
user_password = ARC4.new(key).decrypt(self.o)
|
||||
user_password = Arcfour(key).decrypt(self.o)
|
||||
else:
|
||||
user_password = self.o
|
||||
for i in range(19, -1, -1):
|
||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||
user_password = ARC4.new(k).decrypt(user_password)
|
||||
user_password = Arcfour(k).decrypt(user_password)
|
||||
return self.authenticate_user_password(user_password)
|
||||
|
||||
def decrypt(self, objid, genno, data, attrs=None):
|
||||
|
@ -403,9 +401,9 @@ class PDFStandardSecurityHandler:
|
|||
def decrypt_rc4(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2]
|
||||
hash = md5.md5(key)
|
||||
hash = md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return ARC4.new(key).decrypt(data)
|
||||
return Arcfour(key).decrypt(data)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||
|
@ -459,9 +457,14 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
def decrypt_aes128(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
||||
hash = md5.md5(key)
|
||||
hash = md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
||||
initialization_vector = data[:16]
|
||||
ciphertext = data[16:]
|
||||
cipher = Cipher(algorithms.AES(key),
|
||||
modes.CBC(initialization_vector),
|
||||
backend=default_backend())
|
||||
return cipher.decryptor().update(ciphertext)
|
||||
|
||||
|
||||
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||
|
@ -489,27 +492,35 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
|
||||
def authenticate(self, password):
|
||||
password = password.encode('utf-8')[:127]
|
||||
hash = SHA256.new(password)
|
||||
hash = sha256(password)
|
||||
hash.update(self.o_validation_salt)
|
||||
hash.update(self.u)
|
||||
if hash.digest() == self.o_hash:
|
||||
hash = SHA256.new(password)
|
||||
hash = sha256(password)
|
||||
hash.update(self.o_key_salt)
|
||||
hash.update(self.u)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
||||
.decrypt(self.oe)
|
||||
hash = SHA256.new(password)
|
||||
cipher = Cipher(algorithms.AES(hash.digest()),
|
||||
modes.CBC(b'\0' * 16),
|
||||
backend=default_backend())
|
||||
return cipher.decryptor().update(self.oe)
|
||||
hash = sha256(password)
|
||||
hash.update(self.u_validation_salt)
|
||||
if hash.digest() == self.u_hash:
|
||||
hash = SHA256.new(password)
|
||||
hash = sha256(password)
|
||||
hash.update(self.u_key_salt)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
||||
.decrypt(self.ue)
|
||||
cipher = Cipher(algorithms.AES(hash.digest()),
|
||||
modes.CBC(b'\0' * 16),
|
||||
backend=default_backend())
|
||||
return cipher.decryptor().update(self.ue)
|
||||
return None
|
||||
|
||||
def decrypt_aes256(self, objid, genno, data):
|
||||
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
|
||||
.decrypt(data[16:])
|
||||
initialization_vector = data[:16]
|
||||
ciphertext = data[16:]
|
||||
cipher = Cipher(algorithms.AES(self.key),
|
||||
modes.CBC(initialization_vector),
|
||||
backend=default_backend())
|
||||
return cipher.decryptor().update(ciphertext)
|
||||
|
||||
|
||||
class PDFDocument:
|
||||
|
@ -528,11 +539,9 @@ class PDFDocument:
|
|||
security_handler_registry = {
|
||||
1: PDFStandardSecurityHandler,
|
||||
2: PDFStandardSecurityHandler,
|
||||
4: PDFStandardSecurityHandlerV4,
|
||||
5: PDFStandardSecurityHandlerV5,
|
||||
}
|
||||
if AES is not None:
|
||||
security_handler_registry[4] = PDFStandardSecurityHandlerV4
|
||||
if SHA256 is not None:
|
||||
security_handler_registry[5] = PDFStandardSecurityHandlerV5
|
||||
|
||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||
"Set the document to use a given PDFParser object."
|
||||
|
|
|
@ -1,80 +0,0 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
RM=rm -f
|
||||
CMP=:
|
||||
ECHO=echo
|
||||
PYTHON=python2
|
||||
PYTHON3=python3
|
||||
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V
|
||||
PDF2TXT3=PYTHONPATH=.. $(PYTHON3) ../tools/pdf2txt.py -p1 -V
|
||||
|
||||
FREE= \
|
||||
simple1 \
|
||||
simple2 \
|
||||
simple3 \
|
||||
jo
|
||||
|
||||
NONFREE= \
|
||||
nonfree/dmca \
|
||||
nonfree/f1040nr \
|
||||
nonfree/i1040nr \
|
||||
nonfree/kampo \
|
||||
nonfree/naacl06-shinyama \
|
||||
nonfree/nlp2004slides
|
||||
|
||||
TESTS=$(FREE) $(NONFREE)
|
||||
|
||||
CRYPT_PASS1=foo
|
||||
CRYPT_PASS2=baz
|
||||
CRYPT_BASE=encryption/base
|
||||
CRYPTS= \
|
||||
encryption/rc4-40 \
|
||||
encryption/rc4-128 \
|
||||
encryption/aes-128 \
|
||||
encryption/aes-128-m \
|
||||
encryption/aes-256 \
|
||||
encryption/aes-256-m
|
||||
|
||||
all: tests crypts
|
||||
|
||||
tests:
|
||||
for i in $(TESTS); do \
|
||||
$(ECHO) $$i; \
|
||||
$(PDF2TXT) -t html -o $$i.html $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.html $$i.html.ref || exit 1; \
|
||||
$(PDF2TXT) -t xml -o $$i.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
||||
$(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
||||
$(PDF2TXT3) -t html -o $$i.html $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.html $$i.html.ref || exit 1; \
|
||||
$(PDF2TXT3) -t xml -o $$i.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
||||
$(PDF2TXT3) -t text -o $$i.txt $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
||||
done
|
||||
|
||||
crypts:
|
||||
for i in $(CRYPTS); do \
|
||||
$(ECHO) $$i; \
|
||||
$(PDF2TXT) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
done
|
||||
|
||||
test:
|
||||
$(MAKE) all CMP=cmp
|
||||
|
||||
clean:
|
||||
-for i in $(TESTS); do \
|
||||
$(RM) $$i.html $$i.xml $$i.txt; \
|
||||
done
|
||||
-for i in $(CRYPTS); do \
|
||||
$(RM) $$i.1.xml $$i.2.xml; \
|
||||
done
|
|
@ -1,32 +0,0 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
RM=rm -f
|
||||
CMP=:
|
||||
PYTHON=python3
|
||||
PDF2TXT=PYTHONPATH=../.. $(PYTHON) ../../tools/pdf2txt.py
|
||||
|
||||
XMLS= \
|
||||
rc4-40.xml \
|
||||
rc4-128.xml \
|
||||
aes-128.xml \
|
||||
aes-128-m.xml \
|
||||
aes-256.xml \
|
||||
aes-256-m.xml \
|
||||
|
||||
all: xmls
|
||||
|
||||
test:
|
||||
$(MAKE) all CMP=cmp
|
||||
|
||||
clean:
|
||||
-$(RM) $(XMLS)
|
||||
|
||||
xmls: $(XMLS)
|
||||
|
||||
.SUFFIXES: .pdf .xml
|
||||
|
||||
.pdf.xml:
|
||||
$(PDF2TXT) -p1 -V -t xml -P foo -o $@ $<
|
||||
$(CMP) $@ base.xml
|
||||
$(PDF2TXT) -p1 -V -t xml -P baz -o $@ $<
|
||||
$(CMP) $@ base.xml
|
2
setup.py
2
setup.py
|
@ -14,7 +14,7 @@ setup(
|
|||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||
install_requires=[
|
||||
'chardet ; python_version > "3.0"',
|
||||
'pycryptodome',
|
||||
'cryptography',
|
||||
'sortedcontainers',
|
||||
],
|
||||
extras_require={
|
||||
|
|
|
@ -82,6 +82,27 @@ class TestPdf2Txt():
|
|||
"""
|
||||
run('contrib/issue-00369-excel.pdf', '-t html')
|
||||
|
||||
def test_encryption_aes128(self):
|
||||
run('encryption/aes-128.pdf', '-P foo')
|
||||
|
||||
def test_encryption_aes128m(self):
|
||||
run('encryption/aes-128-m.pdf', '-P foo')
|
||||
|
||||
def test_encryption_aes256(self):
|
||||
run('encryption/aes-256.pdf', '-P foo')
|
||||
|
||||
def test_encryption_aes256m(self):
|
||||
run('encryption/aes-256-m.pdf', '-P foo')
|
||||
|
||||
def test_encryption_base(self):
|
||||
run('encryption/base.pdf', '-P foo')
|
||||
|
||||
def test_encryption_rc4_40(self):
|
||||
run('encryption/rc4-40.pdf', '-P foo')
|
||||
|
||||
def test_encryption_rc4_128(self):
|
||||
run('encryption/rc4-128.pdf', '-P foo')
|
||||
|
||||
|
||||
class TestDumpImages:
|
||||
|
||||
|
|
Loading…
Reference in New Issue