From c10cf3cdb8466c60723e974af4907592660ddc26 Mon Sep 17 00:00:00 2001 From: lithiumFlower Date: Mon, 20 Jul 2020 16:00:54 -0400 Subject: [PATCH] Change pycryptodome dependency to the faster, smaller, and industry standard cryptography package (#456) * swap pycryptodome to the faster, smaller, and industry standard cryptography io * update changelog * fixlint * Update CHANGELOG.md * from MR, unneeded ex and naming * add samples to nosetests * fix lint * show mismatch * fix lint * typo and newline * Revert "add samples to nosetests" This reverts commit a49ca302 * Add tests for encrypted documents to nose test suite * Optimize imports of pdfdocument.py Co-authored-by: Oren Tysor Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + Makefile | 1 - pdfminer/arcfour.py | 3 -- pdfminer/pdfdocument.py | 83 ++++++++++++++++++++----------------- samples/Makefile | 80 ----------------------------------- samples/encryption/Makefile | 32 -------------- setup.py | 2 +- tests/test_tools_pdf2txt.py | 21 ++++++++++ 8 files changed, 69 insertions(+), 154 deletions(-) delete mode 100644 samples/Makefile delete mode 100644 samples/encryption/Makefile diff --git a/CHANGELOG.md b/CHANGELOG.md index be5daf3..41ff8ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Changed - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431)) - Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350)) +- Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456)) ## [20200517] diff --git a/Makefile b/Makefile index 640624c..05f6c41 100644 --- a/Makefile +++ b/Makefile @@ -56,4 +56,3 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST) test: cmap nosetests - cd samples && $(MAKE) test diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index c771c0b..e40b080 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -33,6 +33,3 @@ class Arcfour: return r encrypt = decrypt = process - - -new = Arcfour diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 760073c..ba473be 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -1,20 +1,18 @@ -import hashlib as md5 import logging import re import struct +from hashlib import sha256, md5 + +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes -try: - from Crypto.Cipher import ARC4, AES - from Crypto.Hash import SHA256 -except ImportError: - AES = SHA256 = None - from . import arcfour as ARC4 -from .psparser import PSEOF, literal_name, LIT, KWD from . 
import settings +from .arcfour import Arcfour +from .pdfparser import PDFSyntaxError, PDFStreamParser from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \ PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \ dict_value, stream_value -from .pdfparser import PDFSyntaxError, PDFStreamParser +from .psparser import PSEOF, literal_name, LIT, KWD from .utils import choplist, nunpack, decode_text log = logging.getLogger(__name__) @@ -325,22 +323,22 @@ class PDFStandardSecurityHandler: def compute_u(self, key): if self.r == 2: # Algorithm 3.4 - return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2 + return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2 else: # Algorithm 3.5 - hash = md5.md5(self.PASSWORD_PADDING) # 2 + hash = md5(self.PASSWORD_PADDING) # 2 hash.update(self.docid[0]) # 3 - result = ARC4.new(key).encrypt(hash.digest()) # 4 + result = Arcfour(key).encrypt(hash.digest()) # 4 for i in range(1, 20): # 5 k = b''.join(bytes((c ^ i,)) for c in iter(key)) - result = ARC4.new(k).encrypt(result) + result = Arcfour(k).encrypt(result) result += result # 6 return result def compute_encryption_key(self, password): # Algorithm 3.2 password = (password + self.PASSWORD_PADDING)[:32] # 1 - hash = md5.md5(password) # 2 + hash = md5(password) # 2 hash.update(self.o) # 3 # See https://github.com/pdfminer/pdfminer.six/issues/186 hash.update(struct.pack('<L', self.p)) # 4 @@ -353,7 +351,7 @@ class PDFStandardSecurityHandler: if self.r >= 3: n = self.length // 8 for _ in range(50): - result = md5.md5(result[:n]).digest() + result = md5(result[:n]).digest() return result[:n] def authenticate(self, password): @@ -380,21 +378,21 @@ class PDFStandardSecurityHandler: def authenticate_owner_password(self, password): # Algorithm 3.7 password = (password + self.PASSWORD_PADDING)[:32] - hash = md5.md5(password) + hash = md5(password) if self.r >= 3: for _ in range(50): - hash = md5.md5(hash.digest()) + hash = md5(hash.digest()) n = 5 if self.r >= 3: n = self.length // 8 key = hash.digest()[:n] if self.r == 2: - user_password = 
ARC4.new(key).decrypt(self.o) + user_password = Arcfour(key).decrypt(self.o) else: user_password = self.o for i in range(19, -1, -1): k = b''.join(bytes((c ^ i,)) for c in iter(key)) - user_password = ARC4.new(k).decrypt(user_password) + user_password = Arcfour(k).decrypt(user_password) return self.authenticate_user_password(user_password) def decrypt(self, objid, genno, data, attrs=None): @@ -403,9 +401,9 @@ class PDFStandardSecurityHandler: def decrypt_rc4(self, objid, genno, data): key = self.key + struct.pack(' "3.0"', - 'pycryptodome', + 'cryptography', 'sortedcontainers', ], extras_require={ diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index dd1aecf..64ede32 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -82,6 +82,27 @@ class TestPdf2Txt(): """ run('contrib/issue-00369-excel.pdf', '-t html') + def test_encryption_aes128(self): + run('encryption/aes-128.pdf', '-P foo') + + def test_encryption_aes128m(self): + run('encryption/aes-128-m.pdf', '-P foo') + + def test_encryption_aes256(self): + run('encryption/aes-256.pdf', '-P foo') + + def test_encryption_aes256m(self): + run('encryption/aes-256-m.pdf', '-P foo') + + def test_encryption_base(self): + run('encryption/base.pdf', '-P foo') + + def test_encryption_rc4_40(self): + run('encryption/rc4-40.pdf', '-P foo') + + def test_encryption_rc4_128(self): + run('encryption/rc4-128.pdf', '-P foo') + class TestDumpImages: