diff --git a/CHANGELOG.md b/CHANGELOG.md index a580bca..02dc419 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626)) +### Fixed +- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) + ## [20211012] ### Added diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 8858970..cac09f2 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -40,7 +40,7 @@ class PDFEncryptionError(PDFException): pass -class PDFPasswordIncorrect(PDFEncryptionError): +class PDFEncryptionWarning(UserWarning): pass diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 6190ea9..5e0ef60 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -1,8 +1,11 @@ import zlib +import warnings import logging +import io import sys from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List, Tuple, cast) + from .lzw import lzwdecode from .ascii85 import ascii85decode from .ascii85 import asciihexdecode @@ -216,6 +219,29 @@ def stream_value(x: object) -> "PDFStream": return x +def decompress_corrupted(data): + """Called on some data that can't be properly decoded because of CRC checksum + error. Attempt to decode it skipping the CRC. 
+ """ + d = zlib.decompressobj() + f = io.BytesIO(data) + result_str = b'' + buffer = f.read(1) + i = 0 + try: + while buffer: + result_str += d.decompress(buffer) + buffer = f.read(1) + i += 1 + except zlib.error: + # Warn about data loss only if the error occurs before the CRC checksum + if i < len(data) - 3: + # Import here to prevent circular import + from .pdfdocument import PDFEncryptionWarning + warnings.warn("Data-loss while decompressing corrupted data", PDFEncryptionWarning) + return result_str + + class PDFStream(PDFObject): def __init__( @@ -303,12 +329,18 @@ class PDFStream(PDFObject): # will get errors if the document is encrypted. try: data = zlib.decompress(data) + except zlib.error as e: if settings.STRICT: error_msg = 'Invalid zlib bytes: {!r}, {!r}'\ .format(e, data) raise PDFException(error_msg) - data = b'' + + try: + data = decompress_corrupted(data) + except zlib.error: + data = b'' + elif f in LITERALS_LZW_DECODE: data = lzwdecode(data) elif f in LITERALS_ASCII85_DECODE: diff --git a/samples/zen_of_python_corrupted.pdf b/samples/zen_of_python_corrupted.pdf new file mode 100644 index 0000000..cbe0156 Binary files /dev/null and b/samples/zen_of_python_corrupted.pdf differ diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 3c9f991..3fa41ea 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -31,6 +31,7 @@ test_strings = { "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n" "World\n\nWorld\n\n\f", "simple4.pdf": "Text1\nText2\nText3\n\n\f", + "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt", "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03", @@ -83,6 +84,12 @@ class TestExtractText(unittest.TestCase): s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file]) + def 
test_zlib_corrupted(self): + test_file = "zen_of_python_corrupted.pdf" + s = run_with_file(test_file) + expected = test_strings[test_file] + self.assertEqual(s[:len(expected)], expected) + def test_issue_566_cmap_bytes(self): test_file = "contrib/issue_566_test_1.pdf" s = run_with_file(test_file)