Attempt to handle decompression error on some broken PDF files (#637)

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by https://github.com/mstamy2/PyPDF2/issues/422
and the test file has been taken from there, so credits to @zegrep.

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by mstamy2/PyPDF2#422
and the test file has been taken from there, so credits to @zegrep.

* Use a warning instead of raising an exception

where zlib error is detected before the CRC checksum.

* Add line to CHANGELOG.md

* Only try decompressing if not in strict mode

* Change error into warning because warnings.warn needs a subclass of Warning

Co-authored-by: Sylvain Thénault <sylvain.thenault@lowatt.fr>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/659/head
Sylvain Thénault 2021-12-11 18:25:19 +01:00 committed by GitHub
parent c883f5e13f
commit 10f6fb40c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 44 additions and 2 deletions

View File

@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
## [20211012]
### Added

View File

@ -40,7 +40,7 @@ class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
class PDFEncryptionWarning(UserWarning):
pass

View File

@ -1,8 +1,11 @@
import zlib
import warnings
import logging
import io
import sys
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
Tuple, cast)
from .lzw import lzwdecode
from .ascii85 import ascii85decode
from .ascii85 import asciihexdecode
@ -216,6 +219,29 @@ def stream_value(x: object) -> "PDFStream":
return x
def decompress_corrupted(data: bytes) -> bytes:
    """Salvage as much output as possible from a broken zlib stream.

    Called on data that cannot be decompressed in one go, typically because
    of a checksum error at the end of the stream. The input is fed to the
    decompressor one byte at a time so that everything decoded before the
    failing byte is kept.

    A ``PDFEncryptionWarning`` is emitted when the zlib error shows up before
    the last three bytes of ``data``, i.e. when the failure is not limited to
    the trailing check bytes and part of the payload is probably lost.

    :param data: the raw zlib-compressed bytes.
    :return: the bytes decompressed before the first zlib error, or the whole
        output when no error occurs.
    """
    decompressor = zlib.decompressobj()
    # Collect the pieces and join them once at the end: repeated ``bytes +=``
    # would copy the accumulated result for every single input byte.
    pieces = []
    i = 0
    try:
        # Byte by byte on purpose: one decompress() call over the whole input
        # would raise and throw away the output produced so far.
        for i in range(len(data)):
            pieces.append(decompressor.decompress(data[i:i + 1]))
    except zlib.error:
        # ``i`` is the index of the byte that triggered the error. An error
        # within the last 3 bytes (threshold found empirically) is treated as
        # a bad trailing checksum and ignored; anything earlier is reported.
        if i < len(data) - 3:
            # Imported here to prevent a circular import.
            from .pdfdocument import PDFEncryptionWarning
            warnings.warn("Data-loss while decompressing corrupted data",
                          PDFEncryptionWarning)
    return b"".join(pieces)
class PDFStream(PDFObject):
def __init__(
@ -303,12 +329,18 @@ class PDFStream(PDFObject):
# will get errors if the document is encrypted.
try:
data = zlib.decompress(data)
except zlib.error as e:
if settings.STRICT:
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
.format(e, data)
raise PDFException(error_msg)
data = b''
try:
data = decompress_corrupted(data)
except zlib.error:
data = b''
elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data)
elif f in LITERALS_ASCII85_DECODE:

Binary file not shown.

View File

@ -31,6 +31,7 @@ test_strings = {
"simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
"zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
"contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
@ -83,6 +84,12 @@ class TestExtractText(unittest.TestCase):
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])
def test_zlib_corrupted(self):
    """Text extracted from the corrupted-zlib sample starts as expected.

    Only the leading part of the output is compared against the reference
    string; whatever follows it is not checked.
    """
    pdf_name = "zen_of_python_corrupted.pdf"
    extracted = run_with_file(pdf_name)
    wanted_prefix = test_strings[pdf_name]
    prefix_length = len(wanted_prefix)
    self.assertEqual(extracted[:prefix_length], wanted_prefix)
def test_issue_566_cmap_bytes(self):
test_file = "contrib/issue_566_test_1.pdf"
s = run_with_file(test_file)