Attempt to handle decompression error on some broken PDF files (#637)
* Attempt to handle decompression error on some broken PDF files. From time to time we come across files where no text is detected, while readers like evince read the PDF nicely. After some digging, it turned out that this is because the PDF includes some badly compressed data. This may be fixed by decompressing byte by byte and ignoring the error on the last check bytes (arbitrarily found to be the last 3). This has been largely inspired by https://github.com/mstamy2/PyPDF2/issues/422 and the test file has been taken from there, so credits to @zegrep. * Attempt to handle decompression error on some broken PDF files. From time to time we come across files where no text is detected, while readers like evince read the PDF nicely. After some digging, it turned out that this is because the PDF includes some badly compressed data. This may be fixed by decompressing byte by byte and ignoring the error on the last check bytes (arbitrarily found to be the last 3). This has been largely inspired by mstamy2/PyPDF2#422 and the test file has been taken from there, so credits to @zegrep. * Use a warning instead of raising an exception when a zlib error is detected before the CRC checksum. * Add line to CHANGELOG.md * Only try decompressing if not in strict mode * Change error into warning because warnings.warn needs a subclass of Warning Co-authored-by: Sylvain Thénault <sylvain.thenault@lowatt.fr> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/659/head
parent
c883f5e13f
commit
10f6fb40c2
|
@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Added
|
### Added
|
||||||
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
||||||
|
|
||||||
## [20211012]
|
## [20211012]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
@ -40,7 +40,7 @@ class PDFEncryptionError(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError):
|
class PDFEncryptionWarning(UserWarning):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import zlib
|
import zlib
|
||||||
|
import warnings
|
||||||
import logging
|
import logging
|
||||||
|
import io
|
||||||
import sys
|
import sys
|
||||||
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
|
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
|
||||||
Tuple, cast)
|
Tuple, cast)
|
||||||
|
|
||||||
from .lzw import lzwdecode
|
from .lzw import lzwdecode
|
||||||
from .ascii85 import ascii85decode
|
from .ascii85 import ascii85decode
|
||||||
from .ascii85 import asciihexdecode
|
from .ascii85 import asciihexdecode
|
||||||
|
@ -216,6 +219,29 @@ def stream_value(x: object) -> "PDFStream":
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def decompress_corrupted(data: bytes) -> bytes:
    """Decompress zlib data on a best-effort basis, tolerating a bad trailer.

    Intended for data that a plain ``zlib.decompress`` call rejected. The
    input is fed to a streaming decompressor one byte at a time so that
    everything decoded before the first ``zlib.error`` is kept instead of
    being thrown away with the exception.

    An error raised within the last three bytes of ``data`` is treated as a
    checksum mismatch and ignored silently (the three-byte window is a
    heuristic). An error raised earlier means part of the payload could not
    be decoded; a ``PDFEncryptionWarning`` is emitted in that case and the
    partial result is still returned.

    :param data: the raw zlib-compressed bytes.
    :return: the bytes decoded before the first error, or the full payload
        when no error occurred. May be empty.
    """
    decompressor = zlib.decompressobj()
    # Collect the pieces and join once at the end; repeated ``bytes +=``
    # copies the whole accumulated result on every iteration.
    chunks = []
    position = 0
    try:
        # One byte per call on purpose: when the decompressor finally
        # raises, only the offending byte's output is lost.
        for position in range(len(data)):
            chunks.append(decompressor.decompress(data[position:position + 1]))
    except zlib.error:
        # ``position`` is the index of the byte that triggered the error.
        # Failures in the last three bytes are ignored; anything earlier
        # is real data loss, so warn (the error is never re-raised).
        if position < len(data) - 3:
            # Import here to prevent a circular import.
            from .pdfdocument import PDFEncryptionWarning
            warnings.warn(
                "Data-loss while decompressing corrupted data",
                PDFEncryptionWarning,
            )
    return b"".join(chunks)
|
||||||
|
|
||||||
|
|
||||||
class PDFStream(PDFObject):
|
class PDFStream(PDFObject):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -303,12 +329,18 @@ class PDFStream(PDFObject):
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
try:
|
try:
|
||||||
data = zlib.decompress(data)
|
data = zlib.decompress(data)
|
||||||
|
|
||||||
except zlib.error as e:
|
except zlib.error as e:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
|
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
|
||||||
.format(e, data)
|
.format(e, data)
|
||||||
raise PDFException(error_msg)
|
raise PDFException(error_msg)
|
||||||
data = b''
|
|
||||||
|
try:
|
||||||
|
data = decompress_corrupted(data)
|
||||||
|
except zlib.error:
|
||||||
|
data = b''
|
||||||
|
|
||||||
elif f in LITERALS_LZW_DECODE:
|
elif f in LITERALS_LZW_DECODE:
|
||||||
data = lzwdecode(data)
|
data = lzwdecode(data)
|
||||||
elif f in LITERALS_ASCII85_DECODE:
|
elif f in LITERALS_ASCII85_DECODE:
|
||||||
|
|
Binary file not shown.
|
@ -31,6 +31,7 @@ test_strings = {
|
||||||
"simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
|
"simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
|
||||||
"World\n\nWorld\n\n\f",
|
"World\n\nWorld\n\n\f",
|
||||||
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
||||||
|
"zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
|
||||||
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
||||||
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
||||||
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
|
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
|
||||||
|
@ -83,6 +84,12 @@ class TestExtractText(unittest.TestCase):
|
||||||
s = run_with_file(test_file)
|
s = run_with_file(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_zlib_corrupted(self):
    """Text extracted from the zlib-corrupted sample starts as expected.

    Only the leading part of the output is compared, against the prefix
    recorded in ``test_strings``.
    """
    pdf_name = "zen_of_python_corrupted.pdf"
    wanted_prefix = test_strings[pdf_name]
    extracted = run_with_file(pdf_name)
    self.assertEqual(extracted[:len(wanted_prefix)], wanted_prefix)
|
||||||
|
|
||||||
def test_issue_566_cmap_bytes(self):
|
def test_issue_566_cmap_bytes(self):
|
||||||
test_file = "contrib/issue_566_test_1.pdf"
|
test_file = "contrib/issue_566_test_1.pdf"
|
||||||
s = run_with_file(test_file)
|
s = run_with_file(test_file)
|
||||||
|
|
Loading…
Reference in New Issue