Fixes jbig2 writer to write valid jb2 files
See: https://github.com/pdfminer/pdfminer.six/pull/653 Squashed commit of the following: commit 8748c9fcddab0826cca243eee45c40d2b6611e80 Author: Pieter Marsman <pietermarsman@gmail.com> Date: Sun Jan 23 21:40:50 2022 +0100 Remove prints in test commit bb977258a39fc7baa13bba1c3ea29726e17c0f6d Author: Pieter Marsman <pietermarsman@gmail.com> Date: Sun Jan 23 21:35:12 2022 +0100 Cleanup exception handling for jbig2 global streams commit cf0b47b01b7caad8acbd82097aadadb620606a8b Merge: a5831d1 708dd20
Author: Pieter Marsman <pietermarsman@gmail.com> Date: Sun Jan 23 21:29:15 2022 +0100 Merge branch 'develop' into jbig2_fix commit a5831d110a
Author: Forest Gregg <fgregg@datamade.us> Date: Sun Aug 1 22:59:17 2021 -0400 flake8 tests commit 18ffa29387
Author: Forest Gregg <fgregg@datamade.us> Date: Sun Aug 1 22:52:11 2021 -0400 add description in changelog commit 6c7ee43d6c
Author: Forest Gregg <fgregg@datamade.us> Date: Sun Aug 1 22:43:36 2021 -0400 Fixes jbig2 writer to write valid jb2 files - closes #652
parent
708dd20465
commit
aa5dec252f
|
@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Fixed
|
### Fixed
|
||||||
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
||||||
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
|
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
|
||||||
|
- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653))
|
||||||
|
|
||||||
## [20211012]
|
## [20211012]
|
||||||
|
|
||||||
|
|
|
@ -112,6 +112,13 @@ class ImageWriter:
|
||||||
i.save(fp, 'JPEG2000')
|
i.save(fp, 'JPEG2000')
|
||||||
elif is_jbig2:
|
elif is_jbig2:
|
||||||
input_stream = BytesIO()
|
input_stream = BytesIO()
|
||||||
|
global_streams = self.jbig2_global(image)
|
||||||
|
if len(global_streams) > 1:
|
||||||
|
msg = 'There should never be more than one JBIG2Globals ' \
|
||||||
|
'associated with a JBIG2 embedded image'
|
||||||
|
raise ValueError(msg)
|
||||||
|
if len(global_streams) == 1:
|
||||||
|
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
|
||||||
input_stream.write(image.stream.get_data())
|
input_stream.write(image.stream.get_data())
|
||||||
input_stream.seek(0)
|
input_stream.seek(0)
|
||||||
reader = JBIG2StreamReader(input_stream)
|
reader = JBIG2StreamReader(input_stream)
|
||||||
|
@ -157,6 +164,15 @@ class ImageWriter:
|
||||||
break
|
break
|
||||||
return is_jbig2
|
return is_jbig2
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def jbig2_global(image):
|
||||||
|
global_streams = []
|
||||||
|
filters = image.stream.get_filters()
|
||||||
|
for filter_name, params in filters:
|
||||||
|
if filter_name in LITERALS_JBIG2_DECODE:
|
||||||
|
global_streams.append(params['JBIG2Globals'].resolve())
|
||||||
|
return global_streams
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_image_extension(
|
def _get_image_extension(
|
||||||
image: LTImage,
|
image: LTImage,
|
||||||
|
|
|
@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff
|
||||||
# segment types
|
# segment types
|
||||||
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
|
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
|
||||||
SEG_TYPE_END_OF_PAGE = 49
|
SEG_TYPE_END_OF_PAGE = 49
|
||||||
SEG_TYPE_END_OF_FILE = 50
|
SEG_TYPE_END_OF_FILE = 51
|
||||||
|
|
||||||
# file literals
|
# file literals
|
||||||
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
|
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
|
||||||
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
|
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
|
||||||
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
|
|
||||||
|
|
||||||
|
|
||||||
def bit_set(bit_pos: int, value: int) -> bool:
|
def bit_set(bit_pos: int, value: int) -> bool:
|
||||||
|
@ -243,8 +242,12 @@ class JBIG2StreamWriter:
|
||||||
fix_last_page: bool = True
|
fix_last_page: bool = True
|
||||||
) -> int:
|
) -> int:
|
||||||
header = FILE_HEADER_ID
|
header = FILE_HEADER_ID
|
||||||
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
|
header_flags = FILE_HEAD_FLAG_SEQUENTIAL
|
||||||
header += pack(">B", header_flags)
|
header += pack(">B", header_flags)
|
||||||
|
# The embedded JBIG2 files in a PDF always
|
||||||
|
# only have one page
|
||||||
|
number_of_pages = pack(">L", 1)
|
||||||
|
header += number_of_pages
|
||||||
self.stream.write(header)
|
self.stream.write(header)
|
||||||
data_len = len(header)
|
data_len = len(header)
|
||||||
|
|
||||||
|
@ -254,7 +257,11 @@ class JBIG2StreamWriter:
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
seg_num = cast(int, segment["number"])
|
seg_num = cast(int, segment["number"])
|
||||||
|
|
||||||
eof_segment = self.get_eof_segment(seg_num + 1)
|
if fix_last_page:
|
||||||
|
seg_num_offset = 2
|
||||||
|
else:
|
||||||
|
seg_num_offset = 1
|
||||||
|
eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
|
||||||
data = self.encode_segment(eof_segment)
|
data = self.encode_segment(eof_segment)
|
||||||
|
|
||||||
self.stream.write(data)
|
self.stream.write(data)
|
||||||
|
@ -305,7 +312,8 @@ class JBIG2StreamWriter:
|
||||||
if ref_count <= 4:
|
if ref_count <= 4:
|
||||||
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
|
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
|
||||||
for ref_index, ref_retain in enumerate(retain_segments):
|
for ref_index, ref_retain in enumerate(retain_segments):
|
||||||
flags_byte |= 1 << ref_index
|
if ref_retain:
|
||||||
|
flags_byte |= 1 << ref_index
|
||||||
flags.append(flags_byte)
|
flags.append(flags_byte)
|
||||||
else:
|
else:
|
||||||
bytes_count = math.ceil((ref_count + 1) / 8)
|
bytes_count = math.ceil((ref_count + 1) / 8)
|
||||||
|
|
Binary file not shown.
|
@ -16,7 +16,4 @@ def test_font_size():
|
||||||
for char in line:
|
for char in line:
|
||||||
if isinstance(char, LTChar):
|
if isinstance(char, LTChar):
|
||||||
actual_size = int(round(char.size))
|
actual_size = int(round(char.size))
|
||||||
print(char, actual_size, expected_size)
|
|
||||||
assert expected_size == actual_size
|
assert expected_size == actual_size
|
||||||
else:
|
|
||||||
print(repr(line.get_text()))
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
from shutil import rmtree
|
from shutil import rmtree
|
||||||
from tempfile import mkdtemp
|
from tempfile import mkdtemp
|
||||||
|
import filecmp
|
||||||
|
|
||||||
import tools.pdf2txt as pdf2txt
|
import tools.pdf2txt as pdf2txt
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
|
@ -144,9 +145,21 @@ class TestDumpImages:
|
||||||
|
|
||||||
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
|
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
|
||||||
"""
|
"""
|
||||||
image_files = self.extract_images(
|
input_file = absolute_sample_path(
|
||||||
absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
|
'../samples/contrib/pdf-with-jbig2.pdf')
|
||||||
assert image_files[0].endswith('.jb2')
|
output_dir = mkdtemp()
|
||||||
|
with TemporaryFilePath() as output_file_name:
|
||||||
|
commands = ['-o', output_file_name, '--output-dir',
|
||||||
|
output_dir, input_file]
|
||||||
|
pdf2txt.main(commands)
|
||||||
|
image_files = os.listdir(output_dir)
|
||||||
|
try:
|
||||||
|
assert image_files[0].endswith('.jb2')
|
||||||
|
assert filecmp.cmp(output_dir + '/' + image_files[0],
|
||||||
|
absolute_sample_path(
|
||||||
|
'../samples/contrib/XIPLAYER0.jb2'))
|
||||||
|
finally:
|
||||||
|
rmtree(output_dir)
|
||||||
|
|
||||||
def test_contrib_matplotlib(self):
|
def test_contrib_matplotlib(self):
|
||||||
"""Test a pdf with Type3 font"""
|
"""Test a pdf with Type3 font"""
|
||||||
|
|
Loading…
Reference in New Issue