Fixes jbig2 writer to write valid jb2 files

See: https://github.com/pdfminer/pdfminer.six/pull/653

Squashed commit of the following:

commit 8748c9fcddab0826cca243eee45c40d2b6611e80
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:40:50 2022 +0100

    Remove prints in test

commit bb977258a39fc7baa13bba1c3ea29726e17c0f6d
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:35:12 2022 +0100

    Cleanup exception handling for jbig2 global streams

commit cf0b47b01b7caad8acbd82097aadadb620606a8b
Merge: a5831d1 708dd20
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:29:15 2022 +0100

    Merge branch 'develop' into jbig2_fix

commit a5831d110a
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:59:17 2021 -0400

    flake8 tests

commit 18ffa29387
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:52:11 2021 -0400

    add description in changelog

commit 6c7ee43d6c
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:43:36 2021 -0400

    Fixes jbig2 writer to write valid jb2 files

    - closes #652
pull/659/head^2
Pieter Marsman 2022-01-23 21:41:08 +01:00
parent 708dd20465
commit aa5dec252f
6 changed files with 46 additions and 11 deletions

View File

@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653))
## [20211012] ## [20211012]

View File

@ -112,6 +112,13 @@ class ImageWriter:
i.save(fp, 'JPEG2000') i.save(fp, 'JPEG2000')
elif is_jbig2: elif is_jbig2:
input_stream = BytesIO() input_stream = BytesIO()
global_streams = self.jbig2_global(image)
if len(global_streams) > 1:
msg = 'There should never be more than one JBIG2Globals ' \
'associated with a JBIG2 embedded image'
raise ValueError(msg)
if len(global_streams) == 1:
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
input_stream.write(image.stream.get_data()) input_stream.write(image.stream.get_data())
input_stream.seek(0) input_stream.seek(0)
reader = JBIG2StreamReader(input_stream) reader = JBIG2StreamReader(input_stream)
@ -157,6 +164,15 @@ class ImageWriter:
break break
return is_jbig2 return is_jbig2
@staticmethod
def jbig2_global(image):
    """Collect the JBIG2Globals streams referenced by an image's filters.

    Walks every ``(filter_name, params)`` pair returned by
    ``image.stream.get_filters()`` and, for each filter whose name is in
    ``LITERALS_JBIG2_DECODE``, resolves the ``JBIG2Globals`` entry of its
    decode parameters.

    :param image: object exposing ``stream.get_filters()``.
    :return: list of resolved global streams; empty when no JBIG2 filter
        carries a ``JBIG2Globals`` entry.
    """
    global_streams = []
    for filter_name, params in image.stream.get_filters():
        if filter_name not in LITERALS_JBIG2_DECODE:
            continue
        # JBIG2Globals is an optional decode parameter: a JBIG2 filter may
        # have no such key, or no parameter dictionary at all. Treat both
        # as "no global segment" instead of failing the whole extraction.
        try:
            global_ref = params['JBIG2Globals']
        except (KeyError, TypeError):
            continue
        global_streams.append(global_ref.resolve())
    return global_streams
@staticmethod @staticmethod
def _get_image_extension( def _get_image_extension(
image: LTImage, image: LTImage,

View File

@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff
# segment types # segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38 SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49 SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50 SEG_TYPE_END_OF_FILE = 51
# file literals # file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A' FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
def bit_set(bit_pos: int, value: int) -> bool: def bit_set(bit_pos: int, value: int) -> bool:
@ -243,8 +242,12 @@ class JBIG2StreamWriter:
fix_last_page: bool = True fix_last_page: bool = True
) -> int: ) -> int:
header = FILE_HEADER_ID header = FILE_HEADER_ID
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN header_flags = FILE_HEAD_FLAG_SEQUENTIAL
header += pack(">B", header_flags) header += pack(">B", header_flags)
# A JBIG2 stream embedded in a PDF always
# contains exactly one page
number_of_pages = pack(">L", 1)
header += number_of_pages
self.stream.write(header) self.stream.write(header)
data_len = len(header) data_len = len(header)
@ -254,7 +257,11 @@ class JBIG2StreamWriter:
for segment in segments: for segment in segments:
seg_num = cast(int, segment["number"]) seg_num = cast(int, segment["number"])
eof_segment = self.get_eof_segment(seg_num + 1) if fix_last_page:
seg_num_offset = 2
else:
seg_num_offset = 1
eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
data = self.encode_segment(eof_segment) data = self.encode_segment(eof_segment)
self.stream.write(data) self.stream.write(data)
@ -305,7 +312,8 @@ class JBIG2StreamWriter:
if ref_count <= 4: if ref_count <= 4:
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
for ref_index, ref_retain in enumerate(retain_segments): for ref_index, ref_retain in enumerate(retain_segments):
flags_byte |= 1 << ref_index if ref_retain:
flags_byte |= 1 << ref_index
flags.append(flags_byte) flags.append(flags_byte)
else: else:
bytes_count = math.ceil((ref_count + 1) / 8) bytes_count = math.ceil((ref_count + 1) / 8)

Binary file not shown.

View File

@ -16,7 +16,4 @@ def test_font_size():
for char in line: for char in line:
if isinstance(char, LTChar): if isinstance(char, LTChar):
actual_size = int(round(char.size)) actual_size = int(round(char.size))
print(char, actual_size, expected_size)
assert expected_size == actual_size assert expected_size == actual_size
else:
print(repr(line.get_text()))

View File

@ -1,6 +1,7 @@
import os import os
from shutil import rmtree from shutil import rmtree
from tempfile import mkdtemp from tempfile import mkdtemp
import filecmp
import tools.pdf2txt as pdf2txt import tools.pdf2txt as pdf2txt
from helpers import absolute_sample_path from helpers import absolute_sample_path
@ -144,9 +145,21 @@ class TestDumpImages:
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46 Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
""" """
image_files = self.extract_images( input_file = absolute_sample_path(
absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf')) '../samples/contrib/pdf-with-jbig2.pdf')
assert image_files[0].endswith('.jb2') output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir',
output_dir, input_file]
pdf2txt.main(commands)
image_files = os.listdir(output_dir)
try:
assert image_files[0].endswith('.jb2')
assert filecmp.cmp(output_dir + '/' + image_files[0],
absolute_sample_path(
'../samples/contrib/XIPLAYER0.jb2'))
finally:
rmtree(output_dir)
def test_contrib_matplotlib(self): def test_contrib_matplotlib(self):
"""Test a pdf with Type3 font""" """Test a pdf with Type3 font"""