Fixes jbig2 writer to write valid jb2 files

See: https://github.com/pdfminer/pdfminer.six/pull/653

Squashed commit of the following:

commit 8748c9fcddab0826cca243eee45c40d2b6611e80
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:40:50 2022 +0100

    Remove prints in test

commit bb977258a39fc7baa13bba1c3ea29726e17c0f6d
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:35:12 2022 +0100

    Cleanup exception handling for jbig2 global streams

commit cf0b47b01b7caad8acbd82097aadadb620606a8b
Merge: a5831d1 708dd20
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:29:15 2022 +0100

    Merge branch 'develop' into jbig2_fix

commit a5831d110a
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:59:17 2021 -0400

    flake8 tests

commit 18ffa29387
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:52:11 2021 -0400

    add description in changelog

commit 6c7ee43d6c
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:43:36 2021 -0400

    Fixes jbig2 writer to write valid jb2 files

    - closes #652
pull/659/head^2
Pieter Marsman 2022-01-23 21:41:08 +01:00
parent 708dd20465
commit aa5dec252f
6 changed files with 46 additions and 11 deletions

View File

@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653))
## [20211012]

View File

@ -112,6 +112,13 @@ class ImageWriter:
i.save(fp, 'JPEG2000')
elif is_jbig2:
input_stream = BytesIO()
global_streams = self.jbig2_global(image)
if len(global_streams) > 1:
msg = 'There should never be more than one JBIG2Globals ' \
'associated with a JBIG2 embedded image'
raise ValueError(msg)
if len(global_streams) == 1:
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
input_stream.write(image.stream.get_data())
input_stream.seek(0)
reader = JBIG2StreamReader(input_stream)
@ -157,6 +164,15 @@ class ImageWriter:
break
return is_jbig2
@staticmethod
def jbig2_global(image):
    """Collect the JBIG2Globals streams referenced by an image's filters.

    Walks the ``(filter_name, params)`` pairs returned by
    ``image.stream.get_filters()`` and, for every filter named in
    ``LITERALS_JBIG2_DECODE``, resolves its ``JBIG2Globals`` entry.

    The PDF specification lists ``JBIG2Globals`` as an optional decode
    parameter, so a JBIG2 filter that has no parameters, or whose
    parameters lack that key, contributes nothing to the result instead
    of raising ``KeyError``/``TypeError``.

    :param image: object exposing ``stream.get_filters()``.
    :return: list with one resolved ``JBIG2Globals`` object per JBIG2
        filter that references one; empty if none does.
    """
    global_streams = []
    for filter_name, params in image.stream.get_filters():
        if filter_name not in LITERALS_JBIG2_DECODE:
            continue
        # Skip filters without a globals segment rather than failing
        # on the missing key (or on empty/None parameters).
        if not params or 'JBIG2Globals' not in params:
            continue
        global_streams.append(params['JBIG2Globals'].resolve())
    return global_streams
@staticmethod
def _get_image_extension(
image: LTImage,

View File

@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff
# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50
SEG_TYPE_END_OF_FILE = 51
# file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
def bit_set(bit_pos: int, value: int) -> bool:
@ -243,8 +242,12 @@ class JBIG2StreamWriter:
fix_last_page: bool = True
) -> int:
header = FILE_HEADER_ID
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
header_flags = FILE_HEAD_FLAG_SEQUENTIAL
header += pack(">B", header_flags)
# The embedded JBIG2 files in a PDF always
# only have one page
number_of_pages = pack(">L", 1)
header += number_of_pages
self.stream.write(header)
data_len = len(header)
@ -254,7 +257,11 @@ class JBIG2StreamWriter:
for segment in segments:
seg_num = cast(int, segment["number"])
eof_segment = self.get_eof_segment(seg_num + 1)
if fix_last_page:
seg_num_offset = 2
else:
seg_num_offset = 1
eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
data = self.encode_segment(eof_segment)
self.stream.write(data)
@ -305,7 +312,8 @@ class JBIG2StreamWriter:
if ref_count <= 4:
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
for ref_index, ref_retain in enumerate(retain_segments):
flags_byte |= 1 << ref_index
if ref_retain:
flags_byte |= 1 << ref_index
flags.append(flags_byte)
else:
bytes_count = math.ceil((ref_count + 1) / 8)

Binary file not shown.

View File

@ -16,7 +16,4 @@ def test_font_size():
for char in line:
if isinstance(char, LTChar):
actual_size = int(round(char.size))
print(char, actual_size, expected_size)
assert expected_size == actual_size
else:
print(repr(line.get_text()))

View File

@ -1,6 +1,7 @@
import os
from shutil import rmtree
from tempfile import mkdtemp
import filecmp
import tools.pdf2txt as pdf2txt
from helpers import absolute_sample_path
@ -144,9 +145,21 @@ class TestDumpImages:
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
"""
image_files = self.extract_images(
absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
assert image_files[0].endswith('.jb2')
input_file = absolute_sample_path(
'../samples/contrib/pdf-with-jbig2.pdf')
output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir',
output_dir, input_file]
pdf2txt.main(commands)
image_files = os.listdir(output_dir)
try:
assert image_files[0].endswith('.jb2')
assert filecmp.cmp(output_dir + '/' + image_files[0],
absolute_sample_path(
'../samples/contrib/XIPLAYER0.jb2'))
finally:
rmtree(output_dir)
def test_contrib_matplotlib(self):
"""Test a pdf with Type3 font"""