Fixes jbig2 writer to write valid jb2 files

See: https://github.com/pdfminer/pdfminer.six/pull/653

Squashed commit of the following:

commit 8748c9fcddab0826cca243eee45c40d2b6611e80
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:40:50 2022 +0100

    Remove prints in test

commit bb977258a39fc7baa13bba1c3ea29726e17c0f6d
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:35:12 2022 +0100

    Cleanup exception handling for jbig2 global streams

commit cf0b47b01b7caad8acbd82097aadadb620606a8b
Merge: a5831d1 708dd20
Author: Pieter Marsman <pietermarsman@gmail.com>
Date:   Sun Jan 23 21:29:15 2022 +0100

    Merge branch 'develop' into jbig2_fix

commit a5831d110a
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:59:17 2021 -0400

    flake8 tests

commit 18ffa29387
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:52:11 2021 -0400

    add description in changelog

commit 6c7ee43d6c
Author: Forest Gregg <fgregg@datamade.us>
Date:   Sun Aug 1 22:43:36 2021 -0400

    Fixes jbig2 writer to write valid jb2 files - closes #652
2022-01-23 21:41:08 +01:00 · 2022-01-23 21:41:08 +01:00 · aa5dec252f
parent 708dd20465
commit aa5dec252f
6 changed files with 46 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Fixed
 - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
 - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
+- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653))

 ## [20211012]

--- a/pdfminer/image.py
+++ b/pdfminer/image.py
@@ -112,6 +112,13 @@ class ImageWriter:
            i.save(fp, 'JPEG2000')
        elif is_jbig2:
            input_stream = BytesIO()
+            global_streams = self.jbig2_global(image)
+            if len(global_streams) > 1:
+                msg = 'There should never be more than one JBIG2Globals ' \
+                      'associated with a JBIG2 embedded image'
+                raise ValueError(msg)
+            if len(global_streams) == 1:
+                input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
            input_stream.write(image.stream.get_data())
            input_stream.seek(0)
            reader = JBIG2StreamReader(input_stream)
@@ -157,6 +164,15 @@ class ImageWriter:
                break
        return is_jbig2

+    @staticmethod
+    def jbig2_global(image):
+        global_streams = []
+        filters = image.stream.get_filters()
+        for filter_name, params in filters:
+            if filter_name in LITERALS_JBIG2_DECODE:
+                global_streams.append(params['JBIG2Globals'].resolve())
+        return global_streams
+
    @staticmethod
    def _get_image_extension(
        image: LTImage,
--- a/pdfminer/jbig2.py
+++ b/pdfminer/jbig2.py
@@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff
 # segment types
 SEG_TYPE_IMMEDIATE_GEN_REGION = 38
 SEG_TYPE_END_OF_PAGE = 49
-SEG_TYPE_END_OF_FILE = 50
+SEG_TYPE_END_OF_FILE = 51

 # file literals
 FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
 FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
-FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010


 def bit_set(bit_pos: int, value: int) -> bool:
@@ -243,8 +242,12 @@ class JBIG2StreamWriter:
        fix_last_page: bool = True
    ) -> int:
        header = FILE_HEADER_ID
-        header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
+        header_flags = FILE_HEAD_FLAG_SEQUENTIAL
        header += pack(">B", header_flags)
+        # The embedded JBIG2 files in a PDF always
+        # only have one page
+        number_of_pages = pack(">L", 1)
+        header += number_of_pages
        self.stream.write(header)
        data_len = len(header)

@@ -254,7 +257,11 @@ class JBIG2StreamWriter:
        for segment in segments:
            seg_num = cast(int, segment["number"])

-        eof_segment = self.get_eof_segment(seg_num + 1)
+        if fix_last_page:
+            seg_num_offset = 2
+        else:
+            seg_num_offset = 1
+        eof_segment = self.get_eof_segment(seg_num + seg_num_offset)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
@@ -305,7 +312,8 @@ class JBIG2StreamWriter:
        if ref_count <= 4:
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            for ref_index, ref_retain in enumerate(retain_segments):
-                flags_byte |= 1 << ref_index
+                if ref_retain:
+                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            bytes_count = math.ceil((ref_count + 1) / 8)
--- a/samples/contrib/XIPLAYER0.jb2
+++ b/samples/contrib/XIPLAYER0.jb2
--- a/tests/test_font_size.py
+++ b/tests/test_font_size.py
@@ -16,7 +16,4 @@ def test_font_size():
                        for char in line:
                            if isinstance(char, LTChar):
                                actual_size = int(round(char.size))
-                                print(char, actual_size, expected_size)
                                assert expected_size == actual_size
-                    else:
-                        print(repr(line.get_text()))
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -1,6 +1,7 @@
 import os
 from shutil import rmtree
 from tempfile import mkdtemp
+import filecmp

 import tools.pdf2txt as pdf2txt
 from helpers import absolute_sample_path
@@ -144,9 +145,21 @@ class TestDumpImages:

        Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
        """
-        image_files = self.extract_images(
-            absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
-        assert image_files[0].endswith('.jb2')
+        input_file = absolute_sample_path(
+            '../samples/contrib/pdf-with-jbig2.pdf')
+        output_dir = mkdtemp()
+        with TemporaryFilePath() as output_file_name:
+            commands = ['-o', output_file_name, '--output-dir',
+                        output_dir, input_file]
+            pdf2txt.main(commands)
+        image_files = os.listdir(output_dir)
+        try:
+            assert image_files[0].endswith('.jb2')
+            assert filecmp.cmp(output_dir + '/' + image_files[0],
+                               absolute_sample_path(
+                                   '../samples/contrib/XIPLAYER0.jb2'))
+        finally:
+            rmtree(output_dir)

    def test_contrib_matplotlib(self):
        """Test a pdf with Type3 font"""