diff --git a/CHANGELOG.md b/CHANGELOG.md index 35b2ee1..a0a17ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) +- Fix extraction of jbig2 files, which was producing invalid files ([#653](https://github.com/pdfminer/pdfminer.six/pull/653)) ## [20211012] diff --git a/pdfminer/image.py b/pdfminer/image.py index 1a25006..cfed324 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -112,6 +112,13 @@ class ImageWriter: i.save(fp, 'JPEG2000') elif is_jbig2: input_stream = BytesIO() + global_streams = self.jbig2_global(image) + if len(global_streams) > 1: + msg = 'There should never be more than one JBIG2Globals ' \ + 'associated with a JBIG2 embedded image' + raise ValueError(msg) + if len(global_streams) == 1: + input_stream.write(global_streams[0].get_data().rstrip(b'\n')) input_stream.write(image.stream.get_data()) input_stream.seek(0) reader = JBIG2StreamReader(input_stream) @@ -157,6 +164,15 @@ class ImageWriter: break return is_jbig2 + @staticmethod + def jbig2_global(image): + global_streams = [] + filters = image.stream.get_filters() + for filter_name, params in filters: + if filter_name in LITERALS_JBIG2_DECODE: + global_streams.append(params['JBIG2Globals'].resolve()) + return global_streams + @staticmethod def _get_image_extension( image: LTImage, diff --git a/pdfminer/jbig2.py b/pdfminer/jbig2.py index 10ee7e6..269b028 100644 --- a/pdfminer/jbig2.py +++ b/pdfminer/jbig2.py @@ -27,12 +27,11 @@ DATA_LEN_UNKNOWN = 0xffffffff # segment types SEG_TYPE_IMMEDIATE_GEN_REGION = 38 SEG_TYPE_END_OF_PAGE = 49 -SEG_TYPE_END_OF_FILE = 50 +SEG_TYPE_END_OF_FILE = 51 # file literals FILE_HEADER_ID = 
b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A' FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 -FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010 def bit_set(bit_pos: int, value: int) -> bool: @@ -243,8 +242,12 @@ class JBIG2StreamWriter: fix_last_page: bool = True ) -> int: header = FILE_HEADER_ID - header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN + header_flags = FILE_HEAD_FLAG_SEQUENTIAL header += pack(">B", header_flags) + # The embedded JBIG2 files in a PDF always + # only have one page + number_of_pages = pack(">L", 1) + header += number_of_pages self.stream.write(header) data_len = len(header) @@ -254,7 +257,11 @@ class JBIG2StreamWriter: for segment in segments: seg_num = cast(int, segment["number"]) - eof_segment = self.get_eof_segment(seg_num + 1) + if fix_last_page: + seg_num_offset = 2 + else: + seg_num_offset = 1 + eof_segment = self.get_eof_segment(seg_num + seg_num_offset) data = self.encode_segment(eof_segment) self.stream.write(data) @@ -305,7 +312,8 @@ class JBIG2StreamWriter: if ref_count <= 4: flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) for ref_index, ref_retain in enumerate(retain_segments): - flags_byte |= 1 << ref_index + if ref_retain: + flags_byte |= 1 << ref_index flags.append(flags_byte) else: bytes_count = math.ceil((ref_count + 1) / 8) diff --git a/samples/contrib/XIPLAYER0.jb2 b/samples/contrib/XIPLAYER0.jb2 new file mode 100644 index 0000000..e1d1aec Binary files /dev/null and b/samples/contrib/XIPLAYER0.jb2 differ diff --git a/tests/test_font_size.py b/tests/test_font_size.py index e6b0ec4..1c388a6 100644 --- a/tests/test_font_size.py +++ b/tests/test_font_size.py @@ -16,7 +16,4 @@ def test_font_size(): for char in line: if isinstance(char, LTChar): actual_size = int(round(char.size)) - print(char, actual_size, expected_size) assert expected_size == actual_size - else: - print(repr(line.get_text())) diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 9fe2e36..f73bc0e 100644 --- 
a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,6 +1,7 @@ import os from shutil import rmtree from tempfile import mkdtemp +import filecmp import tools.pdf2txt as pdf2txt from helpers import absolute_sample_path @@ -144,9 +145,21 @@ class TestDumpImages: Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46 """ - image_files = self.extract_images( - absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf')) - assert image_files[0].endswith('.jb2') + input_file = absolute_sample_path( + '../samples/contrib/pdf-with-jbig2.pdf') + output_dir = mkdtemp() + with TemporaryFilePath() as output_file_name: + commands = ['-o', output_file_name, '--output-dir', + output_dir, input_file] + pdf2txt.main(commands) + image_files = os.listdir(output_dir) + try: + assert image_files[0].endswith('.jb2') + assert filecmp.cmp(output_dir + '/' + image_files[0], + absolute_sample_path( + '../samples/contrib/XIPLAYER0.jb2')) + finally: + rmtree(output_dir) def test_contrib_matplotlib(self): """Test a pdf with Type3 font"""