diff --git a/CHANGELOG.md b/CHANGELOG.md index dd4b3d3..9082416 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] -Nothing yet +### Added +- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46)) ## [20191020] - 2019-10-20 @@ -27,7 +28,7 @@ Nothing yet - Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246)) ### Changed -- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) +- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) ## [20181108] - 2018-11-08 diff --git a/README.md b/README.md index e2e4cc8..fae5fb0 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,10 @@ Features * Written entirely in Python. * Parse, analyze, and convert PDF documents. - * PDF-1.7 specification support. (well, almost) + * PDF-1.7 specification support. (well, almost). * CJK languages and vertical writing scripts support. * Various font types (Type1, TrueType, Type3, and CID) support. + * Support for extracting images (JPG, JBIG2 and Bitmaps). * Basic encryption (RC4) support. * Outline (TOC) extraction. * Tagged contents extraction. 
diff --git a/pdfminer/image.py b/pdfminer/image.py index 39265fb..c69b700 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -1,12 +1,14 @@ -import struct import os import os.path +import struct from io import BytesIO -from .pdftypes import LITERALS_DCT_DECODE + +from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter +from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB -from .pdfcolor import LITERAL_DEVICE_CMYK +from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE def align32(x): @@ -57,9 +59,11 @@ class BMPWriter(object): return -## ImageWriter -## class ImageWriter(object): + """Write image to a file + + Supports various image types: JPEG, JBIG2 and bitmaps + """ def __init__(self, outdir): self.outdir = outdir @@ -68,21 +72,15 @@ class ImageWriter(object): return def export_image(self, image): - stream = image.stream - filters = stream.get_filters() (width, height) = image.srcsize - if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: - ext = '.jpg' - elif (image.bits == 1 or - image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): - ext = '.%dx%d.bmp' % (width, height) - else: - ext = '.%d.%dx%d.img' % (image.bits, width, height) - name = image.name+ext - path = os.path.join(self.outdir, name) - fp=open(path, 'wb') + + is_jbig2 = self.is_jbig2_image(image) + ext = self._get_image_extension(image, width, height, is_jbig2) + name, path = self._create_unique_image_name(self.outdir, image.name, ext) + + fp = open(path, 'wb') if ext == '.jpg': - raw_data = stream.get_rawdata() + raw_data = image.stream.get_rawdata() if LITERAL_DEVICE_CMYK in image.colorspace: from PIL import Image from PIL import ImageChops @@ -93,9 +91,18 @@ class ImageWriter(object): i.save(fp, 'JPEG') else: fp.write(raw_data) + elif is_jbig2: + input_stream = BytesIO() + input_stream.write(image.stream.get_data()) + input_stream.seek(0) + reader 
= JBIG2StreamReader(input_stream) + segments = reader.get_segments() + + writer = JBIG2StreamWriter(fp) + writer.write_file(segments) elif image.bits == 1: bmp = BMPWriter(fp, 1, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 width = (width+7)//8 for y in range(height): @@ -103,7 +110,7 @@ class ImageWriter(object): i += width elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: bmp = BMPWriter(fp, 24, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 width = width*3 for y in range(height): @@ -111,12 +118,47 @@ class ImageWriter(object): i += width elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: bmp = BMPWriter(fp, 8, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 for y in range(height): bmp.write_line(y, data[i:i+width]) i += width else: - fp.write(stream.get_data()) + fp.write(image.stream.get_data()) fp.close() return name + + @staticmethod + def is_jbig2_image(image): + filters = image.stream.get_filters() + is_jbig2 = False + for filter_name, params in filters: + if filter_name in LITERALS_JBIG2_DECODE: + is_jbig2 = True + break + return is_jbig2 + + @staticmethod + def _get_image_extension(image, width, height, is_jbig2): + filters = image.stream.get_filters() + if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: + ext = '.jpg' + elif is_jbig2: + ext = '.jb2' + elif (image.bits == 1 or + image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): + ext = '.%dx%d.bmp' % (width, height) + else: + ext = '.%d.%dx%d.img' % (image.bits, width, height) + return ext + + @staticmethod + def _create_unique_image_name(dirname, image_name, ext): + name = image_name + ext + path = os.path.join(dirname, name) + img_index = 0 + while os.path.exists(path): + name = '%s.%d%s' % (image_name, img_index, ext) + path = os.path.join(dirname, name) + img_index += 1 + return name, path diff --git 
"""Read and write JBIG2 segment streams.

``JBIG2StreamReader`` splits a byte stream into segment dictionaries and
``JBIG2StreamWriter`` serializes such dictionaries again, optionally wrapped
in a JBIG2 file header and terminated by end-of-page / end-of-file segments.

A segment dictionary uses the keys listed in ``SEG_STRUCT`` plus
``raw_data`` (the segment payload, absent when the reader saw a zero
``data_length``).
"""
import os
from struct import calcsize, pack, unpack

# Fixed-size fields of a segment header, in stream order.  Some fields are
# extended by the ``parse_<name>`` / ``encode_<name>`` hooks below.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# segment header literals
HEADER_FLAG_DEFERRED = 0b10000000
HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000

SEG_TYPE_MASK = 0b00111111

REF_COUNT_SHORT_MASK = 0b11100000
REF_COUNT_LONG_MASK = 0x1fffffff
REF_COUNT_LONG = 7

DATA_LEN_UNKNOWN = 0xffffffff

# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50

# file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010


def bit_set(bit_pos, value):
    """Return True if bit *bit_pos* (0 = least significant) of *value* is 1."""
    return bool((value >> bit_pos) & 1)


def check_flag(flag, value):
    """Return True if *value* has any bit of *flag* set."""
    return bool(flag & value)


def _lowest_set_bit(mask):
    """Return the position of the lowest set bit of a 32-bit *mask*.

    Raises ValueError (a subclass of the Exception raised historically)
    when no bit in positions 0..31 is set.
    """
    for bit_pos in range(32):
        if bit_set(bit_pos, mask):
            return bit_pos
    raise ValueError("Invalid mask or value")


def masked_value(mask, value):
    """Extract the bit field selected by *mask* from *value*.

    The field is shifted down so that its lowest bit becomes bit 0.
    """
    return (value & mask) >> _lowest_set_bit(mask)


def mask_value(mask, value):
    """Place *value* into the bit field selected by *mask*.

    This is the inverse of ``masked_value``; bits of *value* that do not
    fit into the field are discarded.
    """
    bit_pos = _lowest_set_bit(mask)
    return (value & (mask >> bit_pos)) << bit_pos


def _retain_byte_count(ref_count):
    """Return ceil((ref_count + 1) / 8) using integer arithmetic only.

    Long-form retention flags carry one bit for the segment itself plus one
    per referred-to segment.  Integer math keeps the result an ``int`` on
    every Python version.
    """
    return (ref_count + 1 + 7) // 8


def _ref_format(seg_num):
    """Return the struct code of one referred-to segment number.

    The width depends on the number of the referring segment: one byte up
    to 256, two bytes up to 65536, four bytes above that.
    """
    if seg_num <= 256:
        return "B"
    if seg_num <= 65536:
        return "H"
    return "L"


class JBIG2StreamReader(object):
    """Read segments from a JBIG2 byte stream"""

    def __init__(self, stream):
        # *stream* must support read() and relative seek().
        self.stream = stream

    def get_segments(self):
        """Parse the stream until EOF and return a list of segment dicts.

        A segment whose fixed header fields are cut short by the end of the
        stream is dropped.
        """
        segments = []
        while not self.is_eof():
            segment = {}
            for field_format, name in SEG_STRUCT:
                field_len = calcsize(field_format)
                field = self.stream.read(field_len)
                if len(field) < field_len:
                    segment["_error"] = True
                    break
                value = unpack(field_format, field)
                if len(value) == 1:
                    [value] = value
                # Optional per-field hook that may consume more bytes.
                parser = getattr(self, "parse_%s" % name, None)
                if callable(parser):
                    value = parser(segment, value, field)
                segment[name] = value

            if not segment.get("_error"):
                segments.append(segment)
        return segments

    def is_eof(self):
        """Return True at end of stream, without consuming any data."""
        if self.stream.read(1) == b'':
            return True
        self.stream.seek(-1, os.SEEK_CUR)
        return False

    def parse_flags(self, segment, flags, field):
        """Split the header flags byte into its three components."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags)
        }

    def parse_retention_flags(self, segment, flags, field):
        """Parse the referred-to segment count, retain bits and references.

        Returns a dict with ``ref_count``, ``retain_segments`` (list of
        bools, lowest bit first) and ``ref_segments`` (list of ints).
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        retain_segments = []
        ref_segments = []

        if ref_count < REF_COUNT_LONG:
            # Short form: the low five bits of the same byte are retain bits.
            for bit_pos in range(5):
                retain_segments.append(bit_set(bit_pos, flags))
        else:
            # Long form: the count is a 29-bit value in a 4-byte field,
            # followed by whole bytes of retain bits.
            field += self.stream.read(3)
            [ref_count] = unpack(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
            for _ in range(_retain_byte_count(ref_count)):
                [ret_byte] = unpack(">B", self.stream.read(1))
                # All eight bits of every retain byte are significant.
                for bit_pos in range(8):
                    retain_segments.append(bit_set(bit_pos, ret_byte))

        ref_format = ">" + _ref_format(segment["number"])
        ref_size = calcsize(ref_format)
        for _ in range(ref_count):
            [ref] = unpack(ref_format, self.stream.read(ref_size))
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment, page, field):
        """Return the page number, reading 3 more bytes for the long form."""
        if segment["flags"]["page_assoc_long"]:
            field += self.stream.read(3)
            [page] = unpack(">L", field)
        return page

    def parse_data_length(self, segment, length, field):
        """Read the payload into ``segment["raw_data"]`` and return *length*.

        No ``raw_data`` key is set when *length* is zero.

        Raises NotImplementedError for an immediate generic region segment
        of unknown length.
        """
        if length:
            if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \
                    and (length == DATA_LEN_UNKNOWN):

                raise NotImplementedError(
                    "Working with unknown segment length "
                    "is not implemented yet"
                )
            else:
                segment["raw_data"] = self.stream.read(length)

        return length


class JBIG2StreamWriter(object):
    """Write JBIG2 segments to a file in JBIG2 format"""

    def __init__(self, stream):
        # *stream* must support write().
        self.stream = stream

    def write_segments(self, segments, fix_last_page=True):
        """Write *segments* and return the number of bytes written.

        With *fix_last_page*, an end-of-page segment is appended when the
        last page seen was not closed by one.
        """
        data_len, _ = self._write_segments(segments, fix_last_page)
        return data_len

    def write_file(self, segments, fix_last_page=True):
        """Write a complete JBIG2 file and return the bytes written.

        The output is the file header, *segments* (see ``write_segments``)
        and a final end-of-file segment.
        """
        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
        header += pack(">B", header_flags)
        self.stream.write(header)
        data_len = len(header)

        # The helper reports the first unused segment number, so the
        # end-of-file segment never reuses the number of an appended
        # end-of-page segment.
        seg_len, next_seg_num = self._write_segments(segments, fix_last_page)
        data_len += seg_len

        data = self.encode_segment(self.get_eof_segment(next_seg_num))
        self.stream.write(data)
        data_len += len(data)

        return data_len

    def _write_segments(self, segments, fix_last_page):
        """Write *segments*; return (bytes written, next free segment number).

        *segments* is iterated exactly once.
        """
        data_len = 0
        current_page = None
        last_seg_num = 0

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            # Track the highest number so generated segments stay unique
            # even if the input is not strictly increasing.
            last_seg_num = max(last_seg_num, segment["number"])

            if fix_last_page:
                seg_page = segment.get("page_assoc")

                if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE:
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page:
            last_seg_num += 1
            eop_segment = self.get_eop_segment(last_seg_num, current_page)
            data = self.encode_segment(eop_segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len, last_seg_num + 1

    def encode_segment(self, segment):
        """Return the serialized header and payload of one segment."""
        parts = []
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            # Optional per-field hook, mirroring the reader's parse hooks.
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            parts.append(field)
        return b''.join(parts)

    @staticmethod
    def _page_assoc_is_long(flags, segment):
        """Return True if the page association needs the 4-byte form.

        That is the case when the flags request it explicitly or when the
        page number does not fit into a single byte.
        """
        page = segment.get("page_assoc") or 0
        return bool(flags.get("page_assoc_long")) or page > 255

    def encode_flags(self, value, segment):
        """Pack the segment header flags byte."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        if self._page_assoc_is_long(value, segment):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(self, value, segment):
        """Pack the referred-to count, retain bits and segment references."""
        ref_count = value["ref_count"]
        retain_segments = value.get("retain_segments", [])
        ref_segments = value.get("ref_segments", [])

        if ref_count <= 4:
            flags_format = ">B"
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            # Only five retain bits exist in the short form; set a bit only
            # for entries that are actually true.
            for bit_pos, retain in enumerate(retain_segments[:5]):
                if retain:
                    flags_byte |= 1 << bit_pos
            flags = [flags_byte]
        else:
            bytes_count = _retain_byte_count(ref_count)
            flags_format = ">L" + ("B" * bytes_count)
            # Top three bits mark the long form, low 29 bits hold the count.
            long_marker = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG)
            flags = [
                (long_marker << 24)
                | mask_value(REF_COUNT_LONG_MASK, ref_count)
            ]

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8]
                for bit_pos, retain in enumerate(ret_part):
                    if retain:
                        ret_byte |= 1 << bit_pos
                flags.append(ret_byte)

        flags_format += _ref_format(segment["number"]) * len(ref_segments)
        flags.extend(ref_segments)

        return pack(flags_format, *flags)

    def encode_page_assoc(self, value, segment):
        """Pack the page association in the width announced by the flags."""
        if self._page_assoc_is_long(segment.get("flags", {}), segment):
            return pack(">L", value)
        return pack(">B", value)

    def encode_data_length(self, value, segment):
        """Pack the data length followed by the payload, if any."""
        # Segments read with a zero data length carry no "raw_data" key.
        return pack(">L", value) + segment.get("raw_data", b'')

    def get_eop_segment(self, seg_number, page_number):
        """Return an empty end-of-page segment for *page_number*."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
            'number': seg_number,
            'page_assoc': page_number,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }

    def get_eof_segment(self, seg_number):
        """Return an empty end-of-file segment."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
            'number': seg_number,
            'page_assoc': 0,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }
self.extract_images(full_path('../samples/contrib/pdf-with-jbig2.pdf')) + assert image_files[0].endswith('.jb2') + + if __name__ == '__main__': nose.runmodule()