Added: extraction of JBIG2 encoded images (#311)
Also adds a test for a PDF with a JBIG2 image. Fixes #26. Closes #46.
pull/316/head
parent
3001fe3a82
commit
373c6e7b97
|
@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
Nothing yet
|
||||
### Added
|
||||
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
|
||||
|
||||
## [20191020] - 2019-10-20
|
||||
|
||||
|
@ -27,7 +28,7 @@ Nothing yet
|
|||
- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
|
||||
|
||||
### Changed
|
||||
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
|
||||
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
|
||||
|
||||
## [20181108] - 2018-11-08
|
||||
|
||||
|
|
|
@ -23,9 +23,10 @@ Features
|
|||
|
||||
* Written entirely in Python.
|
||||
* Parse, analyze, and convert PDF documents.
|
||||
* PDF-1.7 specification support. (well, almost)
|
||||
* PDF-1.7 specification support (well, almost).
|
||||
* CJK languages and vertical writing scripts support.
|
||||
* Various font types (Type1, TrueType, Type3, and CID) support.
|
||||
* Support for extracting images (JPG, JBIG2, and bitmaps).
|
||||
* Basic encryption (RC4) support.
|
||||
* Outline (TOC) extraction.
|
||||
* Tagged contents extraction.
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
|
||||
import struct
|
||||
import os
|
||||
import os.path
|
||||
import struct
|
||||
from io import BytesIO
|
||||
from .pdftypes import LITERALS_DCT_DECODE
|
||||
|
||||
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
|
||||
from .pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from .pdfcolor import LITERAL_DEVICE_GRAY
|
||||
from .pdfcolor import LITERAL_DEVICE_RGB
|
||||
from .pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
|
||||
|
||||
|
||||
def align32(x):
|
||||
|
@ -57,9 +59,11 @@ class BMPWriter(object):
|
|||
return
|
||||
|
||||
|
||||
## ImageWriter
|
||||
##
|
||||
class ImageWriter(object):
|
||||
"""Write image to a file
|
||||
|
||||
Supports various image types: JPEG, JBIG2 and bitmaps
|
||||
"""
|
||||
|
||||
def __init__(self, outdir):
|
||||
self.outdir = outdir
|
||||
|
@ -68,21 +72,15 @@ class ImageWriter(object):
|
|||
return
|
||||
|
||||
def export_image(self, image):
|
||||
stream = image.stream
|
||||
filters = stream.get_filters()
|
||||
(width, height) = image.srcsize
|
||||
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
||||
ext = '.jpg'
|
||||
elif (image.bits == 1 or
|
||||
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||
ext = '.%dx%d.bmp' % (width, height)
|
||||
else:
|
||||
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
||||
name = image.name+ext
|
||||
path = os.path.join(self.outdir, name)
|
||||
|
||||
is_jbig2 = self.is_jbig2_image(image)
|
||||
ext = self._get_image_extension(image, width, height, is_jbig2)
|
||||
name, path = self._create_unique_image_name(self.outdir, image.name, ext)
|
||||
|
||||
fp = open(path, 'wb')
|
||||
if ext == '.jpg':
|
||||
raw_data = stream.get_rawdata()
|
||||
raw_data = image.stream.get_rawdata()
|
||||
if LITERAL_DEVICE_CMYK in image.colorspace:
|
||||
from PIL import Image
|
||||
from PIL import ImageChops
|
||||
|
@ -93,9 +91,18 @@ class ImageWriter(object):
|
|||
i.save(fp, 'JPEG')
|
||||
else:
|
||||
fp.write(raw_data)
|
||||
elif is_jbig2:
|
||||
input_stream = BytesIO()
|
||||
input_stream.write(image.stream.get_data())
|
||||
input_stream.seek(0)
|
||||
reader = JBIG2StreamReader(input_stream)
|
||||
segments = reader.get_segments()
|
||||
|
||||
writer = JBIG2StreamWriter(fp)
|
||||
writer.write_file(segments)
|
||||
elif image.bits == 1:
|
||||
bmp = BMPWriter(fp, 1, width, height)
|
||||
data = stream.get_data()
|
||||
data = image.stream.get_data()
|
||||
i = 0
|
||||
width = (width+7)//8
|
||||
for y in range(height):
|
||||
|
@ -103,7 +110,7 @@ class ImageWriter(object):
|
|||
i += width
|
||||
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
||||
bmp = BMPWriter(fp, 24, width, height)
|
||||
data = stream.get_data()
|
||||
data = image.stream.get_data()
|
||||
i = 0
|
||||
width = width*3
|
||||
for y in range(height):
|
||||
|
@ -111,12 +118,47 @@ class ImageWriter(object):
|
|||
i += width
|
||||
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
||||
bmp = BMPWriter(fp, 8, width, height)
|
||||
data = stream.get_data()
|
||||
data = image.stream.get_data()
|
||||
i = 0
|
||||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
else:
|
||||
fp.write(stream.get_data())
|
||||
fp.write(image.stream.get_data())
|
||||
fp.close()
|
||||
return name
|
||||
|
||||
@staticmethod
def is_jbig2_image(image):
    """Tell whether any filter of the image stream is a JBIG2 decode filter.

    :param image: object whose ``stream.get_filters()`` yields
        ``(filter_name, params)`` pairs.
    :return: True if at least one filter name is in LITERALS_JBIG2_DECODE.
    """
    return any(
        filter_name in LITERALS_JBIG2_DECODE
        for filter_name, _params in image.stream.get_filters()
    )
|
||||
|
||||
@staticmethod
def _get_image_extension(image, width, height, is_jbig2):
    """Choose the file extension used when exporting an image.

    :param image: image whose stream filters, bit depth and colorspace
        are inspected.
    :param width: image width, embedded in bitmap/raw extensions.
    :param height: image height, embedded in bitmap/raw extensions.
    :param is_jbig2: whether the image stream is JBIG2 encoded.
    :return: the extension string, including the leading dot.
    """
    stream_filters = image.stream.get_filters()

    # A stream with a single DCT filter is written out as a JPEG.
    if len(stream_filters) == 1 and stream_filters[0][0] in LITERALS_DCT_DECODE:
        return '.jpg'

    if is_jbig2:
        return '.jb2'

    # 1-bit images and 8-bit RGB/gray images are written as bitmaps.
    if image.bits == 1:
        return '.%dx%d.bmp' % (width, height)
    if image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or
                            LITERAL_DEVICE_GRAY in image.colorspace):
        return '.%dx%d.bmp' % (width, height)

    # Anything else is dumped raw, with the bit depth in the name.
    return '.%d.%dx%d.img' % (image.bits, width, height)
|
||||
|
||||
@staticmethod
|
||||
def _create_unique_image_name(dirname, image_name, ext):
|
||||
name = image_name + ext
|
||||
path = os.path.join(dirname, name)
|
||||
img_index = 0
|
||||
while os.path.exists(path):
|
||||
name = '%s.%d%s' % (image_name, img_index, ext)
|
||||
path = os.path.join(dirname, name)
|
||||
img_index += 1
|
||||
return name, path
|
||||
|
|
|
@ -0,0 +1,321 @@
|
|||
import math
|
||||
import os
|
||||
from struct import pack, unpack, calcsize
|
||||
|
||||
# Fixed part of a segment header: (struct format, field name) pairs in the
# order the fields are read from / written to the stream.
SEG_STRUCT = [
    (">L", "number"),
    (">B", "flags"),
    (">B", "retention_flags"),
    (">B", "page_assoc"),
    (">L", "data_length"),
]

# Bits of the segment header flags byte.
HEADER_FLAG_DEFERRED = 0x80
HEADER_FLAG_PAGE_ASSOC_LONG = 0x40

# Low six bits of the flags byte hold the segment type.
SEG_TYPE_MASK = 0x3f

# Referred-to segment count: top three bits of the first retention byte in
# the short form, low 29 bits of a 32-bit word in the long form.  A short
# count equal to REF_COUNT_LONG selects the long form.
REF_COUNT_SHORT_MASK = 0xe0
REF_COUNT_LONG_MASK = 0x1fffffff
REF_COUNT_LONG = 7

# Data length value meaning "length not known in advance".
DATA_LEN_UNKNOWN = 0xffffffff

# Segment types referenced by the reader and writer.
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 50

# File header: magic ID string followed by one flags byte.
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEAD_FLAG_SEQUENTIAL = 0x01
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0x02
|
||||
|
||||
|
||||
def bit_set(bit_pos, value):
    """Return True if bit number ``bit_pos`` (0 = least significant) of
    ``value`` is 1, otherwise False."""
    probe = 1 << bit_pos
    return (value & probe) != 0
|
||||
|
||||
|
||||
def check_flag(flag, value):
    """Return True if ``flag`` and ``value`` share at least one set bit."""
    common_bits = flag & value
    return common_bits != 0
|
||||
|
||||
|
||||
def masked_value(mask, value):
    """Extract the field selected by ``mask`` from ``value``.

    The result is ``value & mask`` shifted right so that the lowest set bit
    of ``mask`` lands on bit 0.

    :param mask: bit mask with at least one of its low 32 bits set.
    :param value: integer to extract the field from.
    :return: the masked, right-aligned field value.
    :raises ValueError: if none of the low 32 bits of ``mask`` is set.
    """
    # All 32 bit positions are scanned; stopping at 31 would reject a mask
    # whose lowest set bit is bit 31.
    for bit_pos in range(32):
        if (mask >> bit_pos) & 1:
            return (value & mask) >> bit_pos

    raise ValueError("Invalid mask or value")
|
||||
|
||||
|
||||
def mask_value(mask, value):
    """Place ``value`` into the field selected by ``mask``.

    ``value`` is truncated to the width of the mask and shifted left so that
    its bit 0 lands on the lowest set bit of ``mask``.

    :param mask: bit mask with at least one of its low 32 bits set.
    :param value: integer to store in the field.
    :return: ``value`` positioned inside the mask; bits outside are zero.
    :raises ValueError: if none of the low 32 bits of ``mask`` is set.
    """
    # All 32 bit positions are scanned; stopping at 31 would reject a mask
    # whose lowest set bit is bit 31.
    for bit_pos in range(32):
        if (mask >> bit_pos) & 1:
            return (value & (mask >> bit_pos)) << bit_pos

    raise ValueError("Invalid mask or value")
|
||||
|
||||
|
||||
class JBIG2StreamReader(object):
    """Read segments from a JBIG2 byte stream.

    The stream must support ``read`` and relative ``seek``.  Each segment is
    returned as a dict keyed by the field names in SEG_STRUCT, plus
    ``raw_data`` when the segment carries data.
    """

    def __init__(self, stream):
        self.stream = stream

    def get_segments(self):
        """Parse segments until the end of the stream.

        Every SEG_STRUCT field is unpacked and, when a ``parse_<name>``
        method exists, post-processed by it.  A segment whose fixed-size
        field is cut short by the end of the stream is dropped.

        :return: list of segment dicts, in stream order.
        """
        segments = []
        while not self.is_eof():
            segment = {}
            for field_format, name in SEG_STRUCT:
                field_len = calcsize(field_format)
                field = self.stream.read(field_len)
                if len(field) < field_len:
                    segment["_error"] = True
                    break
                value = unpack(field_format, field)
                if len(value) == 1:
                    [value] = value
                parser = getattr(self, "parse_%s" % name, None)
                if callable(parser):
                    value = parser(segment, value, field)
                segment[name] = value

            if not segment.get("_error"):
                segments.append(segment)
        return segments

    def is_eof(self):
        """Return True if no more bytes can be read from the stream.

        Probes by reading one byte and seeking back when one was read.
        """
        if self.stream.read(1) == b'':
            return True
        else:
            self.stream.seek(-1, os.SEEK_CUR)
            return False

    def parse_flags(self, segment, flags, field):
        """Split the header flags byte into its three components."""
        return {
            "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
            "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
            "type": masked_value(SEG_TYPE_MASK, flags)
        }

    def parse_retention_flags(self, segment, flags, field):
        """Parse the referred-to segment count, retain bits and references.

        :param segment: segment dict built so far; ``number`` must be set.
        :param flags: first byte of the field, already unpacked.
        :param field: raw bytes of that first byte.
        :return: dict with ``ref_count``, ``retain_segments`` (list of
            bools) and ``ref_segments`` (list of segment numbers).
        """
        ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
        retain_segments = []
        ref_segments = []

        if ref_count < REF_COUNT_LONG:
            # Short form: the low five bits of the same byte are retain bits.
            for bit_pos in range(5):
                retain_segments.append(bit_set(bit_pos, flags))
        else:
            # Long form: the count is the low 29 bits of a 32-bit word,
            # followed by ceil((count + 1) / 8) bytes of retain bits.
            field += self.stream.read(3)
            [ref_count] = unpack(">L", field)
            ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
            # Integer ceiling; does not depend on true division.
            ret_bytes_count = (ref_count + 8) // 8
            for ret_byte_index in range(ret_bytes_count):
                [ret_byte] = unpack(">B", self.stream.read(1))
                # All eight bits of each byte carry a retain flag.
                for bit_pos in range(8):
                    retain_segments.append(bit_set(bit_pos, ret_byte))

        # Referred-to segment numbers take 1, 2 or 4 bytes depending on
        # this segment's own number ("H" is the 2-byte struct code; "I"
        # would consume 4 bytes).
        seg_num = segment["number"]
        if seg_num <= 256:
            ref_format = ">B"
        elif seg_num <= 65536:
            ref_format = ">H"
        else:
            ref_format = ">L"

        ref_size = calcsize(ref_format)

        for ref_index in range(ref_count):
            ref = self.stream.read(ref_size)
            [ref] = unpack(ref_format, ref)
            ref_segments.append(ref)

        return {
            "ref_count": ref_count,
            "retain_segments": retain_segments,
            "ref_segments": ref_segments,
        }

    def parse_page_assoc(self, segment, page, field):
        """Return the page association, reading three more bytes when the
        segment flags mark it as a 4-byte value."""
        if segment["flags"]["page_assoc_long"]:
            field += self.stream.read(3)
            [page] = unpack(">L", field)
        return page

    def parse_data_length(self, segment, length, field):
        """Read the segment data that follows the header.

        When ``length`` is non-zero the data is stored in
        ``segment["raw_data"]``; zero-length segments get no ``raw_data``.

        :return: ``length`` unchanged.
        :raises NotImplementedError: for an immediate generic region
            segment whose length is DATA_LEN_UNKNOWN.
        """
        if length:
            if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \
                    and (length == DATA_LEN_UNKNOWN):

                raise NotImplementedError(
                    "Working with unknown segment length "
                    "is not implemented yet"
                )
            else:
                segment["raw_data"] = self.stream.read(length)

        return length
|
||||
|
||||
|
||||
class JBIG2StreamWriter(object):
    """Write JBIG2 segments to a file in JBIG2 format.

    Segments are dicts keyed by the field names in SEG_STRUCT (plus
    ``raw_data``), the same shape JBIG2StreamReader produces.
    """

    def __init__(self, stream):
        self.stream = stream

    def write_segments(self, segments, fix_last_page=True):
        """Encode and write ``segments`` to the stream.

        :param segments: iterable of segment dicts.
        :param fix_last_page: when true, append an end-of-page segment if
            the last page seen was not closed by one.
        :return: number of bytes written.
        """
        data_len, _last_seg_num = self._write_segments(segments, fix_last_page)
        return data_len

    def _write_segments(self, segments, fix_last_page):
        """Write the segments; return (bytes written, last segment number).

        The last segment number includes a synthesized end-of-page segment
        and is None when nothing was written.
        """
        data_len = 0
        current_page = None
        seg_num = None

        for segment in segments:
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

            seg_num = segment["number"]

            if fix_last_page:
                seg_page = segment.get("page_assoc")

                if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE:
                    current_page = None
                elif seg_page:
                    current_page = seg_page

        if fix_last_page and current_page and (seg_num is not None):
            seg_num += 1
            segment = self.get_eop_segment(seg_num, current_page)
            data = self.encode_segment(segment)
            self.stream.write(data)
            data_len += len(data)

        return data_len, seg_num

    def write_file(self, segments, fix_last_page=True):
        """Write a JBIG2 file: header, segments and an end-of-file segment.

        :param segments: iterable of segment dicts; consumed only once.
        :param fix_last_page: passed on to the segment writer.
        :return: number of bytes written.
        """
        header = FILE_HEADER_ID
        header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
        header += pack(">B", header_flags)
        self.stream.write(header)
        data_len = len(header)

        seg_len, last_seg_num = self._write_segments(segments, fix_last_page)
        data_len += seg_len

        # Number the EOF segment after everything actually written, so it
        # cannot reuse the number of a synthesized end-of-page segment.
        if last_seg_num is None:
            last_seg_num = 0
        eof_segment = self.get_eof_segment(last_seg_num + 1)
        data = self.encode_segment(eof_segment)

        self.stream.write(data)
        data_len += len(data)

        return data_len

    def encode_segment(self, segment):
        """Encode one segment dict (header fields plus data) to bytes.

        A field is encoded by ``encode_<name>`` when such a method exists,
        otherwise packed with its SEG_STRUCT format.
        """
        parts = []
        for field_format, name in SEG_STRUCT:
            value = segment.get(name)
            encoder = getattr(self, "encode_%s" % name, None)
            if callable(encoder):
                field = encoder(value, segment)
            else:
                field = pack(field_format, value)
            parts.append(field)
        return b''.join(parts)

    @staticmethod
    def _uses_long_page_assoc(flags, segment):
        """Decide whether the page association is written as 4 bytes.

        An explicit ``page_assoc_long`` entry in ``flags`` wins; otherwise
        the long form is used when the page number exceeds one byte.
        """
        if "page_assoc_long" in flags:
            return bool(flags["page_assoc_long"])
        return (segment.get("page_assoc") or 0) > 255

    def encode_flags(self, value, segment):
        """Encode the header flags byte from the flags dict ``value``."""
        flags = 0
        if value.get("deferred"):
            flags |= HEADER_FLAG_DEFERRED

        # The page number is stored under "page_assoc"; no "page" key is
        # ever set on a segment.
        if self._uses_long_page_assoc(value, segment):
            flags |= HEADER_FLAG_PAGE_ASSOC_LONG

        flags |= mask_value(SEG_TYPE_MASK, value["type"])

        return pack(">B", flags)

    def encode_retention_flags(self, value, segment):
        """Encode the referred-to count, retain bits and segment references.

        :param value: dict with ``ref_count`` and optional
            ``retain_segments`` / ``ref_segments`` lists.
        :param segment: the segment being encoded; its ``number`` selects
            the size of each reference.
        """
        flags = []
        flags_format = ">B"
        ref_count = value["ref_count"]
        retain_segments = value.get("retain_segments", [])

        if ref_count <= 4:
            # Short form: count in the top three bits, retain bits in the
            # low five.  Only truthy entries set their bit.
            flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
            for ref_index, ref_retain in enumerate(retain_segments[:5]):
                if ref_retain:
                    flags_byte |= 1 << ref_index
            flags.append(flags_byte)
        else:
            # Long form: a 32-bit word with the long-form marker in its top
            # three bits and the count in the low 29, then the retain bytes.
            bytes_count = (ref_count + 8) // 8
            flags_format = ">L" + ("B" * bytes_count)
            flags_dword = mask_value(
                REF_COUNT_SHORT_MASK,
                REF_COUNT_LONG
            ) << 24
            flags_dword |= ref_count & REF_COUNT_LONG_MASK
            flags.append(flags_dword)

            for byte_index in range(bytes_count):
                ret_byte = 0
                ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8]
                for bit_pos, ret_seg in enumerate(ret_part):
                    if ret_seg:
                        ret_byte |= 1 << bit_pos

                flags.append(ret_byte)

        ref_segments = value.get("ref_segments", [])

        # References take 1, 2 or 4 bytes depending on this segment's
        # number ("H" is the 2-byte struct code; "I" would emit 4 bytes).
        seg_num = segment["number"]
        if seg_num <= 256:
            ref_format = "B"
        elif seg_num <= 65536:
            ref_format = "H"
        else:
            ref_format = "L"

        for ref in ref_segments:
            flags_format += ref_format
            flags.append(ref)

        return pack(flags_format, *flags)

    def encode_page_assoc(self, value, segment):
        """Encode the page association as 1 or 4 bytes.

        The width follows the same rule encode_flags uses for the long
        page-association bit, so header flag and field size always agree.
        """
        flags = segment.get("flags") or {}
        if self._uses_long_page_assoc(flags, segment):
            return pack(">L", value)
        return pack(">B", value)

    def encode_data_length(self, value, segment):
        """Encode the data length followed by the segment data.

        Segments without a ``raw_data`` entry are written with no data.
        """
        data = pack(">L", value)
        data += segment.get("raw_data", b'')
        return data

    def get_eop_segment(self, seg_number, page_number):
        """Build an empty end-of-page segment for ``page_number``."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
            'number': seg_number,
            'page_assoc': page_number,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }

    def get_eof_segment(self, seg_number):
        """Build an empty end-of-file segment."""
        return {
            'data_length': 0,
            'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
            'number': seg_number,
            'page_assoc': 0,
            'raw_data': b'',
            'retention_flags': {
                'ref_count': 0,
                'ref_segments': [],
                'retain_segments': []
            }
        }
|
|
@ -27,7 +27,7 @@ LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
|
|||
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
|
||||
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
|
||||
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||
|
||||
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
||||
|
||||
## PDF Objects
|
||||
##
|
||||
|
@ -275,6 +275,8 @@ class PDFStream(PDFObject):
|
|||
# This is probably a JPG stream - it does not need to be decoded twice.
|
||||
# Just return the stream to the user.
|
||||
pass
|
||||
elif f in LITERALS_JBIG2_DECODE:
|
||||
pass
|
||||
elif f == LITERAL_CRYPT:
|
||||
# not yet..
|
||||
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
||||
|
|
Binary file not shown.
|
@ -13,6 +13,7 @@ def full_path(relative_path_to_this_file):
|
|||
return abspath
|
||||
|
||||
|
||||
|
||||
def run(datapath, filename, options=None):
|
||||
i = full_path(datapath + filename + '.pdf')
|
||||
o = full_path(filename + '.txt')
|
||||
|
@ -89,5 +90,27 @@ class TestDumpImages(object):
|
|||
self.extract_images(full_path('../samples/nonfree/175.pdf'))
|
||||
|
||||
|
||||
|
||||
# NOTE(review): the hunk header above this class suggests a class named
# TestDumpImages may already exist earlier in the file; if so, this
# definition shadows it and its tests stop being collected - confirm.
class TestDumpImages(object):
    """Feature tests for exporting images with pdf2txt."""

    @staticmethod
    def extract_images(input_file):
        """Run pdf2txt on ``input_file`` with an image output directory.

        :param input_file: path of the PDF to process.
        :return: names of the files written to the output directory.
        """
        output_dir = mkdtemp()
        try:
            with NamedTemporaryFile() as output_file:
                commands = ['-o', output_file.name,
                            '--output-dir', output_dir, input_file]
                pdf2txt.main(commands)
            return os.listdir(output_dir)
        finally:
            # Remove the temporary directory even when extraction fails.
            rmtree(output_dir)

    def test_jbig2_image_export(self):
        """Extract images of a PDF containing JBIG2 images.

        Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
        """
        image_files = self.extract_images(
            full_path('../samples/contrib/pdf-with-jbig2.pdf'))
        # Fail with a readable message instead of an IndexError.
        assert image_files, 'no images were extracted'
        assert image_files[0].endswith('.jb2')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule()
|
||||
|
|
Loading…
Reference in New Issue