From 373c6e7b97f118d6eae0b53609a9509a7feda2f3 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 22 Oct 2019 17:37:06 +0200 Subject: [PATCH] Added: extraction of JBIG2 encoded images (#311) And added test for pdf with JBIG2 image. Fixes #26 Closes #46 --- CHANGELOG.md | 5 +- README.md | 3 +- pdfminer/image.py | 86 ++++++-- pdfminer/jbig2.py | 321 +++++++++++++++++++++++++++++ pdfminer/pdftypes.py | 4 +- samples/contrib/pdf-with-jbig2.pdf | Bin 0 -> 10213 bytes tests/test_tools_pdf2txt.py | 23 +++ 7 files changed, 416 insertions(+), 26 deletions(-) create mode 100644 pdfminer/jbig2.py create mode 100644 samples/contrib/pdf-with-jbig2.pdf diff --git a/CHANGELOG.md b/CHANGELOG.md index dd4b3d3..9082416 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] -Nothing yet +### Added +- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46)) ## [20191020] - 2019-10-20 @@ -27,7 +28,7 @@ Nothing yet - Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246)) ### Changed -- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) +- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) ## [20181108] - 2018-11-08 diff --git a/README.md b/README.md index e2e4cc8..fae5fb0 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,10 @@ Features * Written entirely in Python. * Parse, analyze, and convert PDF documents. - * PDF-1.7 specification support. (well, almost) + * PDF-1.7 specification support. (well, almost). * CJK languages and vertical writing scripts support. * Various font types (Type1, TrueType, Type3, and CID) support. + * Support for extracting images (JPG, JBIG2 and Bitmaps). * Basic encryption (RC4) support. * Outline (TOC) extraction. * Tagged contents extraction. diff --git a/pdfminer/image.py b/pdfminer/image.py index 39265fb..c69b700 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -1,12 +1,14 @@ -import struct import os import os.path +import struct from io import BytesIO -from .pdftypes import LITERALS_DCT_DECODE + +from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter +from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB -from .pdfcolor import LITERAL_DEVICE_CMYK +from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE def align32(x): @@ -57,9 +59,11 @@ class BMPWriter(object): return -## ImageWriter -## class ImageWriter(object): + """Write image to a file + + Supports various image types: JPEG, JBIG2 and bitmaps + """ def __init__(self, outdir): self.outdir = outdir @@ -68,21 +72,15 @@ class ImageWriter(object): return def export_image(self, image): - stream = image.stream - filters = stream.get_filters() (width, height) = image.srcsize - if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: - ext = '.jpg' - elif (image.bits == 1 or - image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): - ext = '.%dx%d.bmp' % (width, height) - else: - ext = '.%d.%dx%d.img' % (image.bits, width, height) - name = image.name+ext - path = os.path.join(self.outdir, name) - fp=open(path, 'wb') + + is_jbig2 = self.is_jbig2_image(image) + ext = self._get_image_extension(image, width, height, is_jbig2) + name, path = self._create_unique_image_name(self.outdir, image.name, ext) + + fp = open(path, 'wb') if ext == '.jpg': - raw_data = stream.get_rawdata() + raw_data = image.stream.get_rawdata() if LITERAL_DEVICE_CMYK in image.colorspace: from PIL import Image from PIL import ImageChops @@ -93,9 +91,18 @@ class ImageWriter(object): i.save(fp, 'JPEG') else: fp.write(raw_data) + elif is_jbig2: + input_stream = BytesIO() + input_stream.write(image.stream.get_data()) + input_stream.seek(0) + reader = JBIG2StreamReader(input_stream) + segments = reader.get_segments() + + writer = JBIG2StreamWriter(fp) + writer.write_file(segments) elif image.bits == 1: bmp = BMPWriter(fp, 1, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 width = (width+7)//8 for y in range(height): @@ -103,7 +110,7 @@ class ImageWriter(object): i += width elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: bmp = BMPWriter(fp, 24, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 width = width*3 for y in range(height): @@ -111,12 +118,47 @@ class ImageWriter(object): i += width elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: bmp = BMPWriter(fp, 8, width, height) - data = stream.get_data() + data = image.stream.get_data() i = 0 for y in range(height): bmp.write_line(y, data[i:i+width]) i += width else: - fp.write(stream.get_data()) + fp.write(image.stream.get_data()) fp.close() return name + + @staticmethod + def is_jbig2_image(image): + filters = image.stream.get_filters() + is_jbig2 = False + for filter_name, params in filters: + if filter_name in LITERALS_JBIG2_DECODE: + is_jbig2 = True + break + return is_jbig2 + + @staticmethod + def _get_image_extension(image, width, height, is_jbig2): + filters = image.stream.get_filters() + if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: + ext = '.jpg' + elif is_jbig2: + ext = '.jb2' + elif (image.bits == 1 or + image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): + ext = '.%dx%d.bmp' % (width, height) + else: + ext = '.%d.%dx%d.img' % (image.bits, width, height) + return ext + + @staticmethod + def _create_unique_image_name(dirname, image_name, ext): + name = image_name + ext + path = os.path.join(dirname, name) + img_index = 0 + while os.path.exists(path): + name = '%s.%d%s' % (image_name, img_index, ext) + path = os.path.join(dirname, name) + img_index += 1 + return name, path diff --git a/pdfminer/jbig2.py b/pdfminer/jbig2.py new file mode 100644 index 0000000..39393e5 --- /dev/null +++ b/pdfminer/jbig2.py @@ -0,0 +1,321 @@ +import math +import os +from struct import pack, unpack, calcsize + +# segment structure base +SEG_STRUCT = [ + (">L", "number"), + (">B", "flags"), + (">B", "retention_flags"), + (">B", "page_assoc"), + (">L", "data_length"), +] + +# segment header literals +HEADER_FLAG_DEFERRED = 0b10000000 +HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000 + +SEG_TYPE_MASK = 0b00111111 + +REF_COUNT_SHORT_MASK = 0b11100000 +REF_COUNT_LONG_MASK = 0x1fffffff +REF_COUNT_LONG = 7 + +DATA_LEN_UNKNOWN = 0xffffffff + +# segment types +SEG_TYPE_IMMEDIATE_GEN_REGION = 38 +SEG_TYPE_END_OF_PAGE = 49 +SEG_TYPE_END_OF_FILE = 50 + +# file literals +FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A' +FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 +FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010 + + +def bit_set(bit_pos, value): + return bool((value >> bit_pos) & 1) + + +def check_flag(flag, value): + return bool(flag & value) + + +def masked_value(mask, value): + for bit_pos in range(0, 31): + if bit_set(bit_pos, mask): + return (value & mask) >> bit_pos + + raise Exception("Invalid mask or value") + + +def mask_value(mask, value): + for bit_pos in range(0, 31): + if bit_set(bit_pos, mask): + return (value & (mask >> bit_pos)) << bit_pos + + raise Exception("Invalid mask or value") + + +class JBIG2StreamReader(object): + """Read segments from a JBIG2 byte stream""" + + def __init__(self, stream): + self.stream = stream + + def get_segments(self): + segments = [] + while not self.is_eof(): + segment = {} + for field_format, name in SEG_STRUCT: + field_len = calcsize(field_format) + field = self.stream.read(field_len) + if len(field) < field_len: + segment["_error"] = True + break + value = unpack(field_format, field) + if len(value) == 1: + [value] = value + parser = getattr(self, "parse_%s" % name, None) + if callable(parser): + value = parser(segment, value, field) + segment[name] = value + + if not segment.get("_error"): + segments.append(segment) + return segments + + def is_eof(self): + if self.stream.read(1) == b'': + return True + else: + self.stream.seek(-1, os.SEEK_CUR) + return False + + def parse_flags(self, segment, flags, field): + return { + "deferred": check_flag(HEADER_FLAG_DEFERRED, flags), + "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags), + "type": masked_value(SEG_TYPE_MASK, flags) + } + + def parse_retention_flags(self, segment, flags, field): + ref_count = masked_value(REF_COUNT_SHORT_MASK, flags) + retain_segments = [] + ref_segments = [] + + if ref_count < REF_COUNT_LONG: + for bit_pos in range(5): + retain_segments.append(bit_set(bit_pos, flags)) + else: + field += self.stream.read(3) + [ref_count] = unpack(">L", field) + ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count) + ret_bytes_count = int(math.ceil((ref_count + 1) / 8)) + for ret_byte_index in range(ret_bytes_count): + [ret_byte] = unpack(">B", self.stream.read(1)) + for bit_pos in range(7): + retain_segments.append(bit_set(bit_pos, ret_byte)) + + seg_num = segment["number"] + if seg_num <= 256: + ref_format = ">B" + elif seg_num <= 65536: + ref_format = ">I" + else: + ref_format = ">L" + + ref_size = calcsize(ref_format) + + for ref_index in range(ref_count): + ref = self.stream.read(ref_size) + [ref] = unpack(ref_format, ref) + ref_segments.append(ref) + + return { + "ref_count": ref_count, + "retain_segments": retain_segments, + "ref_segments": ref_segments, + } + + def parse_page_assoc(self, segment, page, field): + if segment["flags"]["page_assoc_long"]: + field += self.stream.read(3) + [page] = unpack(">L", field) + return page + + def parse_data_length(self, segment, length, field): + if length: + if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \ + and (length == DATA_LEN_UNKNOWN): + + raise NotImplementedError( + "Working with unknown segment length " + "is not implemented yet" + ) + else: + segment["raw_data"] = self.stream.read(length) + + return length + + +class JBIG2StreamWriter(object): + """Write JBIG2 segments to a file in JBIG2 format""" + + def __init__(self, stream): + self.stream = stream + + def write_segments(self, segments, fix_last_page=True): + data_len = 0 + current_page = None + seg_num = None + + for segment in segments: + data = self.encode_segment(segment) + self.stream.write(data) + data_len += len(data) + + seg_num = segment["number"] + + if fix_last_page: + seg_page = segment.get("page_assoc") + + if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE: + current_page = None + elif seg_page: + current_page = seg_page + + if fix_last_page and current_page and (seg_num is not None): + segment = self.get_eop_segment(seg_num + 1, current_page) + data = self.encode_segment(segment) + self.stream.write(data) + data_len += len(data) + + return data_len + + def write_file(self, segments, fix_last_page=True): + header = FILE_HEADER_ID + header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN + header += pack(">B", header_flags) + self.stream.write(header) + data_len = len(header) + + data_len += self.write_segments(segments, fix_last_page) + + seg_num = 0 + for segment in segments: + seg_num = segment["number"] + + eof_segment = self.get_eof_segment(seg_num + 1) + data = self.encode_segment(eof_segment) + + self.stream.write(data) + data_len += len(data) + + return data_len + + def encode_segment(self, segment): + data = b'' + for field_format, name in SEG_STRUCT: + value = segment.get(name) + encoder = getattr(self, "encode_%s" % name, None) + if callable(encoder): + field = encoder(value, segment) + else: + field = pack(field_format, value) + data += field + return data + + def encode_flags(self, value, segment): + flags = 0 + if value.get("deferred"): + flags |= HEADER_FLAG_DEFERRED + + if "page_assoc_long" in value: + flags |= HEADER_FLAG_PAGE_ASSOC_LONG \ + if value["page_assoc_long"] else flags + else: + flags |= HEADER_FLAG_PAGE_ASSOC_LONG \ + if segment.get("page", 0) > 255 else flags + + flags |= mask_value(SEG_TYPE_MASK, value["type"]) + + return pack(">B", flags) + + def encode_retention_flags(self, value, segment): + flags = [] + flags_format = ">B" + ref_count = value["ref_count"] + retain_segments = value.get("retain_segments", []) + + if ref_count <= 4: + flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count) + for ref_index, ref_retain in enumerate(retain_segments): + flags_byte |= 1 << ref_index + flags.append(flags_byte) + else: + bytes_count = math.ceil((ref_count + 1) / 8) + flags_format = ">L" + ("B" * bytes_count) + flags_dword = mask_value( + REF_COUNT_SHORT_MASK, + REF_COUNT_LONG + ) << 24 + flags.append(flags_dword) + + for byte_index in range(bytes_count): + ret_byte = 0 + ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8] + for bit_pos, ret_seg in enumerate(ret_part): + ret_byte |= 1 << bit_pos if ret_seg else ret_byte + + flags.append(ret_byte) + + ref_segments = value.get("ref_segments", []) + + seg_num = segment["number"] + if seg_num <= 256: + ref_format = "B" + elif seg_num <= 65536: + ref_format = "I" + else: + ref_format = "L" + + for ref in ref_segments: + flags_format += ref_format + flags.append(ref) + + return pack(flags_format, *flags) + + def encode_data_length(self, value, segment): + data = pack(">L", value) + data += segment["raw_data"] + return data + + def get_eop_segment(self, seg_number, page_number): + return { + 'data_length': 0, + 'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE}, + 'number': seg_number, + 'page_assoc': page_number, + 'raw_data': b'', + 'retention_flags': { + 'ref_count': 0, + 'ref_segments': [], + 'retain_segments': [] + } + } + + def get_eof_segment(self, seg_number): + return { + 'data_length': 0, + 'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE}, + 'number': seg_number, + 'page_assoc': 0, + 'raw_data': b'', + 'retention_flags': { + 'ref_count': 0, + 'ref_segments': [], + 'retain_segments': [] + } + } diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index c6e8d86..96e255f 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -27,7 +27,7 @@ LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) - +LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) ## PDF Objects ## @@ -275,6 +275,8 @@ class PDFStream(PDFObject): # This is probably a JPG stream - it does not need to be decoded twice. # Just return the stream to the user. pass + elif f in LITERALS_JBIG2_DECODE: + pass elif f == LITERAL_CRYPT: # not yet.. raise PDFNotImplementedError('/Crypt filter is unsupported') diff --git a/samples/contrib/pdf-with-jbig2.pdf b/samples/contrib/pdf-with-jbig2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b110fb44180da31490d917d21d0c867107969a6b GIT binary patch literal 10213 zcmai)1yEc|x9@Q$fuKPLcNiGl-QC?iz~I3hf(3VigpdHi-Q5Z9?(XjLNX|L;p6|Z< z>UGuL-TU9NRzqS?)GK?CPgC) zGo;tMqnVvEfaSGY8Hq{R%n9t`Xkz9B0R1J%nVDJ{34`4LI>1+fgNp^g&dj3sIuPt^ zHi2Iw%odiLsrnVU0! zNy_%s3E{tMk-uxH*Y!0OjIGT~oB_Yx)s#|{719z@1_IdsF@W?hBfrIejL4eVSvXq) zI5{{tk^bK1-zy<8skj(B|JnE-qex7`R?bd}W{x6YTYIqG?-(#6F=<$tzD@_RaIqsX zNt#(%SiVMvjqSDL&m1`;rw;&UN0-;IiCft?n>hlQq=lshAwnz{V5RgC!$kz%30s>_f0s`>{0)mn3&lPt4Dnmg) zzV_m^BS1jCygoYqzP#}`2WLBO*Hg%O)Q=HN7MNKHiZ=dt{B+U2-~78q_w1&oMSM8h~<{G91=eadBAleT)cRVi#D(``Jrz{+Goeo6sH3O zrv#}BbHsAGjUf=(Sj(RQ((;w>_R?Fw9&qo~V0b>p_Wp=_^WvwP@v(xlWSLVO)t`6` zdbIAdh*i<1(U%kw!pyz>?*047gAGl_bT|#7RB9i*@v-G=3PG(-xptwwJYw*^%uz}_ssy=ITwj>$YS?f~`qgG=^ap zBo`kgmH=<%S@3#WlqFb$L%Haz9D(qki@((X&=|_-^XZAFUa>U_8|Bp z@g~D%Z5Xx84HlFWn0=iQu93Q3^a%cg{Ko93;ri5T%%DN*f zO}QpUjd?TjFqzWZuzEBOlT?_Na?w48a4uZ<2MZ-bpg)@=T9hYx^^G!{Gafke~O+4gp3KR zL=rhuW=5#)7TeOT?k#Uhy{R3B^c2N56MM6w8EP{&tCFbmq9_AF-#tIE!fhUnqmHMj43K9QD=)?)Y$AXBv}u&pfXV&VwY+0V6^x|s*W3eI(+hg zldG{$gy2yXWuhCv)1UD{-))Rs?SgH47Mkf8_ZsOvA81lQBgt>Pq3%j8!GjFT9tj0y7u#Cn(n)mK8{3sR3V1NrV}wK^%i-~i(T`Qr zReEpRqKjun!^VNprn@}LcT%)Us-3^^Y+4=DR)pD;D47fLsyn|4&@R$rY2)&_!Nm3G zKiF!oBS?Y7Oc!`VvBz&?DiIhr$ts|k-G%VnqIUiE@ut8x)*_#EEK#idb0+!c1swHq zcyQ`Iw_=hOVtqDj27yTisklS=G%VDNmkWzJ*0VrJ8x{KDXMV_@D}mh?P4-vO)zK#}0}gM4jZJSVt0(b^)8^0`yrHmo z5wU)0L7TVOU87Ubs01IEE5OL!!8}Ne@}I7OWP7MER0J$1IEf!#$*HhqBYc8$OjkV9h6-G~ZkP z_&%k7r56JV80K>cT+n~spxQCZt$SkH@5vpj#o=kpZ7M~{Oq z=l-`;Y44rVCaXyMTlyOvUhJm&x7D}y_}ui!F+Ul<0oP!~Mx%dke1D+dVjT9%(whpa z25BMGM4v6Wu5@=SqB^52&ESN}%{gJATBeJEJ$&4l5y;A|BY2*C0W4 zl}%fnkoy?k4g6x7Lm!1Mos9dj+05TF4}=8fHqin5d(fMV)shzhHH1xd10B?0iv%^D_#!JG_D^EKcFiqA#u{QYT??Pb*Bs#jx~ zlib?EVvsU24C&5bzKE5`62GKqH7!?8eikzMQKT}^MV_8C+7W6*d5(QJ$@}eo9s<@` zZyMv=l3)6OH-I$yPF`@$5GVXgKb{rhS*xO?F=5IS`EW!dYwZc52)|d2RlCN6u?CDq z7lTP`TLdv{3K~&QDDz=eGYhT+Z8^es-;IaW`qmw-eV=(&BfyP97?y-^izg6Y3jaGN zf6uUfrbV+VFSB6xi66HQq;RilSseV#s|)>@b3S+gbZ= znwgE0-O@dR_VB=AtN}IRXHv!k_dv6{Tcb|WVfwtD5$m3~>~NdI7h|a~sq7pI z$Q}~vzaI?F*?Iiw1vzE8XnK1d$dSzL^x1KCZhkP*b;Fd0s>Wx6NQ$4@pT5vzny;6aOgr<$Zu8MuG@%+gxBtPQac9XHT#bSZZJZAk^-y_1W(eh+U}c^E`J;~>Veyx(Dy1mQ z0r&FE+89-`>M-*;t115v)BukX7zDyfi-LX*_P1+(A2l`EhELbpJ>B29jNiylPAzXN z%i+6<3WmG*Eah@P!bUl5*x5{fvmHwn3EcK!GPWPmj>pIusy>@)18I2j-br8kElowL z%@5iqrGkW!;Z z2lltL+oHF2EO;dyQQ@s4ehH|5u_tf%f%pCTfI)!2Qe_Ia?<6N1w{h+{m@Pug0@_}Y zNdyVQHy#-#W*fcN>*{0NNFuetUmH5p7YV-EHz;7G-&Sb3; zvW+8)@Fh4)Pdf$KZnWUM=t5QV_6_D)M#2^KXyy;PQ7}KUGSy8&-!-ijhus|;e7#84 zX!hbacC)83thZ6+@!X;zRwR{8t_x3m-5vuDW#|{zSv`>-O4?0kvOqqI2@SXd5m_^b zt%km|Cq;CA^5IW=pRME0h^%M%A(qih3nA4iZ^vxoh&y8Tz4!6tJcKo=D{-E#mN5r% zk>n0HvP}$HzM0;C^NHx&y`Ri2>4}hnl9Fp@a}7AL5u<8O$HG1j?(n7NWw?r|E;3+( zq$V_>7UZ4imf8R{j4>wmUsj>kl{SQyPz&uKIK;@{*~T2fXv|*|vYCW`$RSY7rdcKD z^Lj7c!NX>YV*I7ZL<*qowA^{csvR-P2P~@p>T2V6(rs3;3SPSd|7z7;Q`ef2)Ye>E zz@sgc#xHaAQa1Nosr!~ITdhr}A9vQP=Z5-u5>{3P9vAdeN zdt;#ec8?FF#fhrv267B4jobqhs-QWs4>a;k>8haaajm9G+@>`?7s)V>zAzdYma8FE zh^6Ois++J(?(_%?lK4ec1Mq)R*1&@?zskU*BH8fgaLdu~jhFUQvd0^pWy=1Lwf%KN(0Q^Jm3HGNq>ln9+Z1Wvn{jnbxgaB?ydXxet zWaI}q8emBRBY7ml=6JHrmR-rA%hW7NI;! z3(Q>KST(-V51VCc9>Dgh5-}!X^0g_H^FNP!4gnijM(>h=F1v|6BRy=axWHS zVo3X?@JW{do|7g69ytRyj89B!noZ7zfdI!vj)!E9F~u(TaEtyi%exOb9-47n=>${X zRaMgsTe{kE7$!b=r?CddFDQ_rTXT+D{?K4QT;0HFgFj6=7td$wj#nYGvksK0PG*^0 z?4%T2wF~DW6<*YX6nkYnDkbH!z6omRj&N>9sjH+4(yv)m?qu*iSqp0D91g73+P}Rf zXngu~u3>TtIY{s9*;xIZE9CR-v)rtLWW$uZRp>7~ba^cqzTch?^1C!g8rG8V|K<+n89<$>f23T-#}Sxwd9zCcu2$}6LRM31BDdWcOC%)o3P<839Pg8<)v zL)u=Nh52Rij)c8p6XESsaV&KWV+J)5M0(84H^+dzS!JAi>xgL=|) z^^hIv#n#yHcJ{{l)QULmp_($gWE7vs=te+L&xW|Zw~W%@c0V2Bxh`YbS?_vDSd3Xo zW@EaN^8?a!h#ZLt%j_a~NwDX~pd~vg7BWMS0Be9mKMq{GPrPrvP0+;_S+WU14BOES zYz0={>3#NY=RvQJgk~^AU66M8uWCgoxpW@k)BJkXcig&_zV%;ve`=MQ@^4D62V+Ce zF8%D%^59D|rAaA`R%_`*Y&DfAQ;3%GtlYpn|JS z4jJL9$U4NhOYi1m3fiQ-2HTgZu3#I4q_A}9)=l~9X^q2U*Pdr&LyqS6RLC{GfPH0n zRF%b|WfJTTY0SD?HIVX8JvVw^Q60%It9L+GvWyQBMhvD{iEI}W@}A{9 zmYB7QHo_*BNVOH%32@35%`wgaP$|2Yd_h%Vzf8>*-Bdjn&BR1dWp&l89j!aMEt@X4 ztNKBm&d*VC`VDC{7OLrq`6VXHMtcFF>6|WFg{5xZ z!NO<%FdWi(vu|2Zwh`3%5cTppFzOSRAF{|$!nV|#|29pR)(C2)ZB(S1OzYcjkdPU3kFlEGLG`a8>#HD!_Z>w6kSP#GKl3kFs z*^r*(FG>r~Ga1mN)6!?Hx$%M>$3eFld`J|_whAtpW9G89D_tFvhxw&1ml2*wy-9s7 zpClJJJbeg9r}^z_crKd=;y0vG;>5-kTg%DWw?1vMuaP}}xs;^ggxo8Y3m!ZlKibs`Gb@I9J7G15&Tczih}x;By2r6yO&xp- zDLB8C%2!;)Y8k6xJXx01b*)R8eJGn@S3NAe=j_ZS)5eg;#sxvfOYaN{i~AP_s3?5g zjjEO|5kZ9l;|6V834F(-Ne+;F;Jqes9(;SRQbu7of6FY3zet8f>8%P;83t%Cj5Wa0Ywh zxI}MR9>+vW@@ed}E%rW96{2foA)pACPkOOh_$BGnvRktkJh31&$7hj1a;unvB$P&R zrcKT^t1dcznKcD(_^Kih7SmW@6>k5ivCOs8k2Kq{JoSUe%J(j_$|!WI!5A$}??tph z+(&#8X2LHgo%sum zb%5nTwSRwk*$fRw`Ny>O(q?J;ckF>KkkC9XGAPR{;v?H1N9^fb#+lbzP0Y{ggIY-@ zFF_Qjq0eZm(u)HXBp~#a>Q+Ae>oz(Oj9$+ENRfzj_L9JM>%MnKylVyc)Y6yPfg&X}kNroqu6lE>Pl3fKs5 zpM{~&P9fQlKVrcxE1vs9RcMJ2gd@!%Eka33rRjRlx?x~5jzp8-qGZzj+yvBhiTo6Z z#Vxe5&dP6KgI6-ut~1nM?(j*J7(2bxoK-+U7aDxgF;An%ycKbF-Mg8d0_*g^e^fLDe?leQFL%OV-WOW*FiQ z&5q#Q+)&!)5)W)DA!pyd#D`-}Sy)-8B;(E(XgGw?JH$v3 zEt-Z^L@khnldC9ttB+vt%TlQSPJ2^KMy)i4#2c>K+^v4U z(1rVH{nX}67S338?o*Z34XLQ@(R>U5kpa{+2^F1wPQ>zmh|H66w*C1SeW z;<1&w5gdaVNpfKrFNlIK%?(*QevH^`g;1Yed%`Y(;v%agt&b7+WzRP(tVyju;DuZU zsL?`ZtT0rT^^xPqt(dGsE=hb9^mY)1Ufs}I(%sreTpM-tjUOgqWrAsQ(+fG-{p_uX zTi(tl-=82^F8n|P)s47H7yo1Y$Xz3sojTZpsmZ!=lf(6xlyf&K?v|IlUzh)ob@<(mO~gpHXxuL}=5p@a%>n--`Pps?C zMT*etpeDN;w`o?MuV7`-z3 zPO`Gk={r;MCLchl>;|<)p%{CMJ+QmnzJHd4Vfk}-2J?DQ_Ioi{IN?7Q;LiTC0H5{O z@~@o;2(J#Thn>IAadn}b8feBi3=`qFVA>qcE?*d`;loJ$LBxk*?cvl(tx>}op&BrH z=QK0*V>`)ST^_H}Y3jZ{*k>&V)d63-nk7K5p;6Ff7Vgx3R?jOg9TDNA8AW4UrMwYu zMeRca8tecb{AXBhN>?DM-CB<+manS5P|+XZippI z*AuqQ=pPSNOv$_9G-QLNlF>Xon*PQ3?{&9h%n!H=0GTf6ZwS7+Aq^-b`F)96WO9Eb zq44Z^FIBLqqkceB8# z{dQGDWYxc($lv|8MzLeeGeP4~NbIDX9B25EK%3n6t*F=8xN-=*CatK+w0H@IJJb6l zZr4uLQb6~E?r4FPlg)!=nMW49ev0jt3M!^6o^;_TZnLiXx1oAQgG@}y)|3g_)Yt<* zm~kxGwAkpHErU1#9t%ESa(o-v1(I)!TMYsX=W2+?adL0)u*^99nU!gxg-R4V;*l7i z!5>Z1L3UM_Jh{c9pVcfKx3Vn730yvHcxohUrITEKDJhM{eKX@FEUspM3fYI3Tm_~i zURzoU6>AltOKR8(8nk_Q-c(KB^dfASD6Jjl+7tPug5|Kt{vs6R-^MCh_7Gs@{FUL#E!!maogwtFEHiFH}TcpH1gv7Si4CBMw#n= zCVaO_0tg%$5Obk0(4!Qb)tOy~Pj>Tlo1k%7jbBG*l@bJ#@`CM|?et1K5Fw||cZo$a zUdx!#e1ZDeXhdaSiGg>PE-g`!UKH-Ex2Hb<-6^f9O@VB61zb@KgDpYfx@362hMgnu z?U({wcXRLC3kflZ8{EtT;)@`JDrQ$!T1C4^S$&lQLtCH1(>Hm%JdUWMKf07ku{{W$v@5-!WV0x2+wh?ZzExD2okG# zNQccNYQxm!i^_Fo=slKB^H)CORCBsR2%f78Z#=WOq>9tJ2sqrAu!h#Nuk4OcHQl6_ zJwz3Mb?@GyuzV+A$Sr!t6bH7JyP43fH6UwEBgpE_ZR+bIKbqPD;7$-1U$e30rei5%_mrBbOnMk(iIn#k$7o-@VL z1Pmaz*t5;bYRD(N#s!+^OIBfztA-O!ojJ{R+RpxGB z7#OaZB04f>@qW^&zgx3s(s{B>JvK7<`hrOTOLU)0hqXLoQbCm%JW#(B#-dDnR4kA@ z*!Ym+g5T-!bm2762S@=aGO(x&79#XbG3UY_t_tgo8J*B_A7LC_H_}~$&q>AId#h9njX2TG zx);Bg_&v5RV^2bFz%iz#v)r1U%N33^>%opl82LR)9vzVkd00uV7jTGe%o|JN9SgT zA%Lkpx+uq#@N3#+__jP(Q$8=!-DTtRnfL|jJKfvMa0z6N7T>f!!R;S2 zhf2KtE3*Y296kB1Xdzp=5WZPCVF)yv0=2rqOmRz@#lRQND@@+^uu;lNB=pGXH>7fl zo)vqN^Gb|`rQ3ZXLF>H6=a~QH++RSFh>^3A4cOv0lVtHqApM0T z{R{N@E&Q{c|3;XckeFnwOq~EazX>PhS9C}O?D7ZZ`Pa)~y2AFr{2KrYI0_|IzkjWGR{-ba zyA~jT85ZXExl7n3#)z$k-HYQhH^{L0Oh0sxA4i38BSFTBL;b%-{Oh-Hb2KwY0s^>@eka*q1;Eb624Vx41OC>y zKpd~B`?~<_{!3$J`M)$EGw78t`_FMeAoD8__MaLTE9Ywl{HF$F0lg;7e`;JTEU(%B zKQs=u*Sz~jLvnUBva&IA{DaY{Sb3Pe?&Fn(1HZy!zq9X^6SFf11AfQ&cUu&o!wY0J zGBxHj1(|Yim;qTh*^PnbAahm@PBtzURx>kZQ+~kze~8}izpRt9k)!jU;Bj$sG9yt? Ih$)EwAGZd3AOHXW literal 0 HcmV?d00001 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 188f652..3b09140 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -13,6 +13,7 @@ def full_path(relative_path_to_this_file): return abspath + def run(datapath, filename, options=None): i = full_path(datapath + filename + '.pdf') o = full_path(filename + '.txt') @@ -89,5 +90,27 @@ class TestDumpImages(object): self.extract_images(full_path('../samples/nonfree/175.pdf')) + +class TestDumpImages(object): + + @staticmethod + def extract_images(input_file): + output_dir = mkdtemp() + with NamedTemporaryFile() as output_file: + commands = ['-o', output_file.name, '--output-dir', output_dir, input_file] + pdf2txt.main(commands) + image_files = os.listdir(output_dir) + rmtree(output_dir) + return image_files + + def test_jbig2_image_export(self): + """Extract images of pdf containing jbig2 images + + Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46 + """ + image_files = self.extract_images(full_path('../samples/contrib/pdf-with-jbig2.pdf')) + assert image_files[0].endswith('.jb2') + + if __name__ == '__main__': nose.runmodule()