diff --git a/CHANGELOG.md b/CHANGELOG.md index 02dc419..35b2ee1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) +- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) ## [20211012] diff --git a/pdfminer/image.py b/pdfminer/image.py index 83f9a7a..1a25006 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -9,7 +9,8 @@ from .layout import LTImage from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB -from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE +from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, \ + LITERALS_JPX_DECODE def align32(x: int) -> int: @@ -99,6 +100,16 @@ class ImageWriter: i.save(fp, 'JPEG') else: fp.write(raw_data) + elif ext == '.jp2': + # if we just write the raw data, most image programs + # that I have tried cannot open the file. 
However, + # opening and saving with PIL produces a file that + # seems to be easily opened by other programs + from PIL import Image + raw_data = image.stream.get_rawdata() + ifp = BytesIO(raw_data) + i = Image.open(ifp) + i.save(fp, 'JPEG2000') elif is_jbig2: input_stream = BytesIO() input_stream.write(image.stream.get_data()) @@ -156,6 +167,8 @@ class ImageWriter: filters = image.stream.get_filters() if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: ext = '.jpg' + elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE: + ext = '.jp2' elif is_jbig2: ext = '.jb2' elif (image.bits == 1 or diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 5e0ef60..e6d94bd 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -34,6 +34,7 @@ LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) +LITERALS_JPX_DECODE = (LIT('JPXDecode'),) if sys.version_info >= (3, 8): @@ -358,6 +359,8 @@ class PDFStream(PDFObject): pass elif f in LITERALS_JBIG2_DECODE: pass + elif f in LITERALS_JPX_DECODE: + pass elif f == LITERAL_CRYPT: # not yet.. raise PDFNotImplementedError('/Crypt filter is unsupported')