Add support for JPEG2000 image encoding

pull/659/head^2
Pieter Marsman 2022-01-23 21:17:47 +01:00
parent b82229245a
commit 708dd20465
3 changed files with 18 additions and 1 deletions

View File

@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
## [20211012] ## [20211012]

View File

@ -9,7 +9,8 @@ from .layout import LTImage
from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB from .pdfcolor import LITERAL_DEVICE_RGB
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, \
LITERALS_JPX_DECODE
def align32(x: int) -> int: def align32(x: int) -> int:
@ -99,6 +100,16 @@ class ImageWriter:
i.save(fp, 'JPEG') i.save(fp, 'JPEG')
else: else:
fp.write(raw_data) fp.write(raw_data)
elif ext == '.jp2':
# If we just write the raw data, most image programs
# that I have tried cannot open the file. However,
# opening and re-saving it with PIL produces a file that
# seems to be easily opened by other programs.
from PIL import Image
raw_data = image.stream.get_rawdata()
ifp = BytesIO(raw_data)
i = Image.open(ifp)
i.save(fp, 'JPEG2000')
elif is_jbig2: elif is_jbig2:
input_stream = BytesIO() input_stream = BytesIO()
input_stream.write(image.stream.get_data()) input_stream.write(image.stream.get_data())
@ -156,6 +167,8 @@ class ImageWriter:
filters = image.stream.get_filters() filters = image.stream.get_filters()
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg' ext = '.jpg'
elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
ext = '.jp2'
elif is_jbig2: elif is_jbig2:
ext = '.jb2' ext = '.jb2'
elif (image.bits == 1 or elif (image.bits == 1 or

View File

@ -34,6 +34,7 @@ LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
LITERALS_JPX_DECODE = (LIT('JPXDecode'),)
if sys.version_info >= (3, 8): if sys.version_info >= (3, 8):
@ -358,6 +359,8 @@ class PDFStream(PDFObject):
pass pass
elif f in LITERALS_JBIG2_DECODE: elif f in LITERALS_JBIG2_DECODE:
pass pass
elif f in LITERALS_JPX_DECODE:
pass
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
# not yet.. # not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError('/Crypt filter is unsupported')