Refactor ImageWriter and add method for exporting an image from bytes. (#737)

* Refactor ImageWriter and add method for exporting an image from bytes.

E.g. when the FlateDecode filter simply yields a flat sequence of RGB bytes.

* Added docstrings

* Add CHANGELOG.md

* Run black

* Run black
pull/747/head
Pieter Marsman 2022-03-22 20:58:16 +01:00 committed by GitHub
parent 894dabf264
commit 617e4c8388
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 128 additions and 83 deletions

View File

@ -15,6 +15,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
- `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734))
### Added
- Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737))
## [20220319]
### Added

View File

@ -2,14 +2,24 @@ import os
import os.path
import struct
from io import BytesIO
from typing import BinaryIO, Tuple, List, Any
from typing import BinaryIO, Tuple
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal # type: ignore[misc]
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
from .layout import LTImage
from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE
from .pdftypes import (
LITERALS_DCT_DECODE,
LITERALS_JBIG2_DECODE,
LITERALS_JPX_DECODE,
LITERALS_FLATE_DECODE,
)
PIL_ERROR_MESSAGE = (
"Could not import Pillow. This dependency of pdfminer.six is not "
@ -88,16 +98,44 @@ class ImageWriter:
os.makedirs(self.outdir)
def export_image(self, image: LTImage) -> str:
    """Save an LTImage to disk

    The image is handed to one format-specific writer, chosen from its
    stream filters, bit depth and colour space. Each writer opens and
    closes its own output file, so no file handle is held here.

    :param image: the image to export.
    :return: the name of the file that was written, as reported by the
        selected writer.
    """
    (width, height) = image.srcsize

    filters = image.stream.get_filters()

    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        name = self._save_jpeg(image)

    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        name = self._save_jpeg2000(image)

    elif self._is_jbig2_iamge(image):
        name = self._save_jbig2(image)

    elif image.bits == 1:
        # One bit per pixel: each row is padded up to a whole byte.
        name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

    elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
        # Three 8-bit components per pixel.
        name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

    elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
        name = self._save_bmp(image, width, height, width, image.bits)

    elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
        name = self._save_bytes(image)

    else:
        name = self._save_raw(image)

    return name
def _save_jpeg(self, image: LTImage) -> str:
"""Save a JPEG encoded image"""
raw_data = image.stream.get_rawdata()
assert raw_data is not None
name, path = self._create_unique_image_name(image, ".jpg")
with open(path, "wb") as fp:
if LITERAL_DEVICE_CMYK in image.colorspace:
try:
from PIL import Image, ImageChops # type: ignore[import]
@ -111,9 +149,18 @@ class ImageWriter:
i.save(fp, "JPEG")
else:
fp.write(raw_data)
elif ext == ".jp2":
return name
def _save_jpeg2000(self, image: LTImage) -> str:
"""Save a JPEG 2000 encoded image"""
raw_data = image.stream.get_rawdata()
assert raw_data is not None
name, path = self._create_unique_image_name(image, ".jp2")
with open(path, "wb") as fp:
try:
from PIL import Image
from PIL import Image # type: ignore[import]
except ImportError:
raise ImportError(PIL_ERROR_MESSAGE)
@ -121,14 +168,23 @@ class ImageWriter:
# that I have tried cannot open the file. However,
# open and saving with PIL produces a file that
# seems to be easily opened by other programs
raw_data = image.stream.get_rawdata()
assert raw_data is not None
ifp = BytesIO(raw_data)
i = Image.open(ifp)
i.save(fp, "JPEG2000")
elif is_jbig2:
return name
def _save_jbig2(self, image: LTImage) -> str:
"""Save a JBIG2 encoded image"""
name, path = self._create_unique_image_name(image, ".jb2")
with open(path, "wb") as fp:
input_stream = BytesIO()
global_streams = self.jbig2_global(image)
global_streams = []
filters = image.stream.get_filters()
for filter_name, params in filters:
if filter_name in LITERALS_JBIG2_DECODE:
global_streams.append(params["JBIG2Globals"].resolve())
if len(global_streams) > 1:
msg = (
"There should never be more than one JBIG2Globals "
@ -144,86 +200,71 @@ class ImageWriter:
writer = JBIG2StreamWriter(fp)
writer.write_file(segments)
elif image.bits == 1:
bmp = BMPWriter(fp, 1, width, height)
data = image.stream.get_data()
i = 0
width = (width + 7) // 8
for y in range(height):
bmp.write_line(y, data[i : i + width])
i += width
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
bmp = BMPWriter(fp, 24, width, height)
data = image.stream.get_data()
i = 0
width = width * 3
for y in range(height):
bmp.write_line(y, data[i : i + width])
i += width
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
bmp = BMPWriter(fp, 8, width, height)
return name
def _save_bmp(
    self, image: LTImage, width: int, height: int, bytes_per_line: int, bits: int
) -> str:
    """Save a BMP encoded image

    :param image: image whose decoded stream data holds the pixel rows.
    :param width: image width, passed to the BMP writer.
    :param height: image height; also the number of rows written.
    :param bytes_per_line: number of bytes of stream data consumed per row.
    :param bits: bit depth passed to the BMP writer.
    :return: the name of the file that was written.
    """
    name, path = self._create_unique_image_name(image, ".bmp")
    with open(path, "wb") as fp:
        bmp = BMPWriter(fp, bits, width, height)
        data = image.stream.get_data()
        # Rows are taken back to back from the stream data. The stride is
        # the row size in bytes, not the pixel width.
        for y in range(height):
            start = y * bytes_per_line
            bmp.write_line(y, data[start : start + bytes_per_line])
    return name
def _save_bytes(self, image: LTImage) -> str:
    """Save an image without encoding, just bytes

    The decoded stream data is interpreted as raw pixel data and written
    as a ``.jpg`` file through Pillow. The Pillow mode is derived from the
    bit depth and, for 8-bit images, from the number of bytes per pixel.

    :param image: the image to export.
    :return: the name of the file that was written.
    :raises ImportError: if Pillow cannot be imported.
    :raises ValueError: if the bit depth / data size combination does not
        map to a supported Pillow mode.
    """
    # Import and validate before creating the output file, so a failure
    # does not leave an empty file behind.
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)

    width, height = image.srcsize
    data = image.stream.get_data()

    # Integer arithmetic: the channel count is only meaningful when the
    # data length is an exact multiple of the pixel count.
    channels = 0
    pixel_count = width * height
    if image.bits == 8 and pixel_count > 0 and len(data) % pixel_count == 0:
        channels = len(data) // pixel_count

    mode: Literal["1", "L", "RGB", "CMYK"]
    if image.bits == 1:
        mode = "1"
    elif image.bits == 8 and channels == 1:
        # Pillow's name for 8-bit greyscale is "L"; "8" is not a mode.
        mode = "L"
    elif image.bits == 8 and channels == 3:
        mode = "RGB"
    elif image.bits == 8 and channels == 4:
        mode = "CMYK"
    else:
        raise ValueError(
            "Cannot export image with %d bits per component and %d bytes "
            "of data for %dx%d pixels" % (image.bits, len(data), width, height)
        )

    name, path = self._create_unique_image_name(image, ".jpg")
    with open(path, "wb") as fp:
        img = Image.frombytes(mode, image.srcsize, data, "raw")
        # State the format explicitly instead of relying on the file name.
        img.save(fp, "JPEG")

    return name
def _save_raw(self, image: LTImage) -> str:
    """Save an image with unknown encoding

    The decoded stream data is written unchanged. The file extension
    records the bit depth and the source dimensions.

    :param image: the image to export.
    :return: the name of the file that was written.
    """
    width, height = image.srcsize
    ext = ".%d.%dx%d.img" % (image.bits, width, height)
    name, path = self._create_unique_image_name(image, ext)

    # The context manager closes the file; no explicit close is needed.
    with open(path, "wb") as fp:
        fp.write(image.stream.get_data())
    return name
@staticmethod
def is_jbig2_image(image: LTImage) -> bool:
    """Report whether any filter on the image stream is a JBIG2 decode filter.

    :param image: the image whose stream filters are inspected.
    :return: True as soon as one filter name is in LITERALS_JBIG2_DECODE,
        otherwise False.
    """
    return any(
        filter_name in LITERALS_JBIG2_DECODE
        for filter_name, _params in image.stream.get_filters()
    )
@staticmethod
def jbig2_global(image: LTImage) -> List[Any]:
    """Collect the resolved JBIG2Globals of every JBIG2 filter on the image.

    :param image: the image whose stream filters are inspected.
    :return: one resolved ``JBIG2Globals`` entry per JBIG2 decode filter,
        in filter order; empty when the stream has no such filter.
    """
    return [
        params["JBIG2Globals"].resolve()
        for filter_name, params in image.stream.get_filters()
        if filter_name in LITERALS_JBIG2_DECODE
    ]

@staticmethod
def _is_jbig2_iamge(image: LTImage) -> bool:
    """Report whether any filter on the image stream is a JBIG2 decode filter.

    :param image: the image whose stream filters are inspected.
    :return: True if a filter name is in LITERALS_JBIG2_DECODE, else False.
    """
    filters = image.stream.get_filters()
    for filter_name, params in filters:
        if filter_name in LITERALS_JBIG2_DECODE:
            return True
    return False
@staticmethod
def _get_image_extension(
    image: LTImage, width: int, height: int, is_jbig2: bool
) -> str:
    """Choose the output file extension for an image.

    :param image: the image whose filters, bit depth and colour space
        decide the extension.
    :param width: width embedded in the ``.bmp`` / ``.img`` extensions.
    :param height: height embedded in the ``.bmp`` / ``.img`` extensions.
    :param is_jbig2: whether the caller classified the image as JBIG2.
    :return: the extension, including the leading dot.
    """
    filters = image.stream.get_filters()

    # JPEG and JPEG 2000 are only recognised when they are the sole filter.
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return ".jpg"
    if len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return ".jp2"

    if is_jbig2:
        return ".jb2"

    # BMP covers 1-bit images and 8-bit RGB or greyscale images.
    if image.bits == 1:
        return ".%dx%d.bmp" % (width, height)
    if image.bits == 8 and (
        LITERAL_DEVICE_RGB in image.colorspace
        or LITERAL_DEVICE_GRAY in image.colorspace
    ):
        return ".%dx%d.bmp" % (width, height)

    return ".%d.%dx%d.img" % (image.bits, width, height)
@staticmethod
def _create_unique_image_name(
dirname: str, image_name: str, ext: str
) -> Tuple[str, str]:
name = image_name + ext
path = os.path.join(dirname, name)
def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
name = image.name + ext
path = os.path.join(self.outdir, name)
img_index = 0
while os.path.exists(path):
name = "%s.%d%s" % (image_name, img_index, ext)
path = os.path.join(dirname, name)
name = "%s.%d%s" % (image.name, img_index, ext)
path = os.path.join(self.outdir, name)
img_index += 1
return name, path