diff --git a/CHANGELOG.md b/CHANGELOG.md index bab05dc..c79674c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679)) - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626)) - Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680)) +- Installation of Pillow as an optional extra dependency ([#714](https://github.com/pdfminer/pdfminer.six/pull/714)) ### Fixed - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) diff --git a/README.md b/README.md index aa1318c..ae25d0e 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,10 @@ How to use `pip install pdfminer.six` +* (Optionally) install extra dependencies for extracting images. + + `pip install 'pdfminer.six[image]'` + * Use command-line interface to extract text from pdf: `python pdf2txt.py samples/simple1.pdf` diff --git a/docs/source/index.rst b/docs/source/index.rst index 7851b8e..a6e666e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,13 @@ Before using it, you must install it using Python 3.6 or newer. $ pip install pdfminer.six +Optionally install extra dependencies that are needed to extract jpg images. + +:: + + $ pip install 'pdfminer.six[image]' + + Contributing ============ diff --git a/pdfminer/image.py b/pdfminer/image.py index 3123326..fb30031 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -11,6 +11,12 @@ from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_RGB from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE +PIL_ERROR_MESSAGE = ( + "Could not import Pillow. This dependency of pdfminer.six is not " + "installed by default. You need it to save jpg images to a file. 
Install it " + "with `pip install 'pdfminer.six[image]'`" +) + def align32(x: int) -> int: return ((x + 3) // 4) * 4 @@ -93,8 +99,10 @@ class ImageWriter: raw_data = image.stream.get_rawdata() assert raw_data is not None if LITERAL_DEVICE_CMYK in image.colorspace: - from PIL import Image # type: ignore[import] - from PIL import ImageChops + try: + from PIL import Image, ImageChops # type: ignore[import] + except ImportError: + raise ImportError(PIL_ERROR_MESSAGE) ifp = BytesIO(raw_data) i = Image.open(ifp) @@ -104,12 +112,15 @@ class ImageWriter: else: fp.write(raw_data) elif ext == ".jp2": + try: + from PIL import Image + except ImportError: + raise ImportError(PIL_ERROR_MESSAGE) + # if we just write the raw data, most image programs # that I have tried cannot open the file. However, # open and saving with PIL produces a file that # seems to be easily opened by other programs - from PIL import Image - raw_data = image.stream.get_rawdata() assert raw_data is not None ifp = BytesIO(raw_data) diff --git a/setup.py b/setup.py index db2e512..b9bb29d 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ setup( extras_require={ "dev": ["pytest", "nox", "black", "mypy == 0.931"], "docs": ["sphinx", "sphinx-argparse"], + "image": ["Pillow"], }, description="PDF parser and analyzer", long_description=readme,