From f8e6ad6ac11a5f22c3819cc2064b8b17abf97a9d Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 25 Oct 2020 14:37:12 +0100 Subject: [PATCH] Remove support for non-standard output streams that are not binary by removing the try-except check that writes a unicode character to the stream (#523) Closes #191 * Remove support for non-standard output streams that are not binary by removing the try-except check that writes a unicode character to the stream * Add docstring * Fix flake8 --- CHANGELOG.md | 1 + pdfminer/converter.py | 35 +++++++++++++++++------------------ tests/test_converter.py | 26 ++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9a02a4..9428db9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## Removed - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525)) +- Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523)) ## [20201018] diff --git a/pdfminer/converter.py b/pdfminer/converter.py index dcdb055..0c5742c 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,3 +1,4 @@ +import io import logging import re import sys @@ -167,24 +168,22 @@ class PDFConverter(PDFLayoutAnalyzer): laparams=laparams) self.outfp = outfp self.codec = codec - if hasattr(self.outfp, 'mode'): - if 'b' in self.outfp.mode: - self.outfp_binary = True - else: - self.outfp_binary = False - else: - import io - if isinstance(self.outfp, io.BytesIO): - self.outfp_binary = True - elif isinstance(self.outfp, io.StringIO): - self.outfp_binary = False - else: - try: - self.outfp.write("é") - self.outfp_binary = False - except TypeError: - self.outfp_binary = True - return + self.outfp_binary = 
self._is_binary_stream(self.outfp) + + @staticmethod + def _is_binary_stream(outfp): + """Test if a stream is binary or not""" + if 'b' in getattr(outfp, 'mode', ''): + return True + elif hasattr(outfp, 'mode'): + # output stream has a mode, but it does not contain 'b' + return False + elif isinstance(outfp, io.BytesIO): + return True + elif isinstance(outfp, io.StringIO): + return False + + return True class TextConverter(PDFConverter): diff --git a/tests/test_converter.py b/tests/test_converter.py index c36a572..cfd1f2a 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -1,6 +1,9 @@ -from nose.tools import assert_equal +import io +from tempfile import TemporaryFile -from pdfminer.converter import PDFLayoutAnalyzer +from nose.tools import assert_equal, assert_false, assert_true + +from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter from pdfminer.layout import LTContainer, LTRect, LTCurve from pdfminer.pdfinterp import PDFGraphicState @@ -99,3 +102,22 @@ class TestPaintPath(): analyzer = PDFLayoutAnalyzer(None) analyzer.set_ctm([1, 0, 0, 1, 0, 0]) return analyzer + + +class TestBinaryDetector(): + def test_stringio(self): + assert_false(PDFConverter._is_binary_stream(io.StringIO())) + + def test_bytesio(self): + assert_true(PDFConverter._is_binary_stream(io.BytesIO())) + + def test_tmpfile(self): + with TemporaryFile(mode='w') as f: + assert_false(PDFConverter._is_binary_stream(f)) + + def test_binary_tmpfile(self): + with TemporaryFile(mode='wb') as f: + assert_true(PDFConverter._is_binary_stream(f)) + + def test_non_file_like_object_defaults_to_binary(self): + assert_true(PDFConverter._is_binary_stream(object()))