From 33d7dde4d1a5684759070115d0495c48ba8f886d Mon Sep 17 00:00:00 2001 From: htInEdin Date: Mon, 27 Sep 2021 19:30:40 +0100 Subject: [PATCH] Fix bug: _is_binary_stream should recognize TextIOWrapper as non-binary, escaped \r\n should be removed (#616) * detect TextIOWrapper as non-binary * I don't understand the CHANGELOG.md format, hope this is good enough * Delete \\\r\n in Literal Strings (ref. section 7.3.4.2 of PDF32000_2008) * Keep Travis CI happy * Added test * Remove pdfminer/Changelog * Prettify _parse_string_1 * Add CHANGELOG.md * Satisfy flake8 * Update CHANGELOG.md * Use logging.Logger.warning instead of warnings.warn in most cases, following the Python official guidance that warnings.warn is directed at _developers_, not users * (pdfdocument.py) remove declarations of PDFTextExtractionNotAllowedWarning, PDFNoValidXRefWarning * (pdfpage.py) Don't import warnings, don't use PDFTextExtractionNotAllowedWarning * (tools/dumppdf.py) Don't import warnings, don't use PDFNoValidXRefWarning * (tests/test_tools_dumppdf.py) Don't import warnings, check for logging.WARN rather than PDFNoValidXRefWarning * get name right * make flake8 happy * Revert "make flake8 happy" This reverts commit 45927696869abff5041cc5a338aa9390cd98606e. * Revert "get name right" This reverts commit 80091ea211c279511d206d14b2ad6cb0fb887a1f. * Revert "Use logging.Logger.warning instead of warning.warn in most cases, following" This reverts commit 3c1e3d66064e0c42d86a7191c357e16d1406449d. * Revert "Merge branch 'preferLoggingToWarning' into hst" This reverts commit 9d9d1399216d589ab600755d6548240d935c3ff3, reversing changes made to 80091ea211c279511d206d14b2ad6cb0fb887a1f. * Revert "Revert "Merge branch 'preferLoggingToWarning' into hst"" This reverts commit b3da21934d29c5cfa9354d7a41018368b6d51e9f. Co-authored-by: Henry S. 
Thompson Co-authored-by: Pieter Marsman --- CHANGELOG.md | 2 ++ pdfminer/converter.py | 2 ++ pdfminer/psparser.py | 17 +++++++++++++++-- tests/test_converter.py | 3 +++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 919e06d..4a508b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) - Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610)) - `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610)) +- Using `io.TextIOBase` as the file to write to ([#616](https://github.com/pdfminer/pdfminer.six/pull/616)) +- Parsing \r\n after the escape character in a literal string ([#616](https://github.com/pdfminer/pdfminer.six/pull/616)) ## Removed - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index de58f30..812f668 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -181,6 +181,8 @@ class PDFConverter(PDFLayoutAnalyzer): return True elif isinstance(outfp, io.StringIO): return False + elif isinstance(outfp, io.TextIOBase): + return False return True diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 8c13ec4..10cf05a 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -444,16 +444,29 @@ class PSBaseParser: return j+1 def _parse_string_1(self, s, i): + """Parse literal strings + + PDF Reference 3.2.3 + """ c = s[i:i+1] if OCT_STRING.match(c) and len(self.oct) < 3: self.oct += c return i+1 - if self.oct: + + elif self.oct: self._curtoken += bytes((int(self.oct, 8),)) 
self._parse1 = self._parse_string return i - if c in ESC_STRING: + + elif c in ESC_STRING: self._curtoken += bytes((ESC_STRING[c],)) + + elif c == b'\r' and len(s) > i+1 and s[i+1:i+2] == b'\n': + # If current and next character is \r\n skip both because enters + # after a \ are ignored + i += 1 + + # default action self._parse1 = self._parse_string return i+1 diff --git a/tests/test_converter.py b/tests/test_converter.py index 99d1dc9..8781fa2 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -207,3 +207,6 @@ class TestBinaryDetector(): def test_non_file_like_object_defaults_to_binary(self): assert_true(PDFConverter._is_binary_stream(object())) + + def test_textiowrapper(self): + assert_false(PDFConverter._is_binary_stream(io.TextIOBase()))