Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)
* PR Review - move open_filename to utils
2020-03-26 21:52:00 +00:00 · 2020-03-26 21:52:00 +00:00 · 1a4a06da9f
parent 1cc1b961c5
commit 1a4a06da9f
4 changed files with 61 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
 - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))

+### Added
+- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
+
 ### Changed
 - Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))

--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -11,6 +11,7 @@ from .layout import LAParams
 from .pdfdevice import TagExtractor
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
 from .pdfpage import PDFPage
+from .utils import open_filename


 def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in a PDF file.

-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
    if laparams is None:
        laparams = LAParams()

-    with open(pdf_file, "rb") as fp, StringIO() as output_string:
+    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects

-    :param pdf_file: Path to the PDF file to be worked on
+    :param pdf_file: Either a file path or a file-like object for the PDF file
+        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
    if laparams is None:
        laparams = LAParams()

-    with open(pdf_file, "rb") as fp:
+    with open_filename(pdf_file, "rb") as fp:
        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -11,6 +11,28 @@ import chardet  # For str encoding detection
 INF = (1 << 31) - 1


+class open_filename(object):
+    """
+    Context manager that allows opening a filename and closes it on exit,
+    (just like `open`), but does nothing for file-like objects.
+    """
+    def __init__(self, filename, *args, **kwargs):
+        if isinstance(filename, str):
+            self.file_handler = open(filename, *args, **kwargs)
+            self.closing = True
+        else:
+            self.file_handler = filename
+            self.closing = False
+
+    def __enter__(self):
+        return self.file_handler
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.closing:
+            self.file_handler.close()
+        return False
+
+
 def make_compat_bytes(in_str):
    "Converts to bytes, encoding to unicode."
    assert isinstance(in_str, str), str(type(in_str))
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -4,12 +4,19 @@ from helpers import absolute_sample_path
 from pdfminer.high_level import extract_text


-def run(sample_path):
+def run_with_string(sample_path):
    absolute_path = absolute_sample_path(sample_path)
    s = extract_text(absolute_path)
    return s


+def run_with_file(sample_path):
+    absolute_path = absolute_sample_path(sample_path)
+    with open(absolute_path, "rb") as in_file:
+        s = extract_text(in_file)
+    return s
+
+
 test_strings = {
    "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
                   "H e l l o  \n\nW o r l d\n\n"
@ -20,19 +27,34 @@ test_strings = {


 class TestExtractText(unittest.TestCase):
-    def test_simple1(self):
+    def test_simple1_with_string(self):
        test_file = "simple1.pdf"
-        s = run(test_file)
+        s = run_with_string(test_file)
        self.assertEqual(s, test_strings[test_file])

-    def test_simple2(self):
+    def test_simple2_with_string(self):
        test_file = "simple2.pdf"
-        s = run(test_file)
+        s = run_with_string(test_file)
        self.assertEqual(s, test_strings[test_file])

-    def test_simple3(self):
+    def test_simple3_with_string(self):
        test_file = "simple3.pdf"
-        s = run(test_file)
+        s = run_with_string(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple1_with_file(self):
+        test_file = "simple1.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple2_with_file(self):
+        test_file = "simple2.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple3_with_file(self):
+        test_file = "simple3.pdf"
+        s = run_with_file(test_file)
        self.assertEqual(s, test_strings[test_file])