From 1a4a06da9fe295920e23311e12f22a37a2799899 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Thu, 26 Mar 2020 21:52:00 +0000 Subject: [PATCH] Fix #392 Split out IO logic from high level functions (#393) * Allow file-like inputs to high level functions (#392) * PR Review - move open_filename to utils --- CHANGELOG.md | 3 +++ pdfminer/high_level.py | 11 +++++---- pdfminer/utils.py | 22 ++++++++++++++++++ tests/test_highlevel_extracttext.py | 36 +++++++++++++++++++++++------ 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc8f8b0..ee55820 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389)) - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386)) +### Added +- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392)) + ### Changed - Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382)) diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index c4b2e8d..3acad77 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -11,6 +11,7 @@ from .layout import LAParams from .pdfdevice import TagExtractor from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfpage import PDFPage +from .utils import open_filename def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', @@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None): """Parse and return the text contained in a PDF file. - :param pdf_file: Path to the PDF file to be worked on + :param pdf_file: Either a file path or a file-like object for the PDF file + to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse @@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, if laparams is None: laparams = LAParams() - with open(pdf_file, "rb") as fp, StringIO() as output_string: + with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: rsrcmgr = PDFResourceManager() device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) @@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, laparams=None): """Extract and yield LTPage objects - :param pdf_file: Path to the PDF file to be worked on + :param pdf_file: Either a file path or a file-like object for the PDF file + to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse @@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0, if laparams is None: laparams = LAParams() - with open(pdf_file, "rb") as fp: + with open_filename(pdf_file, "rb") as fp: resource_manager = PDFResourceManager() device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index a16d280..8531f79 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -11,6 +11,28 @@ import chardet # For str encoding detection INF = (1 << 31) - 1 +class open_filename(object): + """ + Context manager that allows opening a filename and closes it on exit, + (just like `open`), but does nothing for file-like objects. + """ + def __init__(self, filename, *args, **kwargs): + if isinstance(filename, str): + self.file_handler = open(filename, *args, **kwargs) + self.closing = True + else: + self.file_handler = filename + self.closing = False + + def __enter__(self): + return self.file_handler + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.closing: + self.file_handler.close() + return False + + def make_compat_bytes(in_str): "Converts to bytes, encoding to unicode." assert isinstance(in_str, str), str(type(in_str)) diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 4a48fef..c5c6f95 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -4,12 +4,19 @@ from helpers import absolute_sample_path from pdfminer.high_level import extract_text -def run(sample_path): +def run_with_string(sample_path): absolute_path = absolute_sample_path(sample_path) s = extract_text(absolute_path) return s +def run_with_file(sample_path): + absolute_path = absolute_sample_path(sample_path) + with open(absolute_path, "rb") as in_file: + s = extract_text(in_file) + return s + + test_strings = { "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" "H e l l o \n\nW o r l d\n\n" @@ -20,19 +27,34 @@ test_strings = { class TestExtractText(unittest.TestCase): - def test_simple1(self): + def test_simple1_with_string(self): test_file = "simple1.pdf" - s = run(test_file) + s = run_with_string(test_file) self.assertEqual(s, test_strings[test_file]) - def test_simple2(self): + def test_simple2_with_string(self): test_file = "simple2.pdf" - s = run(test_file) + s = run_with_string(test_file) self.assertEqual(s, test_strings[test_file]) - def test_simple3(self): + def test_simple3_with_string(self): test_file = "simple3.pdf" - s = run(test_file) + s = run_with_string(test_file) + self.assertEqual(s, test_strings[test_file]) + + def test_simple1_with_file(self): + test_file = "simple1.pdf" + s = run_with_file(test_file) + self.assertEqual(s, test_strings[test_file]) + + def test_simple2_with_file(self): + test_file = "simple2.pdf" + s = run_with_file(test_file) + self.assertEqual(s, test_strings[test_file]) + + def test_simple3_with_file(self): + test_file = "simple3.pdf" + s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file])