Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
pull/399/head
Jake Stockwin 2020-03-26 21:52:00 +00:00 committed by GitHub
parent 1cc1b961c5
commit 1a4a06da9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 61 additions and 11 deletions

View File

@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
### Added
- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
### Changed
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))

View File

@ -11,6 +11,7 @@ from .layout import LAParams
from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, codec='utf-8', laparams=None):
"""Parse and return the text contained in a PDF file.
:param pdf_file: Path to the PDF file to be worked on
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None:
laparams = LAParams()
with open(pdf_file, "rb") as fp, StringIO() as output_string:
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, codec=codec,
laparams=laparams)
@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, laparams=None):
"""Extract and yield LTPage objects
:param pdf_file: Path to the PDF file to be worked on
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None:
laparams = LAParams()
with open(pdf_file, "rb") as fp:
with open_filename(pdf_file, "rb") as fp:
resource_manager = PDFResourceManager()
device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device)

View File

@ -11,6 +11,28 @@ import chardet # For str encoding detection
INF = (1 << 31) - 1
class open_filename(object):
    """Context manager that opens a path, or passes a file-like object through.

    When ``filename`` is a ``str`` or a ``pathlib`` path object, the file is
    opened with the builtin ``open`` (forwarding ``*args`` and ``**kwargs``)
    and closed again on exit, just like ``open`` itself. Any other object is
    assumed to be an already-open file-like object: it is returned unchanged
    from ``__enter__`` and is NOT closed on exit, so the caller keeps
    ownership of it.

    :param filename: A ``str`` path, a ``pathlib`` path, or a file-like
        object.
    :param args: Positional arguments forwarded to ``open`` (e.g. the mode).
    :param kwargs: Keyword arguments forwarded to ``open``.
    """

    def __init__(self, filename, *args, **kwargs):
        # Local import keeps this class self-contained.
        import pathlib

        # Path objects are not ``str`` instances; without this conversion
        # they would be mistaken for file-like objects and never opened.
        if isinstance(filename, pathlib.PurePath):
            filename = str(filename)

        if isinstance(filename, str):
            self.file_handler = open(filename, *args, **kwargs)
            self.closing = True
        else:
            self.file_handler = filename
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Only close handles that this context manager opened itself.
        if self.closing:
            self.file_handler.close()
        # Returning False lets any exception from the with-body propagate.
        return False
def make_compat_bytes(in_str):
"Converts to bytes, encoding to unicode."
assert isinstance(in_str, str), str(type(in_str))

View File

@ -4,12 +4,19 @@ from helpers import absolute_sample_path
from pdfminer.high_level import extract_text
def run(sample_path):
def run_with_string(sample_path):
    """Extract text from a sample PDF, handing ``extract_text`` a path.

    The path is whatever ``absolute_sample_path`` returns for
    ``sample_path``; the extracted text is returned.
    """
    return extract_text(absolute_sample_path(sample_path))
def run_with_file(sample_path):
    """Extract text from a sample PDF, handing ``extract_text`` an open file.

    The sample is opened in binary mode and the file object itself, rather
    than its path, is passed to ``extract_text``; the extracted text is
    returned.
    """
    pdf_path = absolute_sample_path(sample_path)
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)
test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n"
@ -20,19 +27,34 @@ test_strings = {
class TestExtractText(unittest.TestCase):
def test_simple1(self):
def test_simple1_with_string(self):
test_file = "simple1.pdf"
s = run(test_file)
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple2(self):
def test_simple2_with_string(self):
test_file = "simple2.pdf"
s = run(test_file)
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple3(self):
def test_simple3_with_string(self):
test_file = "simple3.pdf"
s = run(test_file)
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple1_with_file(self):
test_file = "simple1.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple2_with_file(self):
test_file = "simple2.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple3_with_file(self):
test_file = "simple3.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])