Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
pull/399/head
Jake Stockwin 2020-03-26 21:52:00 +00:00 committed by GitHub
parent 1cc1b961c5
commit 1a4a06da9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 61 additions and 11 deletions

View File

@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389)) - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386)) - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
### Added
- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
### Changed ### Changed
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382)) - Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))

View File

@ -11,6 +11,7 @@ from .layout import LAParams
from .pdfdevice import TagExtractor from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage from .pdfpage import PDFPage
from .utils import open_filename
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, codec='utf-8', laparams=None): caching=True, codec='utf-8', laparams=None):
"""Parse and return the text contained in a PDF file. """Parse and return the text contained in a PDF file.
:param pdf_file: Path to the PDF file to be worked on :param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt. :param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract. :param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse :param maxpages: The maximum number of pages to parse
@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None: if laparams is None:
laparams = LAParams() laparams = LAParams()
with open(pdf_file, "rb") as fp, StringIO() as output_string: with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, codec=codec, device = TextConverter(rsrcmgr, output_string, codec=codec,
laparams=laparams) laparams=laparams)
@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, laparams=None): caching=True, laparams=None):
"""Extract and yield LTPage objects """Extract and yield LTPage objects
:param pdf_file: Path to the PDF file to be worked on :param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt. :param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract. :param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse :param maxpages: The maximum number of pages to parse
@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
if laparams is None: if laparams is None:
laparams = LAParams() laparams = LAParams()
with open(pdf_file, "rb") as fp: with open_filename(pdf_file, "rb") as fp:
resource_manager = PDFResourceManager() resource_manager = PDFResourceManager()
device = PDFPageAggregator(resource_manager, laparams=laparams) device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device) interpreter = PDFPageInterpreter(resource_manager, device)

View File

@ -11,6 +11,28 @@ import chardet # For str encoding detection
INF = (1 << 31) - 1 INF = (1 << 31) - 1
class open_filename(object):
    """
    Context manager that allows opening a filename and closes it on exit,
    (just like `open`), but does nothing for file-like objects.

    :param filename: Either a path (`str` or `pathlib.PurePath`) that is
        opened with `open`, or any other object, which is handed back
        unchanged as the file handler and is never closed by this class.
    :param args: Extra positional arguments forwarded to `open` (for
        example the mode). Ignored when `filename` is not a path.
    :param kwargs: Extra keyword arguments forwarded to `open`. Ignored
        when `filename` is not a path.
    """

    def __init__(self, filename, *args, **kwargs):
        # Imported locally so the class does not depend on a module-level
        # import being present in the surrounding file.
        import pathlib

        if isinstance(filename, pathlib.PurePath):
            # Path objects used to fall through to the file-like branch and
            # were returned unopened. Convert explicitly, because `open`
            # only accepts path objects natively from Python 3.6 onwards.
            filename = str(filename)

        if isinstance(filename, str):
            self.file_handler = open(filename, *args, **kwargs)
            # We opened it, so we are responsible for closing it.
            self.closing = True
        else:
            self.file_handler = filename
            # The caller owns this object; leave it open on exit.
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Returning False lets any exception from the `with` body propagate.
        return False
def make_compat_bytes(in_str): def make_compat_bytes(in_str):
"Converts to bytes, encoding to unicode." "Converts to bytes, encoding to unicode."
assert isinstance(in_str, str), str(type(in_str)) assert isinstance(in_str, str), str(type(in_str))

View File

@ -4,12 +4,19 @@ from helpers import absolute_sample_path
from pdfminer.high_level import extract_text from pdfminer.high_level import extract_text
def run(sample_path): def run_with_string(sample_path):
absolute_path = absolute_sample_path(sample_path) absolute_path = absolute_sample_path(sample_path)
s = extract_text(absolute_path) s = extract_text(absolute_path)
return s return s
def run_with_file(sample_path):
    """Return the text extracted from a sample PDF given as an open file.

    The sample name is resolved with `absolute_sample_path`, the file is
    opened in binary mode, and the file object (not the path) is passed to
    `extract_text`.
    """
    pdf_path = absolute_sample_path(sample_path)
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)
test_strings = { test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n"
@ -20,19 +27,34 @@ test_strings = {
class TestExtractText(unittest.TestCase): class TestExtractText(unittest.TestCase):
def test_simple1(self): def test_simple1_with_string(self):
test_file = "simple1.pdf" test_file = "simple1.pdf"
s = run(test_file) s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file]) self.assertEqual(s, test_strings[test_file])
def test_simple2(self): def test_simple2_with_string(self):
test_file = "simple2.pdf" test_file = "simple2.pdf"
s = run(test_file) s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file]) self.assertEqual(s, test_strings[test_file])
def test_simple3(self): def test_simple3_with_string(self):
test_file = "simple3.pdf" test_file = "simple3.pdf"
s = run(test_file) s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple1_with_file(self):
test_file = "simple1.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple2_with_file(self):
test_file = "simple2.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple3_with_file(self):
test_file = "simple3.pdf"
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file]) self.assertEqual(s, test_strings[test_file])