* Allow file-like inputs to high level functions (#392) * PR Review - move open_filename to utils pull/399/head
parent
1cc1b961c5
commit
1a4a06da9f
|
@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||
|
||||
### Added
|
||||
- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
|
||||
|
||||
### Changed
|
||||
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ from .layout import LAParams
|
|||
from .pdfdevice import TagExtractor
|
||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from .pdfpage import PDFPage
|
||||
from .utils import open_filename
|
||||
|
||||
|
||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||
|
@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
caching=True, codec='utf-8', laparams=None):
|
||||
"""Parse and return the text contained in a PDF file.
|
||||
|
||||
:param pdf_file: Path to the PDF file to be worked on
|
||||
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||
to be worked on.
|
||||
:param password: For encrypted PDFs, the password to decrypt.
|
||||
:param page_numbers: List of zero-indexed page numbers to extract.
|
||||
:param maxpages: The maximum number of pages to parse
|
||||
|
@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
if laparams is None:
|
||||
laparams = LAParams()
|
||||
|
||||
with open(pdf_file, "rb") as fp, StringIO() as output_string:
|
||||
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
||||
laparams=laparams)
|
||||
|
@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
caching=True, laparams=None):
|
||||
"""Extract and yield LTPage objects
|
||||
|
||||
:param pdf_file: Path to the PDF file to be worked on
|
||||
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||
to be worked on.
|
||||
:param password: For encrypted PDFs, the password to decrypt.
|
||||
:param page_numbers: List of zero-indexed page numbers to extract.
|
||||
:param maxpages: The maximum number of pages to parse
|
||||
|
@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
if laparams is None:
|
||||
laparams = LAParams()
|
||||
|
||||
with open(pdf_file, "rb") as fp:
|
||||
with open_filename(pdf_file, "rb") as fp:
|
||||
resource_manager = PDFResourceManager()
|
||||
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||
|
|
|
@ -11,6 +11,28 @@ import chardet # For str encoding detection
|
|||
INF = (1 << 31) - 1
|
||||
|
||||
|
||||
class open_filename(object):
    """
    Context manager that opens a path and closes it on exit (just like
    `open`), but passes an already-open file-like object through untouched.

    :param filename: Either a path (`str` or a `pathlib` path object) to
        open, or a file-like object to hand back as-is.
    :param args: Extra positional arguments forwarded to `open` (e.g. the
        mode). Ignored when a file-like object is given.
    :param kwargs: Extra keyword arguments forwarded to `open`. Ignored when
        a file-like object is given.
    """
    def __init__(self, filename, *args, **kwargs):
        # Imported here so the class does not rely on the module-level
        # import block.
        import pathlib

        # Path objects are not `str`; without this conversion they would be
        # mistaken for file-like objects and returned unopened.
        if isinstance(filename, pathlib.PurePath):
            filename = str(filename)

        if isinstance(filename, str):
            self.file_handler = open(filename, *args, **kwargs)
            # We opened it, so we are responsible for closing it.
            self.closing = True
        else:
            self.file_handler = filename
            # The caller owns this object; leave it open on exit.
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Never suppress an exception raised inside the `with` body.
        return False
|
||||
|
||||
|
||||
def make_compat_bytes(in_str):
|
||||
"Converts to bytes, encoding to unicode."
|
||||
assert isinstance(in_str, str), str(type(in_str))
|
||||
|
|
|
@ -4,12 +4,19 @@ from helpers import absolute_sample_path
|
|||
from pdfminer.high_level import extract_text
|
||||
|
||||
|
||||
def run(sample_path):
|
||||
def run_with_string(sample_path):
    """Return the text `extract_text` produces when given the sample's path."""
    return extract_text(absolute_sample_path(sample_path))
|
||||
|
||||
|
||||
def run_with_file(sample_path):
    """Return the text `extract_text` produces when given an open binary file."""
    pdf_path = absolute_sample_path(sample_path)
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)
|
||||
|
||||
|
||||
test_strings = {
|
||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n"
|
||||
|
@ -20,19 +27,34 @@ test_strings = {
|
|||
|
||||
|
||||
class TestExtractText(unittest.TestCase):
    """Compare `extract_text` output with the expected strings, once with a
    file path as input and once with an open file object."""

    def test_simple1_with_string(self):
        pdf_name = "simple1.pdf"
        self.assertEqual(run_with_string(pdf_name), test_strings[pdf_name])

    def test_simple2_with_string(self):
        pdf_name = "simple2.pdf"
        self.assertEqual(run_with_string(pdf_name), test_strings[pdf_name])

    def test_simple3_with_string(self):
        pdf_name = "simple3.pdf"
        self.assertEqual(run_with_string(pdf_name), test_strings[pdf_name])

    def test_simple1_with_file(self):
        pdf_name = "simple1.pdf"
        self.assertEqual(run_with_file(pdf_name), test_strings[pdf_name])

    def test_simple2_with_file(self):
        pdf_name = "simple2.pdf"
        self.assertEqual(run_with_file(pdf_name), test_strings[pdf_name])

    def test_simple3_with_file(self):
        pdf_name = "simple3.pdf"
        self.assertEqual(run_with_file(pdf_name), test_strings[pdf_name])
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue