* Allow file-like inputs to high level functions (#392) * PR Review - move open_filename to utils (pull/399/head)
parent
1cc1b961c5
commit
1a4a06da9f
|
@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#392](https://github.com/pdfminer/pdfminer.six/pull/392))
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
|
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ from .layout import LAParams
|
||||||
from .pdfdevice import TagExtractor
|
from .pdfdevice import TagExtractor
|
||||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
from .pdfpage import PDFPage
|
from .pdfpage import PDFPage
|
||||||
|
from .utils import open_filename
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
|
@ -91,7 +92,8 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
caching=True, codec='utf-8', laparams=None):
|
caching=True, codec='utf-8', laparams=None):
|
||||||
"""Parse and return the text contained in a PDF file.
|
"""Parse and return the text contained in a PDF file.
|
||||||
|
|
||||||
:param pdf_file: Path to the PDF file to be worked on
|
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||||
|
to be worked on.
|
||||||
:param password: For encrypted PDFs, the password to decrypt.
|
:param password: For encrypted PDFs, the password to decrypt.
|
||||||
:param page_numbers: List of zero-indexed page numbers to extract.
|
:param page_numbers: List of zero-indexed page numbers to extract.
|
||||||
:param maxpages: The maximum number of pages to parse
|
:param maxpages: The maximum number of pages to parse
|
||||||
|
@ -104,7 +106,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
if laparams is None:
|
if laparams is None:
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
|
|
||||||
with open(pdf_file, "rb") as fp, StringIO() as output_string:
|
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
|
||||||
rsrcmgr = PDFResourceManager()
|
rsrcmgr = PDFResourceManager()
|
||||||
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
|
@ -127,7 +129,8 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
caching=True, laparams=None):
|
caching=True, laparams=None):
|
||||||
"""Extract and yield LTPage objects
|
"""Extract and yield LTPage objects
|
||||||
|
|
||||||
:param pdf_file: Path to the PDF file to be worked on
|
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||||
|
to be worked on.
|
||||||
:param password: For encrypted PDFs, the password to decrypt.
|
:param password: For encrypted PDFs, the password to decrypt.
|
||||||
:param page_numbers: List of zero-indexed page numbers to extract.
|
:param page_numbers: List of zero-indexed page numbers to extract.
|
||||||
:param maxpages: The maximum number of pages to parse
|
:param maxpages: The maximum number of pages to parse
|
||||||
|
@ -139,7 +142,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
if laparams is None:
|
if laparams is None:
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
|
|
||||||
with open(pdf_file, "rb") as fp:
|
with open_filename(pdf_file, "rb") as fp:
|
||||||
resource_manager = PDFResourceManager()
|
resource_manager = PDFResourceManager()
|
||||||
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
||||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||||
|
|
|
@ -11,6 +11,28 @@ import chardet # For str encoding detection
|
||||||
INF = (1 << 31) - 1
|
INF = (1 << 31) - 1
|
||||||
|
|
||||||
|
|
||||||
|
class open_filename(object):
    """
    Context manager that opens a path and closes it on exit (just like
    `open`), but does nothing for file-like objects.

    :param filename: Either a path (`str` or `pathlib.PurePath`) or an
        already opened file-like object.
    :param args: Positional arguments forwarded to `open` when `filename`
        is a path; unused for file-like objects.
    :param kwargs: Keyword arguments forwarded to `open` when `filename`
        is a path; unused for file-like objects.
    """
    def __init__(self, filename, *args, **kwargs):
        # Imported locally so the class is self-contained.
        import pathlib

        if isinstance(filename, pathlib.PurePath):
            # Path objects are paths, not file-like objects. Convert to str
            # so they take the "open it ourselves" branch below.
            filename = str(filename)

        if isinstance(filename, str):
            self.file_handler = open(filename, *args, **kwargs)
            # We opened the file, so we are responsible for closing it.
            self.closing = True
        else:
            # Anything else is handed back as-is; the caller keeps ownership.
            self.file_handler = filename
            self.closing = False

    def __enter__(self):
        return self.file_handler

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing:
            self.file_handler.close()
        # Never suppress exceptions raised inside the with-block.
        return False
|
||||||
|
|
||||||
|
|
||||||
def make_compat_bytes(in_str):
|
def make_compat_bytes(in_str):
|
||||||
"Converts to bytes, encoding to unicode."
|
"Converts to bytes, encoding to unicode."
|
||||||
assert isinstance(in_str, str), str(type(in_str))
|
assert isinstance(in_str, str), str(type(in_str))
|
||||||
|
|
|
@ -4,12 +4,19 @@ from helpers import absolute_sample_path
|
||||||
from pdfminer.high_level import extract_text
|
from pdfminer.high_level import extract_text
|
||||||
|
|
||||||
|
|
||||||
def run(sample_path):
|
def run_with_string(sample_path):
|
||||||
absolute_path = absolute_sample_path(sample_path)
|
absolute_path = absolute_sample_path(sample_path)
|
||||||
s = extract_text(absolute_path)
|
s = extract_text(absolute_path)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def run_with_file(sample_path):
    """Extract the text of a sample PDF by passing an open binary file
    object (rather than a path) to `extract_text`.

    :param sample_path: Sample file name, resolved through
        `absolute_sample_path`.
    :return: Whatever `extract_text` returns for the opened file.
    """
    with open(absolute_sample_path(sample_path), "rb") as pdf_stream:
        extracted = extract_text(pdf_stream)
    return extracted
|
||||||
|
|
||||||
|
|
||||||
test_strings = {
|
test_strings = {
|
||||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
|
@ -20,19 +27,34 @@ test_strings = {
|
||||||
|
|
||||||
|
|
||||||
class TestExtractText(unittest.TestCase):
|
class TestExtractText(unittest.TestCase):
|
||||||
def test_simple1(self):
|
def test_simple1_with_string(self):
|
||||||
test_file = "simple1.pdf"
|
test_file = "simple1.pdf"
|
||||||
s = run(test_file)
|
s = run_with_string(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
def test_simple2(self):
|
def test_simple2_with_string(self):
|
||||||
test_file = "simple2.pdf"
|
test_file = "simple2.pdf"
|
||||||
s = run(test_file)
|
s = run_with_string(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
def test_simple3(self):
|
def test_simple3_with_string(self):
|
||||||
test_file = "simple3.pdf"
|
test_file = "simple3.pdf"
|
||||||
s = run(test_file)
|
s = run_with_string(test_file)
|
||||||
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple1_with_file(self):
|
||||||
|
test_file = "simple1.pdf"
|
||||||
|
s = run_with_file(test_file)
|
||||||
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple2_with_file(self):
|
||||||
|
test_file = "simple2.pdf"
|
||||||
|
s = run_with_file(test_file)
|
||||||
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple3_with_file(self):
|
||||||
|
test_file = "simple3.pdf"
|
||||||
|
s = run_with_file(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue