Added: simple wrapper to extract text from pdf (#330)

Fixes #327
2019-11-07 03:54:10 -03:00 · 2019-11-07 03:54:10 -03:00 · 40aa2533c9
parent 027bb62943
commit 40aa2533c9
3 changed files with 84 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 ### Added
 - Simple wrapper to easily extract text from a PDF file [#330](https://github.com/pdfminer/pdfminer.six/pull/330)
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
 ### Fixed
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -8,6 +8,12 @@ import logging
 import six
 import sys
 # Conditional import: on Python 2, io.StringIO only accepts unicode text,
 # so a bytes buffer is used under the same name there.
 if sys.version_info > (3, 0):
    from io import StringIO
 else:
    from io import BytesIO as StringIO
 from .pdfdocument import PDFDocument
 from .pdfparser import PDFParser
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -16,6 +22,7 @@ from .pdfpage import PDFPage
 from .converter import XMLConverter, HTMLConverter, TextConverter
 from .cmapdb import CMapDB
 from .image import ImageWriter
 from .layout import LAParams
 def extract_text_to_fp(inf, outfp,
@ -92,3 +99,41 @@ def extract_text_to_fp(inf, outfp,
        interpreter.process_page(page)    
    device.close()
 def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, codec='utf-8', laparams=None):
    """
    Parse a PDF file and return the text it contains.

    Takes loads of optional arguments but the defaults are somewhat sane.

    :param pdf_file: Path to the PDF file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse.
    :param caching: If resources should be cached. Applied both to the
        resource manager and to page retrieval.
    :param codec: Text decoding codec.
    :param laparams: LAParams object from pdfminer.layout. When None, a
        default ``LAParams()`` is created.
    :return: The contents of the in-memory output buffer, i.e. all of the
        text extracted.
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp, StringIO() as output_string:
        # Forward ``caching`` so that caching=False also disables the
        # resource manager's cache, not only the page-level one.
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        finally:
            # Close the device even when a page fails to parse, mirroring
            # the device.close() call in extract_text_to_fp.
            device.close()

        # Read the buffer before the ``with`` block closes it.
        return output_string.getvalue()
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -0,0 +1,38 @@
 import unittest
 from helpers import absolute_sample_path
 from pdfminer.high_level import extract_text
 def run(sample_path):
    """Extract the text of the sample PDF identified by *sample_path*.

    The path is first resolved with ``absolute_sample_path`` and the
    result is handed to ``extract_text`` with its default arguments.
    """
    return extract_text(absolute_sample_path(sample_path))
 # Expected extract_text() output, keyed by sample file name.
 test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nWorld\n\nHello \n\n"
        "H e l l o  \n\nH e l l o  \n\n"
        "W o r l d\n\nW o r l d\n\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
 }
 class TestExtractText(unittest.TestCase):
    def test_simple1(self):
        test_file = "simple1.pdf"
        s = run(test_file)
        self.assertEqual(s, test_strings[test_file])
    def test_simple2(self):
        test_file = "simple2.pdf"
        s = run(test_file)
        self.assertEqual(s, test_strings[test_file])
    def test_simple3(self):
        test_file = "simple3.pdf"
        s = run(test_file)
        self.assertEqual(s, test_strings[test_file])
 # Run the test cases when this module is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()