Added: simple wrapper to extract text from pdf (#330)

Fixes #327
2019-11-07 03:54:10 -03:00 · 2019-11-07 03:54:10 -03:00 · 40aa2533c9
parent 027bb62943
commit 40aa2533c9
3 changed files with 84 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))

 ### Added
+- Simple wrapper to easily extract text from a PDF file ([#330](https://github.com/pdfminer/pdfminer.six/pull/330))
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))

 ### Fixed
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -8,6 +8,12 @@ import logging
 import six
 import sys

+# Conditional import: on Python 2, io.StringIO only accepts unicode, so BytesIO is aliased in its place
+if sys.version_info > (3, 0):
+    from io import StringIO
+else:
+    from io import BytesIO as StringIO
+
 from .pdfdocument import PDFDocument
 from .pdfparser import PDFParser
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -16,6 +22,7 @@ from .pdfpage import PDFPage
 from .converter import XMLConverter, HTMLConverter, TextConverter
 from .cmapdb import CMapDB
 from .image import ImageWriter
+from .layout import LAParams


 def extract_text_to_fp(inf, outfp,
@ -92,3 +99,41 @@ def extract_text_to_fp(inf, outfp,
        interpreter.process_page(page)    

    device.close()
+
+
+def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
+                 caching=True, codec='utf-8', laparams=None):
+    """
+    Parse a PDF file and return the text it contains.
+
+    Every argument except pdf_file is optional.
+
+    :param pdf_file: Path to the PDF file to be worked on
+    :param password: For encrypted PDFs, the password to decrypt.
+    :param page_numbers: List of zero-indexed page numbers to extract.
+    :param maxpages: The maximum number of pages to parse
+    :param caching: If resources should be cached
+    :param codec: Codec handed to the TextConverter that writes the output.
+    :param laparams: LAParams object from pdfminer.layout. If None, a
+        default LAParams() is created.
+    :return: Everything the TextConverter wrote, as one string. On Python 2
+        this module aliases StringIO to io.BytesIO, so the result is a byte
+        string there.
+    """
+    if laparams is None:
+        laparams = LAParams()
+
+    with open(pdf_file, "rb") as fp, StringIO() as output_string:
+        # Forward `caching` to the resource manager too; otherwise
+        # caching=False only reaches PDFPage.get_pages and the resource
+        # manager keeps its default caching behaviour.
+        rsrcmgr = PDFResourceManager(caching=caching)
+        device = TextConverter(rsrcmgr, output_string, codec=codec,
+                               laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        for page in PDFPage.get_pages(
+            fp,
+            page_numbers,
+            maxpages=maxpages,
+            password=password,
+            caching=caching,
+            check_extractable=True,
+        ):
+            interpreter.process_page(page)
+
+        # Close the device once all pages are processed, as
+        # extract_text_to_fp does, before the buffer is read.
+        device.close()
+
+        # Read the buffer inside the `with`: it is closed on exit.
+        return output_string.getvalue()
+    
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -0,0 +1,38 @@
+import unittest
+
+from helpers import absolute_sample_path
+from pdfminer.high_level import extract_text
+
+
+def run(sample_path):
+    """Resolve sample_path with absolute_sample_path and extract its text."""
+    return extract_text(absolute_sample_path(sample_path))
+
+
+# Expected extract_text() output for each sample file, keyed by the sample
+# file name that the tests pass to run().
+# NOTE(review): the trailing "\f" is presumably the page separator written
+# by the converter -- confirm before changing these strings.
+test_strings = {
+    "simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o  \n\nH e l l o  \n\nW o r l d\n\nW o r l d\n\n\f",
+    "simple2.pdf": "\f",
+    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
+}
+
+
+class TestExtractText(unittest.TestCase):
+    """Check extract_text output against the stored expected strings."""
+
+    def _assert_extracted(self, sample_name):
+        # Extract first, then look up the expectation, so a failure in
+        # extraction surfaces before any lookup of the expected string.
+        actual = run(sample_name)
+        self.assertEqual(actual, test_strings[sample_name])
+
+    def test_simple1(self):
+        self._assert_extracted("simple1.pdf")
+
+    def test_simple2(self):
+        self._assert_extracted("simple2.pdf")
+
+    def test_simple3(self):
+        self._assert_extracted("simple3.pdf")
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()