From 40aa2533c98fb9c6b700891356e638bd6821ad13 Mon Sep 17 00:00:00 2001
From: Igor Moura
Date: Thu, 7 Nov 2019 03:54:10 -0300
Subject: [PATCH] Added: simple wrapper to extract text from pdf (#330)

Fixes #327
---
 CHANGELOG.md                        |  1 +
 pdfminer/high_level.py              | 45 +++++++++++++++++++++++++++++
 tests/test_highlevel_extracttext.py | 38 ++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 tests/test_highlevel_extracttext.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8cc655..db72a80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 
 ### Added
+- Simple wrapper to easily extract text from a PDF file ([#330](https://github.com/pdfminer/pdfminer.six/pull/330))
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
 
 ### Fixed
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
index 8728319..2ce4276 100644
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@@ -8,6 +8,12 @@ import logging
 import six
 import sys
 
+# Python 3 buffers text in StringIO; Python 2 needs BytesIO instead
+if sys.version_info > (3, 0):
+    from io import StringIO
+else:
+    from io import BytesIO as StringIO
+
 from .pdfdocument import PDFDocument
 from .pdfparser import PDFParser
 from .pdfinterp import PDFResourceManager, PDFPageInterpreter
@@ -16,6 +22,7 @@ from .pdfpage import PDFPage
 from .converter import XMLConverter, HTMLConverter, TextConverter
 from .cmapdb import CMapDB
 from .image import ImageWriter
+from .layout import LAParams
 
 
 def extract_text_to_fp(inf, outfp,
@@ -92,3 +99,41 @@ def extract_text_to_fp(inf, outfp,
             interpreter.process_page(page)
 
     device.close()
+
+
+def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
+                 caching=True, codec='utf-8', laparams=None):
+    """Parse and return the text contained in a PDF file.
+
+    Only pdf_file is required; every other argument has a usable default.
+
+    :param pdf_file: Path to the PDF file; it is opened in binary mode.
+    :param password: For encrypted PDFs, the password to decrypt.
+    :param page_numbers: List of zero-indexed page numbers to extract.
+    :param maxpages: The maximum number of pages to parse.
+    :param caching: If resources should be cached; also passed to get_pages.
+    :param codec: Codec passed to TextConverter for the extracted text.
+    :param laparams: LAParams from pdfminer.layout; None means LAParams().
+    :return: All of the extracted text (bytes on Python 2, str on Python 3).
+    """
+    if laparams is None:
+        laparams = LAParams()
+
+    with open(pdf_file, "rb") as fp, StringIO() as output_string:
+        rsrcmgr = PDFResourceManager(caching=caching)
+        device = TextConverter(rsrcmgr, output_string, codec=codec,
+                               laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        for page in PDFPage.get_pages(
+                fp,
+                page_numbers,
+                maxpages=maxpages,
+                password=password,
+                caching=caching,
+                check_extractable=True,
+        ):
+            interpreter.process_page(page)
+
+        return output_string.getvalue()
+
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
new file mode 100644
index 0000000..7062cd8
--- /dev/null
+++ b/tests/test_highlevel_extracttext.py
@@ -0,0 +1,38 @@
+import unittest
+
+from helpers import absolute_sample_path
+from pdfminer.high_level import extract_text
+
+
+def run(sample_path):
+    absolute_path = absolute_sample_path(sample_path)
+    s = extract_text(absolute_path)
+    return s
+
+
+test_strings = {
+    "simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\f",
+    "simple2.pdf": "\f",
+    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
+}
+
+
+class TestExtractText(unittest.TestCase):
+    def test_simple1(self):
+        test_file = "simple1.pdf"
+        s = run(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple2(self):
+        test_file = "simple2.pdf"
+        s = run(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+    def test_simple3(self):
+        test_file = "simple3.pdf"
+        s = run(test_file)
+        self.assertEqual(s, test_strings[test_file])
+
+
+if __name__ == "__main__":
+    unittest.main()