Added: simple wrapper to extract text from pdf (#330)

Fixes #327
pull/333/head
Igor Moura 2019-11-07 03:54:10 -03:00 committed by Pieter Marsman
parent 027bb62943
commit 40aa2533c9
3 changed files with 84 additions and 0 deletions

View File

@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
[#307](https://github.com/pdfminer/pdfminer.six/pull/307))
### Added
- Simple wrapper to easily extract text from a PDF file ([#330](https://github.com/pdfminer/pdfminer.six/pull/330))
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
### Fixed

View File

@ -8,6 +8,12 @@ import logging
import six
import sys
# Conditional import: use a text buffer on Python 3 and a bytes buffer on Python 2
if sys.version_info > (3, 0):
from io import StringIO
else:
from io import BytesIO as StringIO
from .pdfdocument import PDFDocument
from .pdfparser import PDFParser
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -16,6 +22,7 @@ from .pdfpage import PDFPage
from .converter import XMLConverter, HTMLConverter, TextConverter
from .cmapdb import CMapDB
from .image import ImageWriter
from .layout import LAParams
def extract_text_to_fp(inf, outfp,
@ -92,3 +99,41 @@ def extract_text_to_fp(inf, outfp,
interpreter.process_page(page)
device.close()
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """
    Parse a PDF file and return the text it contains.

    All arguments except ``pdf_file`` are optional.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: LAParams object from pdfminer.layout. When None, a
        default ``LAParams()`` is created.
    :return: The contents of the in-memory output buffer after every
        selected page has been processed. The buffer is ``io.StringIO`` on
        Python 3 and ``io.BytesIO`` on Python 2.
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp, StringIO() as output_string:
        # Honour the caller's ``caching`` choice for the resource manager as
        # well as for page iteration below.
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): check_extractable=True is forwarded as-is;
            # presumably it rejects documents that forbid text extraction --
            # confirm against PDFPage.get_pages.
            for page in PDFPage.get_pages(
                    fp,
                    page_numbers,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True,
            ):
                interpreter.process_page(page)
        finally:
            # Close the device just like extract_text_to_fp does, even when
            # processing a page raises.
            device.close()

        # Read the buffer before the ``with`` block closes it.
        return output_string.getvalue()

View File

@ -0,0 +1,38 @@
import unittest
from helpers import absolute_sample_path
from pdfminer.high_level import extract_text
def run(sample_path):
    """Return the text that extract_text yields for the given sample file.

    ``sample_path`` is resolved through ``absolute_sample_path`` before it
    is handed to ``extract_text``.
    """
    return extract_text(absolute_sample_path(sample_path))
# Expected text for each sample PDF, keyed by the sample's file name.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nWorld\n\n"
        "Hello \n\n"
        "H e l l o \n\nH e l l o \n\n"
        "W o r l d\n\nW o r l d\n\n"
        "\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
}
class TestExtractText(unittest.TestCase):
    """Compare the text extracted from each sample PDF with its expectation."""

    def _assert_sample_text(self, sample_name):
        # Extract the sample's text and compare it with the expected string
        # stored under the same file name in ``test_strings``.
        extracted = run(sample_name)
        self.assertEqual(extracted, test_strings[sample_name])

    def test_simple1(self):
        self._assert_sample_text("simple1.pdf")

    def test_simple2(self):
        self._assert_sample_text("simple2.pdf")

    def test_simple3(self):
        self._assert_sample_text("simple3.pdf")
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()