parent
027bb62943
commit
40aa2533c9
|
@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
[#307](https://github.com/pdfminer/pdfminer.six/pull/307))
|
[#307](https://github.com/pdfminer/pdfminer.six/pull/307))
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
- Simple wrapper to easily extract text from a PDF file ([#330](https://github.com/pdfminer/pdfminer.six/pull/330))
|
||||||
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
|
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
@ -8,6 +8,12 @@ import logging
|
||||||
import six
|
import six
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
# Conditional import: on Python 2 a bytes buffer (BytesIO) stands in for StringIO
|
||||||
|
if sys.version_info > (3, 0):
|
||||||
|
from io import StringIO
|
||||||
|
else:
|
||||||
|
from io import BytesIO as StringIO
|
||||||
|
|
||||||
from .pdfdocument import PDFDocument
|
from .pdfdocument import PDFDocument
|
||||||
from .pdfparser import PDFParser
|
from .pdfparser import PDFParser
|
||||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
@ -16,6 +22,7 @@ from .pdfpage import PDFPage
|
||||||
from .converter import XMLConverter, HTMLConverter, TextConverter
|
from .converter import XMLConverter, HTMLConverter, TextConverter
|
||||||
from .cmapdb import CMapDB
|
from .cmapdb import CMapDB
|
||||||
from .image import ImageWriter
|
from .image import ImageWriter
|
||||||
|
from .layout import LAParams
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_fp(inf, outfp,
|
def extract_text_to_fp(inf, outfp,
|
||||||
|
@ -92,3 +99,41 @@ def extract_text_to_fp(inf, outfp,
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
|
||||||
device.close()
|
device.close()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Parse a PDF file and return the text it contains.

    Takes loads of optional arguments but the defaults are somewhat sane.

    :param pdf_file: Path to the PDF file to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse.
    :param caching: If resources should be cached.
    :param codec: Text decoding codec.
    :param laparams: LAParams object from pdfminer.layout. A default
        ``LAParams()`` is created when this is None.
    :return: The contents of the in-memory output buffer once every
        selected page has been processed.
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp, StringIO() as output_string:
        # Honour the caller's caching choice in the resource manager too,
        # not only in the page iterator below.
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
                fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
        ):
            interpreter.process_page(page)

        # Read the buffer before the ``with`` block closes it.
        return output_string.getvalue()
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from helpers import absolute_sample_path
|
||||||
|
from pdfminer.high_level import extract_text
|
||||||
|
|
||||||
|
|
||||||
|
def run(sample_path):
    """Return the text extracted from the sample PDF *sample_path*.

    The sample name is first resolved with ``absolute_sample_path`` and the
    resulting path is handed to ``extract_text``.
    """
    return extract_text(absolute_sample_path(sample_path))
|
||||||
|
|
||||||
|
|
||||||
|
# Expected extract_text() output, keyed by sample file name.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nWorld\n\nHello \n\n"
        "H e l l o \n\nH e l l o \n\n"
        "W o r l d\n\nW o r l d\n\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractText(unittest.TestCase):
    """Compare extract_text() output for the sample PDFs with test_strings."""

    def _assert_extracted(self, test_file):
        # Shared body of every test: extract the sample, compare to the
        # expected string recorded for that file name.
        extracted = run(test_file)
        self.assertEqual(extracted, test_strings[test_file])

    def test_simple1(self):
        self._assert_extracted("simple1.pdf")

    def test_simple2(self):
        self._assert_extracted("simple2.pdf")

    def test_simple3(self):
        self._assert_extracted("simple3.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the test cases when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
Loading…
Reference in New Issue