pdfminer.six/tests/test_highlevel_extracttext.py

169 lines
6.1 KiB
Python
Raw Normal View History

import unittest
from helpers import absolute_sample_path
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LAParams, LTTextContainer
def run_with_string(sample_path, laparams=None):
    """Extract the text of a sample PDF, handing its path to ``extract_text``.

    :param sample_path: sample file name; it is resolved through
        ``absolute_sample_path`` before extraction.
    :param laparams: optional mapping of keyword arguments used to build the
        ``LAParams`` instance. ``None`` is treated as an empty mapping.
    :return: the value returned by ``extract_text``.
    """
    layout_kwargs = {} if laparams is None else laparams
    pdf_path = absolute_sample_path(sample_path)
    return extract_text(pdf_path, laparams=LAParams(**layout_kwargs))
def run_with_file(sample_path):
    """Extract the text of a sample PDF, handing an open file to ``extract_text``.

    :param sample_path: sample file name; it is resolved through
        ``absolute_sample_path`` and opened in binary mode.
    :return: the value returned by ``extract_text``.
    """
    pdf_path = absolute_sample_path(sample_path)
    # The context manager closes the stream once extraction has finished.
    with open(pdf_path, "rb") as pdf_stream:
        return extract_text(pdf_stream)
# Expected extraction results, keyed by sample file name.  The extra key
# "simple1.pdf_no_boxes_flow" holds the expectation for simple1.pdf when it is
# extracted with ``boxes_flow=None``.
# NOTE(review): this listing lost its leading whitespace at some point; if
# runs of spaces inside the literals were collapsed as well, restore them
# from the real extraction output -- TODO confirm.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
        "H e l l o \n\nW o r l d\n\n"
        "H e l l o \n\nW o r l d\n\n\f"
    ),
    "simple1.pdf_no_boxes_flow": (
        "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
        "H e l l o \n\nW o r l d\n\n"
        "H e l l o \n\nW o r l d\n\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": (
        "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
        "World\n\nWorld\n\n\f"
    ),
    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
    "simple5.pdf": (
        "Heading\n\n"
        "Link to heading that is working with vim-pandoc.\n\n"
        "Link to heading “that is” not working with vim-pandoc.\n\n"
        "Subheading\n\nSome “more text”\n\n1\n\n\f"
    ),
    # Only the beginning of the extracted text is listed for this file.
    "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
    "contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
    "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
    # A single line of the extracted text, not the whole output.
    "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
}
class TestExtractText(unittest.TestCase):
    """Compare ``extract_text`` output for the sample PDFs with ``test_strings``.

    Each sample is extracted through ``run_with_string`` (path argument)
    and/or ``run_with_file`` (open binary file argument).
    """

    def _check_with_string(self, test_file):
        """Extract *test_file* by path and require the exact expected text."""
        self.assertEqual(run_with_string(test_file), test_strings[test_file])

    def _check_with_file(self, test_file):
        """Extract *test_file* from an open file; require the exact text."""
        self.assertEqual(run_with_file(test_file), test_strings[test_file])

    def test_simple1_with_string(self):
        self._check_with_string("simple1.pdf")

    def test_simple1_no_boxes_flow(self):
        # Same sample as above, but with boxes_flow=None; the expectation is
        # stored under its own key in test_strings.
        s = run_with_string("simple1.pdf", laparams={"boxes_flow": None})
        self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])

    def test_simple2_with_string(self):
        self._check_with_string("simple2.pdf")

    def test_simple3_with_string(self):
        self._check_with_string("simple3.pdf")

    def test_simple4_with_string(self):
        self._check_with_string("simple4.pdf")

    def test_simple5_with_string(self):
        self._check_with_string("simple5.pdf")

    def test_simple1_with_file(self):
        self._check_with_file("simple1.pdf")

    def test_simple2_with_file(self):
        self._check_with_file("simple2.pdf")

    def test_simple3_with_file(self):
        self._check_with_file("simple3.pdf")

    def test_simple4_with_file(self):
        self._check_with_file("simple4.pdf")

    def test_simple5_with_file(self):
        self._check_with_file("simple5.pdf")

    def test_zlib_corrupted(self):
        test_file = "zen_of_python_corrupted.pdf"
        s = run_with_file(test_file)
        expected = test_strings[test_file]
        # Only the leading len(expected) characters of the output are
        # compared; anything extracted after them is ignored.
        self.assertEqual(s[:len(expected)], expected)

    def test_issue_566_cmap_bytes(self):
        test_file = "contrib/issue_566_test_1.pdf"
        s = run_with_file(test_file)
        # Surrounding whitespace is stripped before comparing.
        self.assertEqual(s.strip(), test_strings[test_file])

    def test_issue_566_cid_range(self):
        test_file = "contrib/issue_566_test_2.pdf"
        s = run_with_file(test_file)
        # Surrounding whitespace is stripped before comparing.
        self.assertEqual(s.strip(), test_strings[test_file])

    def test_issue_625_identity_cmap(self):
        test_file = "contrib/issue-625-identity-cmap.pdf"
        lines = run_with_file(test_file).splitlines()
        # Only the line at index 6 of the extracted text is checked.
        self.assertEqual(lines[6], test_strings[test_file])
class TestExtractPages(unittest.TestCase):
    """Check the text containers ``extract_pages`` yields for simple4.pdf."""

    def _get_test_file_path(self):
        """Return the absolute path of the sample PDF used by these tests."""
        return absolute_sample_path("simple4.pdf")

    def _single_page_text_containers(self, laparams):
        """Extract the sample with *laparams* and return its text containers.

        Asserts that exactly one page is produced, then returns the
        ``LTTextContainer`` elements of that page in iteration order.
        """
        page_list = list(
            extract_pages(self._get_test_file_path(), laparams=laparams)
        )
        self.assertEqual(len(page_list), 1)
        return [
            item for item in page_list[0]
            if isinstance(item, LTTextContainer)
        ]

    def test_line_margin(self):
        # The lines have margin 0.2 relative to the height.
        # Extract with line_margin 0.19 should break into 3 separate
        # textboxes.
        separate_boxes = self._single_page_text_containers(
            LAParams(line_margin=0.19)
        )
        self.assertEqual(len(separate_boxes), 3)
        self.assertEqual(
            [box.get_text() for box in separate_boxes],
            ["Text1\n", "Text2\n", "Text3\n"],
        )

        # Extract with line_margin 0.21 should merge into one textbox.
        merged_boxes = self._single_page_text_containers(
            LAParams(line_margin=0.21)
        )
        self.assertEqual(len(merged_boxes), 1)
        self.assertEqual(merged_boxes[0].get_text(), "Text1\nText2\nText3\n")

    def test_no_boxes_flow(self):
        # With boxes_flow=None a single text container holding all three
        # lines is expected.
        boxes = self._single_page_text_containers(LAParams(boxes_flow=None))
        self.assertEqual(len(boxes), 1)
        self.assertEqual(boxes[0].get_text(), "Text1\nText2\nText3\n")
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()