2019-11-07 06:54:10 +00:00
|
|
|
import unittest
|
|
|
|
|
|
|
|
from helpers import absolute_sample_path
|
|
|
|
from pdfminer.high_level import extract_text
|
2020-04-01 11:37:04 +00:00
|
|
|
from pdfminer.layout import LAParams
|
2019-11-07 06:54:10 +00:00
|
|
|
|
|
|
|
|
2020-04-01 11:37:04 +00:00
|
|
|
def run_with_string(sample_path, laparams=None):
    """Run ``extract_text`` on a sample, handing it the path itself.

    ``sample_path`` is first passed through ``absolute_sample_path``; the
    value that helper returns is what ``extract_text`` receives.

    ``laparams`` is an optional mapping of keyword arguments used to build
    the ``LAParams`` instance. When it is ``None`` the instance is created
    with no arguments.

    Returns whatever ``extract_text`` returns.
    """
    # LAParams() and LAParams(**{}) are the same call, so the None case
    # needs no intermediate empty dict.
    layout = LAParams() if laparams is None else LAParams(**laparams)
    resolved = absolute_sample_path(sample_path)
    return extract_text(resolved, laparams=layout)
|
|
|
|
|
|
|
|
|
2020-03-26 21:52:00 +00:00
|
|
|
def run_with_file(sample_path):
    """Run ``extract_text`` on a sample, handing it an open binary file.

    ``sample_path`` is passed through ``absolute_sample_path`` and the
    result is opened in ``"rb"`` mode; the file object (not the path) is
    what ``extract_text`` receives.

    Returns whatever ``extract_text`` returns.
    """
    resolved = absolute_sample_path(sample_path)
    # Returning from inside the ``with`` still closes the file on exit.
    with open(resolved, "rb") as pdf_stream:
        return extract_text(pdf_stream)
|
|
|
|
|
|
|
|
|
2019-11-07 06:54:10 +00:00
|
|
|
# Expected strings that the test cases compare extracted text against.
# Keys are the sample file name, optionally followed by a suffix naming a
# layout-parameter variant. Every value ends with a form feed ("\f").
test_strings = {
    # Adjacent literals are joined by implicit string concatenation.
    "simple1.pdf": (
        "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
        "H e l l o \n\nW o r l d\n\n"
        "H e l l o \n\nW o r l d\n\n\f"
    ),
    # Same sample, compared when extraction runs with boxes_flow=None.
    "simple1.pdf_no_boxes_flow": (
        "Hello \nWorld\nHello \nWorld\n"
        "H e l l o \nW o r l d\n"
        "H e l l o \nW o r l d\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
}
|
|
|
|
|
|
|
|
|
|
|
|
class TestExtractText(unittest.TestCase):
    """Compare ``extract_text`` output for the sample PDFs with ``test_strings``.

    The ``*_with_string`` cases go through ``run_with_string`` and the
    ``*_with_file`` cases go through ``run_with_file``.
    """

    # --- extraction driven by run_with_string -------------------------------

    def test_simple1_with_string(self):
        """simple1.pdf via run_with_string matches its expected string."""
        sample = "simple1.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple1_no_boxes_flow(self):
        """simple1.pdf with boxes_flow=None matches the dedicated entry."""
        sample = "simple1.pdf"
        extracted = run_with_string(sample, laparams={"boxes_flow": None})
        self.assertEqual(extracted, test_strings["simple1.pdf_no_boxes_flow"])

    def test_simple2_with_string(self):
        """simple2.pdf via run_with_string matches its expected string."""
        sample = "simple2.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple3_with_string(self):
        """simple3.pdf via run_with_string matches its expected string."""
        sample = "simple3.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    # --- extraction driven by run_with_file ---------------------------------

    def test_simple1_with_file(self):
        """simple1.pdf via run_with_file matches its expected string."""
        sample = "simple1.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple2_with_file(self):
        """simple2.pdf via run_with_file matches its expected string."""
        sample = "simple2.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple3_with_file(self):
        """simple3.pdf via run_with_file matches its expected string."""
        sample = "simple3.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])
|
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly as a script; unittest.main() then
# discovers and runs the test cases defined in this file.
if __name__ == "__main__":
    unittest.main()
|