pdfminer.six/tests/test_highlevel_extracttext.py

import unittest

from helpers import absolute_sample_path
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LAParams, LTTextContainer


def run_with_string(sample_path, laparams=None):
    """Extract the text of a sample PDF, handing its path to extract_text.

    :param sample_path: sample file name, resolved through
        ``absolute_sample_path``.
    :param laparams: optional dict of keyword arguments used to build the
        ``LAParams`` object; ``None`` is treated as an empty dict.
    :return: whatever ``extract_text`` returns for the file.
    """
    layout_kwargs = {} if laparams is None else laparams
    pdf_path = absolute_sample_path(sample_path)
    return extract_text(pdf_path, laparams=LAParams(**layout_kwargs))


def run_with_file(sample_path):
    """Extract the text of a sample PDF, handing extract_text an open file.

    :param sample_path: sample file name, resolved through
        ``absolute_sample_path``.
    :return: whatever ``extract_text`` returns for the binary file object.
    """
    with open(absolute_sample_path(sample_path), "rb") as pdf_stream:
        return extract_text(pdf_stream)


# Expected extraction results, keyed by sample file name. The key
# "simple1.pdf_no_boxes_flow" is not a file: it holds the expectation for
# simple1.pdf when extracted with boxes_flow=None.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\n"
        "Hello \n\nWorld\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "\f"
    ),
    "simple1.pdf_no_boxes_flow": (
        "Hello \n\nWorld\n\n"
        "Hello \n\nWorld\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": (
        "Hello\n\nHello\n"
        "あ\nい\nう\nえ\nお\n"
        "あ\nい\nう\nえ\nお\n"
        "World\n\nWorld\n\n"
        "\f"
    ),
    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
}


class TestExtractText(unittest.TestCase):
    """Compare extract_text output for sample PDFs against test_strings.

    simple1-simple4 are extracted both from a path (``run_with_string``)
    and from an open binary file (``run_with_file``); the issue 566 samples
    are extracted from an open file only.
    """

    def test_simple1_with_string(self):
        """simple1.pdf extracted by path matches its expected text."""
        name = "simple1.pdf"
        self.assertEqual(run_with_string(name), test_strings[name])

    def test_simple1_no_boxes_flow(self):
        """simple1.pdf with boxes_flow=None matches its own expectation."""
        extracted = run_with_string(
            "simple1.pdf", laparams={"boxes_flow": None})
        self.assertEqual(extracted, test_strings["simple1.pdf_no_boxes_flow"])

    def test_simple2_with_string(self):
        """simple2.pdf extracted by path matches its expected text."""
        name = "simple2.pdf"
        self.assertEqual(run_with_string(name), test_strings[name])

    def test_simple3_with_string(self):
        """simple3.pdf extracted by path matches its expected text."""
        name = "simple3.pdf"
        self.assertEqual(run_with_string(name), test_strings[name])

    def test_simple4_with_string(self):
        """simple4.pdf extracted by path matches its expected text."""
        name = "simple4.pdf"
        self.assertEqual(run_with_string(name), test_strings[name])

    def test_simple1_with_file(self):
        """simple1.pdf extracted from a file object matches its text."""
        name = "simple1.pdf"
        self.assertEqual(run_with_file(name), test_strings[name])

    def test_simple2_with_file(self):
        """simple2.pdf extracted from a file object matches its text."""
        name = "simple2.pdf"
        self.assertEqual(run_with_file(name), test_strings[name])

    def test_simple3_with_file(self):
        """simple3.pdf extracted from a file object matches its text."""
        name = "simple3.pdf"
        self.assertEqual(run_with_file(name), test_strings[name])

    def test_simple4_with_file(self):
        """simple4.pdf extracted from a file object matches its text."""
        name = "simple4.pdf"
        self.assertEqual(run_with_file(name), test_strings[name])

    def test_issue_566_cmap_bytes(self):
        """First issue 566 sample; compared after stripping whitespace."""
        name = "contrib/issue_566_test_1.pdf"
        self.assertEqual(run_with_file(name).strip(), test_strings[name])

    def test_issue_566_cid_range(self):
        """Second issue 566 sample; compared after stripping whitespace."""
        name = "contrib/issue_566_test_2.pdf"
        self.assertEqual(run_with_file(name).strip(), test_strings[name])


class TestExtractPages(unittest.TestCase):
    """Check how extract_pages groups the lines of simple4.pdf."""

    def _get_test_file_path(self):
        """Return the absolute path of the sample used by these tests."""
        return absolute_sample_path("simple4.pdf")

    def _text_containers(self, **layout_options):
        """Extract the sample and return the text containers of its page.

        ``layout_options`` are forwarded to ``LAParams``. The extraction is
        asserted to produce exactly one page before that page's
        ``LTTextContainer`` children are collected.
        """
        pages = list(extract_pages(
            self._get_test_file_path(),
            laparams=LAParams(**layout_options)))
        self.assertEqual(len(pages), 1)
        return [item for item in pages[0]
                if isinstance(item, LTTextContainer)]

    def test_line_margin(self):
        """line_margin just below/above 0.2 splits/merges the textboxes."""
        # The lines have margin 0.2 relative to the height, so a
        # line_margin of 0.19 should leave each line in its own textbox.
        separate_boxes = self._text_containers(line_margin=0.19)
        self.assertEqual(len(separate_boxes), 3)
        expected_texts = ("Text1\n", "Text2\n", "Text3\n")
        for box, expected in zip(separate_boxes, expected_texts):
            self.assertEqual(box.get_text(), expected)

        # A line_margin of 0.21 should merge the lines into one textbox.
        merged_boxes = self._text_containers(line_margin=0.21)
        self.assertEqual(len(merged_boxes), 1)
        self.assertEqual(merged_boxes[0].get_text(), "Text1\nText2\nText3\n")

    def test_no_boxes_flow(self):
        """With boxes_flow=None the lines form one textbox, in order."""
        boxes = self._text_containers(boxes_flow=None)
        self.assertEqual(len(boxes), 1)
        self.assertEqual(boxes[0].get_text(), "Text1\nText2\nText3\n")


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								import unittest
 								from helpers import absolute_sample_path
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								from pdfminer.high_level import extract_text, extract_pages
 								from pdfminer.layout import LAParams, LTTextContainer
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								def run_with_string(sample_path, laparams=None):
 								    if laparams is None:
 								        laparams = {}
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    absolute_path = absolute_sample_path(sample_path)
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								    s = extract_text(absolute_path, laparams=LAParams(**laparams))
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    return s
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								def run_with_file(sample_path):
 								    absolute_path = absolute_sample_path(sample_path)
 								    with open(absolute_path, "rb") as in_file:
 								        s = extract_text(in_file)
 								    return s
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								test_strings = {
-												Enforce pep8 coding-style (#345)

* Code Refactor: Use code-style enforcement #312

* Add flake8 to travis-ci

* Remove python 2 3 comment on six library. 891 errors > 870 errors.

* Remove class and functions comments that consist of just the name. 870 errors > 855 errors.

* Fix flake8 errors in pdftypes.py. 855 errors > 833 errors.

* Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before committing

* Cleanup pdfinterp.py and add documentation from PDF Reference

* Cleanup pdfpage.py

* Cleanup pdffont.py

* Clean psparser.py

* Cleanup high_level.py

* Cleanup layout.py

* Cleanup pdfparser.py

* Cleanup pdfcolor.py

* Cleanup rijndael.py

* Cleanup converter.py

* Rename klass to cls if it is the class variable, to be more consistent with standard practice

* Cleanup cmap.py

* Cleanup pdfdevice.py

* flake8 ignore fontmetrics.py

* Cleanup test_pdfminer_psparser.py

* Fix flake8 in pdfdocument.py; 339 errors to go

* Fix flake8 utils.py; 326 errors to go

* pep8 correction for few files in /tools/ 328 > 160 to go (#342)

* pep8 correction for few files in /tools/ 328 > 160 to go

* pep8 correction: 160 > 5 to go

* Fix ascii85.py errors

* Fix error in getting index from target that does not exist

* Remove commented print lines

* Fix flake8 error in pdfinterp.py

* Fix python2 specific error by removing argument from print statement

* Ignore invalid python2 syntax

* Update contributing.md

* Added changelog

* Remove unused import

Co-authored-by: Fakabbir Amin <f4amin@gmail.com>

											
										
										
											2019-12-29 20:20:20 +00:00
+								    "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
 								                   "H e l l o  \n\nW o r l d\n\n"
 								                   "H e l l o  \n\nW o r l d\n\n\f",
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    "simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
 								                                 "H e l l o  \n\nW o r l d\n\n"
 								                                 "H e l l o  \n\nW o r l d\n\n\f",
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    "simple2.pdf": "\f",
-												Always try to get CMap, even if name is not recognized (#438)

* Add trying to get cmap from pickle file. And cleaning up a bit.

* Don't use keyword argument for dict.get

* Add docs

* Make _get_cmap_name static

* Add test

* Add CHANGELOG.md

* Remove identity mappings from IDENTITY_ENCODER because that's now the default if the key is not in there

* Add CJK characters to expected output of simple3.pdf

* Fix line length

* Add comment
											
										
										
											2020-07-23 18:27:38 +00:00
+								    "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
 								                   "World\n\nWorld\n\n\f",
-												Fix extraction of some cjk characters (#593)

Fixes #566 

* try to fix the issue where some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-08-26 19:05:03 +00:00
+								    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
 								    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
 								    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								}
 								class TestExtractText(unittest.TestCase):
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple1_with_string(self):
 								        test_file = "simple1.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								    def test_simple1_no_boxes_flow(self):
 								        test_file = "simple1.pdf"
 								        s = run_with_string(test_file, laparams={"boxes_flow": None})
 								        self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple2_with_string(self):
 								        test_file = "simple2.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
 								    def test_simple3_with_string(self):
 								        test_file = "simple3.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    def test_simple4_with_string(self):
 								        test_file = "simple4.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple1_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple1.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple2_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple2.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple3_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple3.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    def test_simple4_with_file(self):
 								        test_file = "simple4.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix extraction of some cjk characters (#593)

Fixes #566 

* try to fix the issue where some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-08-26 19:05:03 +00:00
+								    def test_issue_566_cmap_bytes(self):
 								        test_file = "contrib/issue_566_test_1.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s.strip(), test_strings[test_file])
 								    def test_issue_566_cid_range(self):
 								        test_file = "contrib/issue_566_test_2.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s.strip(), test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
 								class TestExtractPages(unittest.TestCase):
 								    def _get_test_file_path(self):
 								        test_file = "simple4.pdf"
 								        return absolute_sample_path(test_file)
 								    def test_line_margin(self):
 								        # The lines have margin 0.2 relative to the height.
 								        # Extract with line_margin 0.19 should break into 3 separate textboxes.
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(line_margin=0.19)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 3)
 								        self.assertEqual(elements[0].get_text(), "Text1\n")
 								        self.assertEqual(elements[1].get_text(), "Text2\n")
 								        self.assertEqual(elements[2].get_text(), "Text3\n")
 								        # Extract with line_margin 0.21 should merge into one textbox.
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(line_margin=0.21)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 1)
 								        self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
 								    def test_no_boxes_flow(self):
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(boxes_flow=None)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 1)
 								        self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
 								if __name__ == "__main__":
 								    unittest.main()