# pdfminer.six/tests/test_highlevel_extracttext.py

import unittest

from helpers import absolute_sample_path
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LAParams, LTTextContainer


def run_with_string(sample_path, laparams=None):
    """Extract text from a sample PDF by handing its path to extract_text.

    :param sample_path: sample file name, resolved with
        ``absolute_sample_path``.
    :param laparams: optional dict of keyword arguments used to build the
        ``LAParams`` instance; ``None`` means no keyword arguments.
    :return: the value returned by ``extract_text``.
    """
    pdf_path = absolute_sample_path(sample_path)
    layout_kwargs = {} if laparams is None else laparams
    return extract_text(pdf_path, laparams=LAParams(**layout_kwargs))


def run_with_file(sample_path):
    """Extract text from a sample PDF through a binary file object.

    :param sample_path: sample file name, resolved with
        ``absolute_sample_path``.
    :return: the value returned by ``extract_text`` for the opened file.
    """
    with open(absolute_sample_path(sample_path), "rb") as pdf_stream:
        return extract_text(pdf_stream)


# Expected extraction results, keyed by sample file name. The
# "simple1.pdf_no_boxes_flow" entry is the expectation used when simple1.pdf
# is extracted with boxes_flow=None.
test_strings = {
    "simple1.pdf": (
        "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "H e l l o  \n\nW o r l d\n\n\f"
    ),
    "simple1.pdf_no_boxes_flow": (
        "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
        "H e l l o  \n\nW o r l d\n\n"
        "H e l l o  \n\nW o r l d\n\n\f"
    ),
    "simple2.pdf": "\f",
    "simple3.pdf": (
        "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
        "World\n\nWorld\n\n\f"
    ),
    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
    "simple5.pdf": (
        "Heading\n\n"
        "Link to heading that is working with vim-pandoc.\n\n"
        "Link to heading “that is” not working with vim-pandoc.\n\n"
        "Subheading\n\nSome “more text”\n\n1\n\n\f"
    ),
    # The entries below are compared against only part of the extracted
    # text (a prefix, the stripped text, or a single line) by their tests.
    "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
    "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
}


class TestExtractText(unittest.TestCase):
    """Compare extract_text output for the sample PDFs with test_strings.

    The ``*_with_string`` tests go through ``run_with_string`` and the
    ``*_with_file`` tests go through ``run_with_file``.
    """

    def test_simple1_with_string(self):
        sample = "simple1.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple1_no_boxes_flow(self):
        no_flow = {"boxes_flow": None}
        text = run_with_string("simple1.pdf", laparams=no_flow)
        self.assertEqual(text, test_strings["simple1.pdf_no_boxes_flow"])

    def test_simple2_with_string(self):
        sample = "simple2.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple3_with_string(self):
        sample = "simple3.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple4_with_string(self):
        sample = "simple4.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple5_with_string(self):
        sample = "simple5.pdf"
        self.assertEqual(run_with_string(sample), test_strings[sample])

    def test_simple1_with_file(self):
        sample = "simple1.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple2_with_file(self):
        sample = "simple2.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple3_with_file(self):
        sample = "simple3.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple4_with_file(self):
        sample = "simple4.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_simple5_with_file(self):
        sample = "simple5.pdf"
        self.assertEqual(run_with_file(sample), test_strings[sample])

    def test_zlib_corrupted(self):
        sample = "zen_of_python_corrupted.pdf"
        text = run_with_file(sample)
        prefix = test_strings[sample]
        # Only the leading len(prefix) characters of the output are compared.
        self.assertEqual(text[:len(prefix)], prefix)

    def test_issue_566_cmap_bytes(self):
        sample = "contrib/issue_566_test_1.pdf"
        stripped = run_with_file(sample).strip()
        self.assertEqual(stripped, test_strings[sample])

    def test_issue_566_cid_range(self):
        sample = "contrib/issue_566_test_2.pdf"
        stripped = run_with_file(sample).strip()
        self.assertEqual(stripped, test_strings[sample])

    def test_issue_625_identity_cmap(self):
        sample = "contrib/issue-625-identity-cmap.pdf"
        text = run_with_file(sample)
        # The expectation is checked against the seventh output line only.
        self.assertEqual(text.splitlines()[6], test_strings[sample])


class TestExtractPages(unittest.TestCase):
    """Check the text containers extract_pages yields for simple4.pdf."""

    def _get_test_file_path(self):
        return absolute_sample_path("simple4.pdf")

    def _text_containers(self, **layout_options):
        """Extract the sample using ``LAParams(**layout_options)``.

        Asserts that exactly one page is produced and returns that page's
        ``LTTextContainer`` elements as a list.
        """
        pages = list(extract_pages(
            self._get_test_file_path(),
            laparams=LAParams(**layout_options)))
        self.assertEqual(len(pages), 1)
        return [item for item in pages[0]
                if isinstance(item, LTTextContainer)]

    def test_line_margin(self):
        # The lines have margin 0.2 relative to the height, so a
        # line_margin of 0.19 should leave 3 separate textboxes.
        separate = self._text_containers(line_margin=0.19)
        self.assertEqual(len(separate), 3)
        for box, expected in zip(separate, ("Text1\n", "Text2\n", "Text3\n")):
            self.assertEqual(box.get_text(), expected)

        # A line_margin of 0.21 should merge them into one textbox.
        merged = self._text_containers(line_margin=0.21)
        self.assertEqual(len(merged), 1)
        self.assertEqual(merged[0].get_text(), "Text1\nText2\nText3\n")

    def test_no_boxes_flow(self):
        boxes = self._text_containers(boxes_flow=None)
        self.assertEqual(len(boxes), 1)
        self.assertEqual(boxes[0].get_text(), "Text1\nText2\nText3\n")


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								import unittest
 								from helpers import absolute_sample_path
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								from pdfminer.high_level import extract_text, extract_pages
 								from pdfminer.layout import LAParams, LTTextContainer
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								def run_with_string(sample_path, laparams=None):
 								    if laparams is None:
 								        laparams = {}
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    absolute_path = absolute_sample_path(sample_path)
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								    s = extract_text(absolute_path, laparams=LAParams(**laparams))
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    return s
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								def run_with_file(sample_path):
 								    absolute_path = absolute_sample_path(sample_path)
 								    with open(absolute_path, "rb") as in_file:
 								        s = extract_text(in_file)
 								    return s
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								test_strings = {
-												Enforce pep8 coding-style (#345)

* Code Refactor: Use code-style enforcement #312

* Add flake8 to travis-ci

* Remove python 2 3 comment on six library. 891 errors > 870 errors.

* Remove class and functions comments that consist of just the name. 870 errors > 855 errors.

* Fix flake8 errors in pdftypes.py. 855 errors > 833 errors.

* Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before committing

* Cleanup pdfinterp.py and add documentation from PDF Reference

* Cleanup pdfpage.py

* Cleanup pdffont.py

* Clean psparser.py

* Cleanup high_level.py

* Cleanup layout.py

* Cleanup pdfparser.py

* Cleanup pdfcolor.py

* Cleanup rijndael.py

* Cleanup converter.py

* Rename klass to cls if it is the class variable, to be more consistent with standard practice

* Cleanup cmap.py

* Cleanup pdfdevice.py

* flake8 ignore fontmetrics.py

* Cleanup test_pdfminer_psparser.py

* Fix flake8 in pdfdocument.py; 339 errors to go

* Fix flake8 utils.py; 326 errors to go

* pep8 correction for few files in /tools/ 328 > 160 to go (#342)

* pep8 correction for few files in /tools/ 328 > 160 to go

* pep8 correction: 160 > 5 to go

* Fix ascii85.py errors

* Fix error in getting index from target that does not exist

* Remove commented print lines

* Fix flake8 error in pdfinterp.py

* Fix python2 specific error by removing argument from print statement

* Ignore invalid python2 syntax

* Update contributing.md

* Added changelog

* Remove unused import

Co-authored-by: Fakabbir Amin <f4amin@gmail.com>

											
										
										
											2019-12-29 20:20:20 +00:00
+								    "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
 								                   "H e l l o  \n\nW o r l d\n\n"
 								                   "H e l l o  \n\nW o r l d\n\n\f",
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    "simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
 								                                 "H e l l o  \n\nW o r l d\n\n"
 								                                 "H e l l o  \n\nW o r l d\n\n\f",
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								    "simple2.pdf": "\f",
-												Always try to get CMap, even if name is not recognized (#438)

* Add trying to get cmap from pickle file. And cleaning up a bit.

* Don't use keyword argument for dict.get

* Add docs

* Make _get_cmap_name static

* Add test

* Add CHANGELOG.md

* Remove identity mappings from IDENTITY_ENCODER because that's now the default if the key is not in there

* Add CJK characters to expected output of simple3.pdf

* Fix line length

* Add comment
											
										
										
											2020-07-23 18:27:38 +00:00
+								    "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
 								                   "World\n\nWorld\n\n\f",
-												Fix extraction of some cjk characters (#593)

Fixes #566 

* try to fix issue of some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-08-26 19:05:03 +00:00
+								    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
-												Fix regression in page layout that sometimes returned text lines out of order (#659)

* add a test

* fix the bug

* rewrap long lines

* update CHANGELOG

* re-merge CHANGELOG

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2022-01-26 18:55:08 +00:00
+								    "simple5.pdf": "Heading\n\n"
 								                   "Link to heading that is working with vim-pandoc.\n\n"
 								                   "Link to heading “that is” not working with vim-pandoc.\n\n"
 								                   "Subheading\n\nSome “more text”\n\n1\n\n\f",
-												Attempt to handle decompression error on some broken PDF files (#637)

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by https://github.com/mstamy2/PyPDF2/issues/422
and the test file has been taken from there, so credits to @zegrep.

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by mstamy2/PyPDF2#422
and the test file has been taken from there, so credits to @zegrep.

* Use a warning instead of raising an exception

where zlib error is detected before the CRC checksum.

* Add line to CHANGELOG.md

* Only try decompressing if not in strict mode

* Change error into warning because warning.warn needs a subclass of Warning

Co-authored-by: Sylvain Thénault <sylvain.thenault@lowatt.fr>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-12-11 17:25:19 +00:00
+								    "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
-												Fix extraction of some cjk characters (#593)

Fixes #566 

* try to fix issue of some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-08-26 19:05:03 +00:00
+								    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
 								    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
-												Add support identity unicode cmap (#626)

Fixes #625 

* add support for Identity-H/V cmap fonts

* format code to pass flake8 check

* Remove indent

* Remove indent

* Use isinstance instead of type check

* Use or instead of any

* Use str in variable, instead of str.find()

* Fix mypy error: add typing annotations to get_unichr()

* Fix type of PDFCIDFont. Can be any type of CMapBase.

This is a quick fix, the entire cmap structure does not have proper inheritance.

* Added line to CHANGELOG.md

* Add separate class for IdentityUnicodeMap

* Remove ABC from CmapBase

* Remove ABC from CmapBase

* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-10-13 19:52:00 +00:00
+								    "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								}
 								class TestExtractText(unittest.TestCase):
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple1_with_string(self):
 								        test_file = "simple1.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
											
										
										
											2020-04-01 11:37:04 +00:00
+								    def test_simple1_no_boxes_flow(self):
 								        test_file = "simple1.pdf"
 								        s = run_with_string(test_file, laparams={"boxes_flow": None})
 								        self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple2_with_string(self):
 								        test_file = "simple2.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
 								    def test_simple3_with_string(self):
 								        test_file = "simple3.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    def test_simple4_with_string(self):
 								        test_file = "simple4.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix regression in page layout that sometimes returned text lines out of order (#659)

* add a test

* fix the bug

* rewrap long lines

* update CHANGELOG

* re-merge CHANGELOG

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2022-01-26 18:55:08 +00:00
+								    def test_simple5_with_string(self):
 								        test_file = "simple5.pdf"
 								        s = run_with_string(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple1_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple1.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple2_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple2.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								    def test_simple3_with_file(self):
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        test_file = "simple3.pdf"
-												Fix #392 Split out IO logic from high level functions (#393)

* Allow file-like inputs to high level functions (#392)

* PR Review - move open_filename to utils
											
										
										
											2020-03-26 21:52:00 +00:00
+								        s = run_with_file(test_file)
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
+								        self.assertEqual(s, test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
+								    def test_simple4_with_file(self):
 								        test_file = "simple4.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Fix regression in page layout that sometimes returned text lines out of order (#659)

* add a test

* fix the bug

* rewrap long lines

* update CHANGELOG

* re-merge CHANGELOG

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2022-01-26 18:55:08 +00:00
+								    def test_simple5_with_file(self):
 								        test_file = "simple5.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s, test_strings[test_file])
-												Attempt to handle decompression error on some broken PDF files (#637)

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by https://github.com/mstamy2/PyPDF2/issues/422
and the test file has been taken from there, so credits to @zegrep.

* Attempt to handle decompression error on some broken PDF files

From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After digging, it turned out this is because the
PDF includes some badly compressed data. This may be fixed by decompressing byte
by byte and ignoring the error on the last check bytes (arbitrarily found to be
the last 3).

This has been largely inspired by mstamy2/PyPDF2#422
and the test file has been taken from there, so credits to @zegrep.

* Use a warning instead of raising an exception

where zlib error is detected before the CRC checksum.

* Add line to CHANGELOG.md

* Only try decompressing if not in strict mode

* Change error into warning because warning.warn needs a subclass of Warning

Co-authored-by: Sylvain Thénault <sylvain.thenault@lowatt.fr>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-12-11 17:25:19 +00:00
+								    def test_zlib_corrupted(self):
 								        test_file = "zen_of_python_corrupted.pdf"
 								        s = run_with_file(test_file)
 								        expected = test_strings[test_file]
 								        self.assertEqual(s[:len(expected)], expected)
-												Fix extraction of some cjk characters (#593)

Fixes #566 

* try to fix issue of some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-08-26 19:05:03 +00:00
+								    def test_issue_566_cmap_bytes(self):
 								        test_file = "contrib/issue_566_test_1.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s.strip(), test_strings[test_file])
 								    def test_issue_566_cid_range(self):
 								        test_file = "contrib/issue_566_test_2.pdf"
 								        s = run_with_file(test_file)
 								        self.assertEqual(s.strip(), test_strings[test_file])
-												Add support identity unicode cmap (#626)

Fixes #625 

* add support for Identity-H/V cmap fonts

* format code to pass flake8 check

* Remove indent

* Remove indent

* Use isinstance instead of type check

* Use or instead of any

* Use str in variable, instead of str.find()

* Fix mypy error: add typing annotations to get_unichr()

* Fix type of PDFCIDFont. Can be any type of CMapBase.

This is a quick fix, the entire cmap structure does not have proper inheritance.

* Added line to CHANGELOG.md

* Add separate class for IdentityUnicodeMap

* Remove ABC from CmapBase

* Remove ABC from CmapBase

* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
											
										
										
											2021-10-13 19:52:00 +00:00
+								    def test_issue_625_identity_cmap(self):
 								        test_file = "contrib/issue-625-identity-cmap.pdf"
 								        lines = run_with_file(test_file).splitlines()
 								        self.assertEqual(lines[6], test_strings[test_file])
-												Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)

* Fix ordering of textlines within a textbox when boxes_flow is disabled

* Add new test PDF sample
											
										
										
											2020-05-09 13:37:49 +00:00
 								class TestExtractPages(unittest.TestCase):
 								    def _get_test_file_path(self):
 								        test_file = "simple4.pdf"
 								        return absolute_sample_path(test_file)
 								    def test_line_margin(self):
 								        # The lines have margin 0.2 relative to the height.
 								        # Extract with line_margin 0.19 should break into 3 separate textboxes.
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(line_margin=0.19)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 3)
 								        self.assertEqual(elements[0].get_text(), "Text1\n")
 								        self.assertEqual(elements[1].get_text(), "Text2\n")
 								        self.assertEqual(elements[2].get_text(), "Text3\n")
 								        # Extract with line_margin 0.21 should merge into one textbox.
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(line_margin=0.21)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 1)
 								        self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
 								    def test_no_boxes_flow(self):
 								        pages = list(extract_pages(
 								            self._get_test_file_path(), laparams=LAParams(boxes_flow=None)))
 								        self.assertEqual(len(pages), 1)
 								        page = pages[0]
 								        elements = [element for element in page
 								                    if isinstance(element, LTTextContainer)]
 								        self.assertEqual(len(elements), 1)
 								        self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
-												Added: simple wrapper to extract text from pdf (#330)

Fixes #327 


											
										
										
											2019-11-07 06:54:10 +00:00
 								if __name__ == "__main__":
 								    unittest.main()