diff --git a/CHANGELOG.md b/CHANGELOG.md index b4c8d3f..fc6bcbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408) +### Fixed +- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411)) ## [20200402] diff --git a/pdfminer/layout.py b/pdfminer/layout.py index cc0c88d..8bce26b 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -798,6 +798,9 @@ class LTLayoutContainer(LTContainer): obj.analyze(laparams) textboxes = list(self.group_textlines(laparams, textlines)) if laparams.boxes_flow is None: + for textbox in textboxes: + textbox.analyze(laparams) + def getkey(box): if isinstance(box, LTTextBoxVertical): return (0, -box.x1, -box.y0) diff --git a/samples/simple4.pdf b/samples/simple4.pdf new file mode 100644 index 0000000..14b6fc5 Binary files /dev/null and b/samples/simple4.pdf differ diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 9ed7224..86788a0 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -1,8 +1,8 @@ import unittest from helpers import absolute_sample_path -from pdfminer.high_level import extract_text -from pdfminer.layout import LAParams +from pdfminer.high_level import extract_text, extract_pages +from pdfminer.layout import LAParams, LTTextContainer def run_with_string(sample_path, laparams=None): @@ -24,11 +24,12 @@ test_strings = { "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" "H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n\f", - "simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n" - "H e l l o \nW o r l d\n" - "H e l l o \nW o r l d\n\f", + "simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" + "H e l l o \n\nW o r l d\n\n" + "H e l l o \n\nW 
o r l d\n\n\f", "simple2.pdf": "\f", "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f", + "simple4.pdf": "Text1\nText2\nText3\n\n\f" } @@ -53,6 +54,11 @@ class TestExtractText(unittest.TestCase): s = run_with_string(test_file) self.assertEqual(s, test_strings[test_file]) + def test_simple4_with_string(self): + test_file = "simple4.pdf" + s = run_with_string(test_file) + self.assertEqual(s, test_strings[test_file]) + def test_simple1_with_file(self): test_file = "simple1.pdf" s = run_with_file(test_file) @@ -68,6 +74,54 @@ class TestExtractText(unittest.TestCase): s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file]) + def test_simple4_with_file(self): + test_file = "simple4.pdf" + s = run_with_file(test_file) + self.assertEqual(s, test_strings[test_file]) + + +class TestExtractPages(unittest.TestCase): + def _get_test_file_path(self): + test_file = "simple4.pdf" + return absolute_sample_path(test_file) + + def test_line_margin(self): + # The lines have margin 0.2 relative to the height. + # Extracting with line_margin 0.19 should give 3 separate textboxes. + pages = list(extract_pages( + self._get_test_file_path(), laparams=LAParams(line_margin=0.19))) + self.assertEqual(len(pages), 1) + page = pages[0] + + elements = [element for element in page + if isinstance(element, LTTextContainer)] + self.assertEqual(len(elements), 3) + self.assertEqual(elements[0].get_text(), "Text1\n") + self.assertEqual(elements[1].get_text(), "Text2\n") + self.assertEqual(elements[2].get_text(), "Text3\n") + + # Extracting with line_margin 0.21 should merge into one textbox.
+ pages = list(extract_pages( + self._get_test_file_path(), laparams=LAParams(line_margin=0.21))) + self.assertEqual(len(pages), 1) + page = pages[0] + + elements = [element for element in page + if isinstance(element, LTTextContainer)] + self.assertEqual(len(elements), 1) + self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n") + + def test_no_boxes_flow(self): + pages = list(extract_pages( + self._get_test_file_path(), laparams=LAParams(boxes_flow=None))) + self.assertEqual(len(pages), 1) + page = pages[0] + + elements = [element for element in page + if isinstance(element, LTTextContainer)] + self.assertEqual(len(elements), 1) + self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n") + if __name__ == "__main__": unittest.main()