diff --git a/CHANGELOG.md b/CHANGELOG.md index e4cf496..563fc9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] -Nothing +### Fixed +- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386)) ## [20200124] - 2020-01-24 diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 8c22622..312cdec 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -597,16 +597,14 @@ class LTLayoutContainer(LTContainer): yield line return - # group_textlines: group neighboring lines to textboxes. def group_textlines(self, laparams, lines): + """Group neighboring lines to textboxes""" plane = Plane(self.bbox) plane.extend(lines) boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: - continue - members = [] + members = [line] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: diff --git a/tests/test_layout.py b/tests/test_layout.py new file mode 100644 index 0000000..a6788a3 --- /dev/null +++ b/tests/test_layout.py @@ -0,0 +1,23 @@ +import unittest + +from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal + + +class TestGroupTextLines(unittest.TestCase): + def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self): + """ + LTLayoutContainer.group_textlines() should return all the lines in + separate LTTextBoxes if they do not overlap, even when the bounding box + of the parent container does not contain all the lines.
+ """ + laparams = LAParams() + layout = LTLayoutContainer((0, 0, 50, 50)) + line1 = LTTextLineHorizontal(laparams.word_margin) + line1.set_bbox((0, 0, 50, 5)) + line2 = LTTextLineHorizontal(laparams.word_margin) + line2.set_bbox((0, 50, 50, 55)) + lines = [line1, line2] + + textboxes = list(layout.group_textlines(laparams, lines)) + + self.assertEqual(len(textboxes), 2) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index bc22329..2332135 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -122,7 +122,7 @@ def maketheparser(): "should be within the range of -1.0 (only horizontal position " "matters) to +1.0 (only vertical position matters).") la_params.add_argument( - "--all-texts", "-A", default=True, action="store_true", + "--all-texts", "-A", default=False, action="store_true", help="If layout analysis should be performed on text in figures.") output_params = parser.add_argument_group(