Fix grouping textlines when bounding box of parent container is wrong (#386)

* Default value for --all-texts should be false, because using the flag enables it

* Fix edge case: when no neighbors are found, a line should form its own text box

* Added test for grouping textlines where one line is outside the parent bounding box

* Added CHANGELOG.md line
pull/389/head^2
Pieter Marsman 2020-03-14 10:33:39 +01:00 committed by GitHub
parent 7e91d4ec6d
commit 1d773dc38a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 28 additions and 6 deletions

View File

@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased] ## [Unreleased]
Nothing ### Fixed
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
## [20200124] - 2020-01-24 ## [20200124] - 2020-01-24

View File

@ -597,16 +597,14 @@ class LTLayoutContainer(LTContainer):
yield line yield line
return return
# group_textlines: group neighboring lines to textboxes.
def group_textlines(self, laparams, lines): def group_textlines(self, laparams, lines):
"""Group neighboring lines to textboxes"""
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(lines) plane.extend(lines)
boxes = {} boxes = {}
for line in lines: for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin) neighbors = line.find_neighbors(plane, laparams.line_margin)
if line not in neighbors: members = [line]
continue
members = []
for obj1 in neighbors: for obj1 in neighbors:
members.append(obj1) members.append(obj1)
if obj1 in boxes: if obj1 in boxes:

23
tests/test_layout.py Normal file
View File

@ -0,0 +1,23 @@
import unittest
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
class TestGroupTextLines(unittest.TestCase):
    """Tests for LTLayoutContainer.group_textlines()."""

    def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self):
        """
        Every non-overlapping line should end up in its own text box, even
        when the bounding box of the parent container does not contain all
        of the lines.

        The container is created with bbox (0, 0, 50, 50), while the second
        line is given bbox (0, 50, 50, 55); two lines go in and two text
        boxes are expected back.
        """
        params = LAParams()
        container = LTLayoutContainer((0, 0, 50, 50))

        # Build the lines in the same order as their bounding boxes are
        # listed; the second bbox is the one the parent does not contain.
        text_lines = []
        for bbox in ((0, 0, 50, 5), (0, 50, 50, 55)):
            text_line = LTTextLineHorizontal(params.word_margin)
            text_line.set_bbox(bbox)
            text_lines.append(text_line)

        grouped = list(container.group_textlines(params, text_lines))
        self.assertEqual(len(grouped), 2)

View File

@ -122,7 +122,7 @@ def maketheparser():
"should be within the range of -1.0 (only horizontal position " "should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters).") "matters) to +1.0 (only vertical position matters).")
la_params.add_argument( la_params.add_argument(
"--all-texts", "-A", default=True, action="store_true", "--all-texts", "-A", default=False, action="store_true",
help="If layout analysis should be performed on text in figures.") help="If layout analysis should be performed on text in figures.")
output_params = parser.add_argument_group( output_params = parser.add_argument_group(