Fix grouping textlines when bounding box of parent container is wrong (#386)
* Default value for --all-texts should be false, because using the flag enables it
* Fix edge case: when no neighbors are found, a line should form its own text box
* Added test for grouping textlines where one is outside the parent bounding box
* Added CHANGELOG.md line
pull/389/head^2
parent
7e91d4ec6d
commit
1d773dc38a
|
@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
Nothing
|
||||
### Fixed
|
||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||
|
||||
## [20200124] - 2020-01-24
|
||||
|
||||
|
|
|
@ -597,16 +597,14 @@ class LTLayoutContainer(LTContainer):
|
|||
yield line
|
||||
return
|
||||
|
||||
# group_textlines: group neighboring lines to textboxes.
|
||||
def group_textlines(self, laparams, lines):
|
||||
"""Group neighboring lines to textboxes"""
|
||||
plane = Plane(self.bbox)
|
||||
plane.extend(lines)
|
||||
boxes = {}
|
||||
for line in lines:
|
||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||
if line not in neighbors:
|
||||
continue
|
||||
members = []
|
||||
members = [line]
|
||||
for obj1 in neighbors:
|
||||
members.append(obj1)
|
||||
if obj1 in boxes:
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
import unittest
|
||||
|
||||
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
|
||||
|
||||
|
||||
class TestGroupTextLines(unittest.TestCase):
|
||||
def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self):
|
||||
"""
|
||||
LTLayoutContainer.group_textlines() should return each line in its own
|
||||
LTTextBox when the lines do not overlap, even when the bounding box
|
||||
of the parent container does not contain all the lines.
|
||||
"""
|
||||
laparams = LAParams()
|
||||
layout = LTLayoutContainer((0, 0, 50, 50))
|
||||
line1 = LTTextLineHorizontal(laparams.word_margin)
|
||||
line1.set_bbox((0, 0, 50, 5))
|
||||
line2 = LTTextLineHorizontal(laparams.word_margin)
|
||||
# line2 spans 50..55 on the second axis, beyond the container's 0..50 bbox.
|
||||
line2.set_bbox((0, 50, 50, 55))
|
||||
lines = [line1, line2]
|
||||
|
||||
textboxes = list(layout.group_textlines(laparams, lines))
|
||||
|
||||
self.assertEqual(len(textboxes), 2)
|
|
@ -122,7 +122,7 @@ def maketheparser():
|
|||
"should be within the range of -1.0 (only horizontal position "
|
||||
"matters) to +1.0 (only vertical position matters).")
|
||||
la_params.add_argument(
|
||||
"--all-texts", "-A", default=True, action="store_true",
|
||||
"--all-texts", "-A", default=False, action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.")
|
||||
|
||||
output_params = parser.add_argument_group(
|
||||
|
|
Loading…
Reference in New Issue