Fix grouping textlines when bounding box of parent container is wrong (#386)
* Default value for --all-texts should be false, because using the flag enables it
* Fix edge case: when no neighbors are found a line should form its own text box
* Added test for grouping textlines where 1 is outside the parent bounding box
* Added CHANGELOG.md line
pull/389/head^2
parent
7e91d4ec6d
commit
1d773dc38a
|
@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
Nothing
|
### Fixed
|
||||||
|
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||||
|
|
||||||
## [20200124] - 2020-01-24
|
## [20200124] - 2020-01-24
|
||||||
|
|
||||||
|
|
|
@ -597,16 +597,14 @@ class LTLayoutContainer(LTContainer):
|
||||||
yield line
|
yield line
|
||||||
return
|
return
|
||||||
|
|
||||||
# group_textlines: group neighboring lines to textboxes.
|
|
||||||
def group_textlines(self, laparams, lines):
|
def group_textlines(self, laparams, lines):
|
||||||
|
"""Group neighboring lines to textboxes"""
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(lines)
|
plane.extend(lines)
|
||||||
boxes = {}
|
boxes = {}
|
||||||
for line in lines:
|
for line in lines:
|
||||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||||
if line not in neighbors:
|
members = [line]
|
||||||
continue
|
|
||||||
members = []
|
|
||||||
for obj1 in neighbors:
|
for obj1 in neighbors:
|
||||||
members.append(obj1)
|
members.append(obj1)
|
||||||
if obj1 in boxes:
|
if obj1 in boxes:
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
|
||||||
|
|
||||||
|
|
||||||
|
class TestGroupTextLines(unittest.TestCase):
|
||||||
|
def test_parent_with_wrong_bbox_returns_non_empty_neighbour_list(self):
|
||||||
|
"""
|
||||||
|
LTLayoutContainer.group_textlines() should return all the lines in
|
||||||
|
separate LTTextBoxes if they do not overlap, even when the bounding box
|
||||||
|
of the parent container does not contain all the lines.
|
||||||
|
"""
|
||||||
|
laparams = LAParams()
|
||||||
|
layout = LTLayoutContainer((0, 0, 50, 50))
|
||||||
|
line1 = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
line1.set_bbox((0, 0, 50, 5))
|
||||||
|
line2 = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
line2.set_bbox((0, 50, 50, 55))
|
||||||
|
lines = [line1, line2]
|
||||||
|
|
||||||
|
textboxes = list(layout.group_textlines(laparams, lines))
|
||||||
|
|
||||||
|
self.assertEqual(len(textboxes), 2)
|
|
@ -122,7 +122,7 @@ def maketheparser():
|
||||||
"should be within the range of -1.0 (only horizontal position "
|
"should be within the range of -1.0 (only horizontal position "
|
||||||
"matters) to +1.0 (only vertical position matters).")
|
"matters) to +1.0 (only vertical position matters).")
|
||||||
la_params.add_argument(
|
la_params.add_argument(
|
||||||
"--all-texts", "-A", default=True, action="store_true",
|
"--all-texts", "-A", default=False, action="store_true",
|
||||||
help="If layout analysis should be performed on text in figures.")
|
help="If layout analysis should be performed on text in figures.")
|
||||||
|
|
||||||
output_params = parser.add_argument_group(
|
output_params = parser.add_argument_group(
|
||||||
|
|
Loading…
Reference in New Issue