Also group center-aligned text lines in addition to left-aligned and right-aligned text lines (#382) (#384)

* Group text lines if they are centered (#382)

Closes #382

* Add comparison private methods to LTTextLines

* Add missing docstrings

* Add tests for find_neighbors

* Update changelog

* Cosmetic changes from code review
pull/393/head
Jake Stockwin 2020-03-23 21:38:39 +00:00 committed by GitHub
parent 9d7fe2d9ee
commit 1cc1b961c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 166 additions and 13 deletions

View File

@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
### Changed
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
## [20200124] - 2020-01-24
### Security

View File

@ -416,13 +416,44 @@ class LTTextLineHorizontal(LTTextLine):
return
def find_neighbors(self, plane, ratio):
"""
Finds neighboring LTTextLineHorizontals in the plane.
Returns a list of LTTextLineHorizontals in the plane which are
close to self. "Close" is controlled by ratio: the search area is the
bounding box of self extended by d = ratio * self.height below y0 and
above y1, and the same d is used as the tolerance for the height and
alignment checks. The returned objects will be the same height as self
(within d), and also either left-, right-, or centrally-aligned with
self (within d).
"""
d = ratio * self.height
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d))]
self._is_same_height_as(obj, tolerance=d) and
(self._is_left_aligned_with(obj, tolerance=d) or
self._is_right_aligned_with(obj, tolerance=d) or
self._is_centrally_aligned_with(obj, tolerance=d)))]
def _is_left_aligned_with(self, other, tolerance=0):
"""
Whether the left-hand edge (x0) of `other` is within `tolerance` of the
left-hand edge of self. The bound is inclusive, so with the default
tolerance of 0 the two edges must be equal.
"""
return abs(other.x0 - self.x0) <= tolerance
def _is_right_aligned_with(self, other, tolerance=0):
"""
Whether the right-hand edge (x1) of `other` is within `tolerance` of the
right-hand edge of self. The bound is inclusive, so with the default
tolerance of 0 the two edges must be equal.
"""
return abs(other.x1 - self.x1) <= tolerance
def _is_centrally_aligned_with(self, other, tolerance=0):
"""
Whether the horizontal center of `other` is within `tolerance` of the
horizontal center of self. The center is the midpoint of x0 and x1, and
the bound is inclusive.
"""
return abs(
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
def _is_same_height_as(self, other, tolerance):
"""
Whether the height of `other` is within `tolerance` of the height of
self. The bound is inclusive. Unlike the alignment checks, `tolerance`
has no default and must be passed explicitly.
"""
return abs(other.height - self.height) <= tolerance
class LTTextLineVertical(LTTextLine):
@ -441,13 +472,44 @@ class LTTextLineVertical(LTTextLine):
return
def find_neighbors(self, plane, ratio):
"""
Finds neighboring LTTextLineVerticals in the plane.
Returns a list of LTTextLineVerticals in the plane which are
close to self. "Close" is controlled by ratio: the search area is the
bounding box of self extended by d = ratio * self.width to the left of
x0 and to the right of x1, and the same d is used as the tolerance for
the width and alignment checks. The returned objects will be the same
width as self (within d), and also either upper-, lower-, or
centrally-aligned with self (within d).
"""
d = ratio * self.width
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
return [obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and
abs(obj.width-self.width) < d and
(abs(obj.y0-self.y0) < d or
abs(obj.y1-self.y1) < d))]
self._is_same_width_as(obj, tolerance=d) and
(self._is_lower_aligned_with(obj, tolerance=d) or
self._is_upper_aligned_with(obj, tolerance=d) or
self._is_centrally_aligned_with(obj, tolerance=d)))]
def _is_lower_aligned_with(self, other, tolerance=0):
"""
Whether the lower edge (y0) of `other` is within `tolerance` of the
lower edge of self. The bound is inclusive, so with the default
tolerance of 0 the two edges must be equal.
"""
return abs(other.y0 - self.y0) <= tolerance
def _is_upper_aligned_with(self, other, tolerance=0):
"""
Whether the upper edge (y1) of `other` is within `tolerance` of the
upper edge of self. The bound is inclusive, so with the default
tolerance of 0 the two edges must be equal.
"""
return abs(other.y1 - self.y1) <= tolerance
def _is_centrally_aligned_with(self, other, tolerance=0):
"""
Whether the vertical center of `other` is within `tolerance` of the
vertical center of self. The center is the midpoint of y0 and y1, and
the bound is inclusive.
"""
return abs(
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
def _is_same_width_as(self, other, tolerance):
"""
Whether the width of `other` is within `tolerance` of the width of
self. The bound is inclusive. Unlike the alignment checks, `tolerance`
has no default and must be passed explicitly.
"""
return abs(other.width - self.width) <= tolerance
class LTTextBox(LTTextContainer):

View File

@ -1,6 +1,12 @@
import unittest
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
from pdfminer.layout import (
LTLayoutContainer,
LAParams,
LTTextLineHorizontal,
LTTextLineVertical,
)
from pdfminer.utils import Plane
class TestGroupTextLines(unittest.TestCase):
@ -21,3 +27,85 @@ class TestGroupTextLines(unittest.TestCase):
textboxes = list(layout.group_textlines(laparams, lines))
self.assertEqual(len(textboxes), 2)
# NOTE(review): "Neigbors" in the class name looks like a typo for
# "Neighbors"; left unchanged here.
class TestFindNeigbors(unittest.TestCase):
"""
Tests for find_neighbors on horizontal and vertical text lines.
Each test places a reference line and several candidate lines in a
Plane and checks which of them find_neighbors returns.
"""
def test_find_neighbors_horizontal(self):
"""
Left-, right- and centrally-aligned lines of matching height are
returned; a misaligned line and a line of different height are not.
"""
laparams = LAParams()
plane = Plane((0, 0, 50, 50))
# Reference line. The bbox tuples are presumably (x0, y0, x1, y1),
# matching the attribute names used by find_neighbors - TODO confirm.
line = LTTextLineHorizontal(laparams.word_margin)
line.set_bbox((10, 4, 20, 6))
plane.add(line)
# Same first x coordinate (10) as the reference line.
left_aligned_above = LTTextLineHorizontal(laparams.word_margin)
left_aligned_above.set_bbox((10, 6, 15, 8))
plane.add(left_aligned_above)
# Same second x coordinate (20) as the reference line.
right_aligned_below = LTTextLineHorizontal(laparams.word_margin)
right_aligned_below.set_bbox((15, 2, 20, 4))
plane.add(right_aligned_below)
# Midpoint of 13 and 17 is 15, the same as the midpoint of 10 and 20.
centrally_aligned_overlapping = LTTextLineHorizontal(
laparams.word_margin)
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
plane.add(centrally_aligned_overlapping)
# Shares neither x coordinate nor x midpoint with the reference line.
not_aligned = LTTextLineHorizontal(laparams.word_margin)
not_aligned.set_bbox((0, 6, 5, 8))
plane.add(not_aligned)
# Same first x coordinate, but spans 4 units in y instead of 2.
wrong_height = LTTextLineHorizontal(laparams.word_margin)
wrong_height.set_bbox((10, 6, 15, 10))
plane.add(wrong_height)
neighbors = line.find_neighbors(plane, laparams.line_margin)
# The reference line itself is expected among its own neighbors.
self.assertCountEqual(
neighbors,
[
line,
left_aligned_above,
right_aligned_below,
centrally_aligned_overlapping,
],
)
def test_find_neighbors_vertical(self):
"""
Lower-, upper- and centrally-aligned lines of matching width are
returned; a misaligned line and a line of different width are not.
"""
laparams = LAParams()
plane = Plane((0, 0, 50, 50))
# Reference line. The bbox tuples are presumably (x0, y0, x1, y1),
# matching the attribute names used by find_neighbors - TODO confirm.
line = LTTextLineVertical(laparams.word_margin)
line.set_bbox((4, 10, 6, 20))
plane.add(line)
# Same first y coordinate (10) as the reference line.
bottom_aligned_right = LTTextLineVertical(laparams.word_margin)
bottom_aligned_right.set_bbox((6, 10, 8, 15))
plane.add(bottom_aligned_right)
# Same second y coordinate (20) as the reference line.
top_aligned_left = LTTextLineVertical(laparams.word_margin)
top_aligned_left.set_bbox((2, 15, 4, 20))
plane.add(top_aligned_left)
# Midpoint of 13 and 17 is 15, the same as the midpoint of 10 and 20.
centrally_aligned_overlapping = LTTextLineVertical(
laparams.word_margin)
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
plane.add(centrally_aligned_overlapping)
# Shares neither y coordinate nor y midpoint with the reference line.
not_aligned = LTTextLineVertical(laparams.word_margin)
not_aligned.set_bbox((6, 0, 8, 5))
plane.add(not_aligned)
# Same first y coordinate, but spans 4 units in x instead of 2.
wrong_width = LTTextLineVertical(laparams.word_margin)
wrong_width.set_bbox((6, 10, 10, 15))
plane.add(wrong_width)
neighbors = line.find_neighbors(plane, laparams.line_margin)
# The reference line itself is expected among its own neighbors.
self.assertCountEqual(
neighbors,
[
line,
bottom_aligned_right,
top_aligned_left,
centrally_aligned_overlapping,
],
)