Also group center-aligned text lines in addition to left-aligned and right-aligned text lines (#382) (#384)
* Group text lines if they are centered (#382). Closes #382
* Add comparison private methods to LTTextLines
* Add missing docstrings
* Add tests for find_neighbors
* Update changelog
* Cosmetic changes from code review

pull/393/head
parent
9d7fe2d9ee
commit
1cc1b961c5
|
@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
|
||||||
|
|
||||||
## [20200124] - 2020-01-24
|
## [20200124] - 2020-01-24
|
||||||
|
|
||||||
### Security
|
### Security
|
||||||
|
|
|
@ -416,13 +416,44 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
|
"""
|
||||||
|
Finds neighboring LTTextLineHorizontals in the plane.
|
||||||
|
|
||||||
|
Returns a list of other LTTextLineHorizontals in the plane which are
|
||||||
|
close to self. "Close" can be controlled by ratio. The returned objects
|
||||||
|
will be the same height as self, and also either left-, right-, or
|
||||||
|
centrally-aligned.
|
||||||
|
"""
|
||||||
d = ratio * self.height
|
d = ratio * self.height
|
||||||
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
|
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
|
||||||
return [obj for obj in objs
|
return [obj for obj in objs
|
||||||
if (isinstance(obj, LTTextLineHorizontal) and
|
if (isinstance(obj, LTTextLineHorizontal) and
|
||||||
abs(obj.height-self.height) < d and
|
self._is_same_height_as(obj, tolerance=d) and
|
||||||
(abs(obj.x0-self.x0) < d or
|
(self._is_left_aligned_with(obj, tolerance=d) or
|
||||||
abs(obj.x1-self.x1) < d))]
|
self._is_right_aligned_with(obj, tolerance=d) or
|
||||||
|
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||||
|
|
||||||
|
def _is_left_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the left-hand edge of `other` is within `tolerance` of the left-hand edge of `self`.
|
||||||
|
"""
|
||||||
|
return abs(other.x0 - self.x0) <= tolerance
|
||||||
|
|
||||||
|
def _is_right_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the right-hand edge of `other` is within `tolerance` of the right-hand edge of `self`.
|
||||||
|
"""
|
||||||
|
return abs(other.x1 - self.x1) <= tolerance
|
||||||
|
|
||||||
|
def _is_centrally_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the horizontal center of `other` is within `tolerance` of the horizontal center of `self`.
|
||||||
|
"""
|
||||||
|
return abs(
|
||||||
|
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
||||||
|
|
||||||
|
def _is_same_height_as(self, other, tolerance):
|
||||||
|
return abs(other.height - self.height) <= tolerance
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineVertical(LTTextLine):
|
class LTTextLineVertical(LTTextLine):
|
||||||
|
@ -441,13 +472,44 @@ class LTTextLineVertical(LTTextLine):
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
|
"""
|
||||||
|
Finds neighboring LTTextLineVerticals in the plane.
|
||||||
|
|
||||||
|
Returns a list of other LTTextLineVerticals in the plane which are
|
||||||
|
close to self. "Close" can be controlled by ratio. The returned objects
|
||||||
|
will be the same width as self, and also either upper-, lower-, or
|
||||||
|
centrally-aligned.
|
||||||
|
"""
|
||||||
d = ratio * self.width
|
d = ratio * self.width
|
||||||
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
|
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
|
||||||
return [obj for obj in objs
|
return [obj for obj in objs
|
||||||
if (isinstance(obj, LTTextLineVertical) and
|
if (isinstance(obj, LTTextLineVertical) and
|
||||||
abs(obj.width-self.width) < d and
|
self._is_same_width_as(obj, tolerance=d) and
|
||||||
(abs(obj.y0-self.y0) < d or
|
(self._is_lower_aligned_with(obj, tolerance=d) or
|
||||||
abs(obj.y1-self.y1) < d))]
|
self._is_upper_aligned_with(obj, tolerance=d) or
|
||||||
|
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||||
|
|
||||||
|
def _is_lower_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the lower edge of `other` is within `tolerance` of the lower edge of `self`.
|
||||||
|
"""
|
||||||
|
return abs(other.y0 - self.y0) <= tolerance
|
||||||
|
|
||||||
|
def _is_upper_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the upper edge of `other` is within `tolerance` of the upper edge of `self`.
|
||||||
|
"""
|
||||||
|
return abs(other.y1 - self.y1) <= tolerance
|
||||||
|
|
||||||
|
def _is_centrally_aligned_with(self, other, tolerance=0):
|
||||||
|
"""
|
||||||
|
Whether the vertical center of `other` is within `tolerance` of the vertical center of `self`.
|
||||||
|
"""
|
||||||
|
return abs(
|
||||||
|
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
||||||
|
|
||||||
|
def _is_same_width_as(self, other, tolerance):
|
||||||
|
return abs(other.width - self.width) <= tolerance
|
||||||
|
|
||||||
|
|
||||||
class LTTextBox(LTTextContainer):
|
class LTTextBox(LTTextContainer):
|
||||||
|
|
|
@ -1,6 +1,12 @@
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
|
from pdfminer.layout import (
|
||||||
|
LTLayoutContainer,
|
||||||
|
LAParams,
|
||||||
|
LTTextLineHorizontal,
|
||||||
|
LTTextLineVertical,
|
||||||
|
)
|
||||||
|
from pdfminer.utils import Plane
|
||||||
|
|
||||||
|
|
||||||
class TestGroupTextLines(unittest.TestCase):
|
class TestGroupTextLines(unittest.TestCase):
|
||||||
|
@ -21,3 +27,85 @@ class TestGroupTextLines(unittest.TestCase):
|
||||||
textboxes = list(layout.group_textlines(laparams, lines))
|
textboxes = list(layout.group_textlines(laparams, lines))
|
||||||
|
|
||||||
self.assertEqual(len(textboxes), 2)
|
self.assertEqual(len(textboxes), 2)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindNeigbors(unittest.TestCase):
|
||||||
|
def test_find_neighbors_horizontal(self):
|
||||||
|
laparams = LAParams()
|
||||||
|
plane = Plane((0, 0, 50, 50))
|
||||||
|
|
||||||
|
line = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
line.set_bbox((10, 4, 20, 6))
|
||||||
|
plane.add(line)
|
||||||
|
|
||||||
|
left_aligned_above = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
left_aligned_above.set_bbox((10, 6, 15, 8))
|
||||||
|
plane.add(left_aligned_above)
|
||||||
|
|
||||||
|
right_aligned_below = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
right_aligned_below.set_bbox((15, 2, 20, 4))
|
||||||
|
plane.add(right_aligned_below)
|
||||||
|
|
||||||
|
centrally_aligned_overlapping = LTTextLineHorizontal(
|
||||||
|
laparams.word_margin)
|
||||||
|
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
|
||||||
|
plane.add(centrally_aligned_overlapping)
|
||||||
|
|
||||||
|
not_aligned = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
not_aligned.set_bbox((0, 6, 5, 8))
|
||||||
|
plane.add(not_aligned)
|
||||||
|
|
||||||
|
wrong_height = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
wrong_height.set_bbox((10, 6, 15, 10))
|
||||||
|
plane.add(wrong_height)
|
||||||
|
|
||||||
|
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||||
|
self.assertCountEqual(
|
||||||
|
neighbors,
|
||||||
|
[
|
||||||
|
line,
|
||||||
|
left_aligned_above,
|
||||||
|
right_aligned_below,
|
||||||
|
centrally_aligned_overlapping,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_find_neighbors_vertical(self):
|
||||||
|
laparams = LAParams()
|
||||||
|
plane = Plane((0, 0, 50, 50))
|
||||||
|
|
||||||
|
line = LTTextLineVertical(laparams.word_margin)
|
||||||
|
line.set_bbox((4, 10, 6, 20))
|
||||||
|
plane.add(line)
|
||||||
|
|
||||||
|
bottom_aligned_right = LTTextLineVertical(laparams.word_margin)
|
||||||
|
bottom_aligned_right.set_bbox((6, 10, 8, 15))
|
||||||
|
plane.add(bottom_aligned_right)
|
||||||
|
|
||||||
|
top_aligned_left = LTTextLineVertical(laparams.word_margin)
|
||||||
|
top_aligned_left.set_bbox((2, 15, 4, 20))
|
||||||
|
plane.add(top_aligned_left)
|
||||||
|
|
||||||
|
centrally_aligned_overlapping = LTTextLineVertical(
|
||||||
|
laparams.word_margin)
|
||||||
|
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
|
||||||
|
plane.add(centrally_aligned_overlapping)
|
||||||
|
|
||||||
|
not_aligned = LTTextLineVertical(laparams.word_margin)
|
||||||
|
not_aligned.set_bbox((6, 0, 8, 5))
|
||||||
|
plane.add(not_aligned)
|
||||||
|
|
||||||
|
wrong_width = LTTextLineVertical(laparams.word_margin)
|
||||||
|
wrong_width.set_bbox((6, 10, 10, 15))
|
||||||
|
plane.add(wrong_width)
|
||||||
|
|
||||||
|
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||||
|
self.assertCountEqual(
|
||||||
|
neighbors,
|
||||||
|
[
|
||||||
|
line,
|
||||||
|
bottom_aligned_right,
|
||||||
|
top_aligned_left,
|
||||||
|
centrally_aligned_overlapping,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
Loading…
Reference in New Issue