Also group center-aligned text lines in addition to left-aligned and right-aligned text lines (#382) (#384)
* Group text lines if they are centered (#382) Closes #382 * Add comparison private methods to LTTextLines * Add missing docstrings * Add tests for find_neighbors * Update changelog * Cosmetic changes from code review
pull/393/head
parent
9d7fe2d9ee
commit
1cc1b961c5
|
@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||
|
||||
### Changed
|
||||
- Group text lines if they are centered ([#382](https://github.com/pdfminer/pdfminer.six/pull/382))
|
||||
|
||||
## [20200124] - 2020-01-24
|
||||
|
||||
### Security
|
||||
|
|
|
@ -409,20 +409,51 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
def add(self, obj):
|
||||
if isinstance(obj, LTChar) and self.word_margin:
|
||||
margin = self.word_margin * max(obj.width, obj.height)
|
||||
if self._x1 < obj.x0-margin:
|
||||
if self._x1 < obj.x0 - margin:
|
||||
LTContainer.add(self, LTAnno(' '))
|
||||
self._x1 = obj.x1
|
||||
LTTextLine.add(self, obj)
|
||||
return
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
d = ratio*self.height
|
||||
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
||||
"""
|
||||
Finds neighboring LTTextLineHorizontals in the plane.
|
||||
|
||||
Returns a list of other LTTextLineHorizontals in the plane which are
|
||||
close to self. "Close" can be controlled by ratio. The returned objects
|
||||
will be the same height as self, and also either left-, right-, or
|
||||
centrally-aligned.
|
||||
"""
|
||||
d = ratio * self.height
|
||||
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineHorizontal) and
|
||||
abs(obj.height-self.height) < d and
|
||||
(abs(obj.x0-self.x0) < d or
|
||||
abs(obj.x1-self.x1) < d))]
|
||||
self._is_same_height_as(obj, tolerance=d) and
|
||||
(self._is_left_aligned_with(obj, tolerance=d) or
|
||||
self._is_right_aligned_with(obj, tolerance=d) or
|
||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||
|
||||
def _is_left_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the left-hand edge of `other` is within `tolerance` of the left-hand edge of `self`.
|
||||
"""
|
||||
return abs(other.x0 - self.x0) <= tolerance
|
||||
|
||||
def _is_right_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the right-hand edge of `other` is within `tolerance` of the right-hand edge of `self`.
|
||||
"""
|
||||
return abs(other.x1 - self.x1) <= tolerance
|
||||
|
||||
def _is_centrally_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the horizontal center of `other` is within `tolerance` of the horizontal center of `self`.
|
||||
"""
|
||||
return abs(
|
||||
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
||||
|
||||
def _is_same_height_as(self, other, tolerance):
|
||||
return abs(other.height - self.height) <= tolerance
|
||||
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
@ -434,20 +465,51 @@ class LTTextLineVertical(LTTextLine):
|
|||
def add(self, obj):
|
||||
if isinstance(obj, LTChar) and self.word_margin:
|
||||
margin = self.word_margin * max(obj.width, obj.height)
|
||||
if obj.y1+margin < self._y0:
|
||||
if obj.y1 + margin < self._y0:
|
||||
LTContainer.add(self, LTAnno(' '))
|
||||
self._y0 = obj.y0
|
||||
LTTextLine.add(self, obj)
|
||||
return
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
d = ratio*self.width
|
||||
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
|
||||
"""
|
||||
Finds neighboring LTTextLineVerticals in the plane.
|
||||
|
||||
Returns a list of other LTTextLineVerticals in the plane which are
|
||||
close to self. "Close" can be controlled by ratio. The returned objects
|
||||
will be the same width as self, and also either upper-, lower-, or
|
||||
centrally-aligned.
|
||||
"""
|
||||
d = ratio * self.width
|
||||
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineVertical) and
|
||||
abs(obj.width-self.width) < d and
|
||||
(abs(obj.y0-self.y0) < d or
|
||||
abs(obj.y1-self.y1) < d))]
|
||||
self._is_same_width_as(obj, tolerance=d) and
|
||||
(self._is_lower_aligned_with(obj, tolerance=d) or
|
||||
self._is_upper_aligned_with(obj, tolerance=d) or
|
||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||
|
||||
def _is_lower_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the lower edge of `other` is within `tolerance` of the lower edge of `self`.
|
||||
"""
|
||||
return abs(other.y0 - self.y0) <= tolerance
|
||||
|
||||
def _is_upper_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the upper edge of `other` is within `tolerance` of the upper edge of `self`.
|
||||
"""
|
||||
return abs(other.y1 - self.y1) <= tolerance
|
||||
|
||||
def _is_centrally_aligned_with(self, other, tolerance=0):
|
||||
"""
|
||||
Whether the vertical center of `other` is within `tolerance` of the vertical center of `self`.
|
||||
"""
|
||||
return abs(
|
||||
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
||||
|
||||
def _is_same_width_as(self, other, tolerance):
|
||||
return abs(other.width - self.width) <= tolerance
|
||||
|
||||
|
||||
class LTTextBox(LTTextContainer):
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
import unittest
|
||||
|
||||
from pdfminer.layout import LTLayoutContainer, LAParams, LTTextLineHorizontal
|
||||
from pdfminer.layout import (
|
||||
LTLayoutContainer,
|
||||
LAParams,
|
||||
LTTextLineHorizontal,
|
||||
LTTextLineVertical,
|
||||
)
|
||||
from pdfminer.utils import Plane
|
||||
|
||||
|
||||
class TestGroupTextLines(unittest.TestCase):
|
||||
|
@ -21,3 +27,85 @@ class TestGroupTextLines(unittest.TestCase):
|
|||
textboxes = list(layout.group_textlines(laparams, lines))
|
||||
|
||||
self.assertEqual(len(textboxes), 2)
|
||||
|
||||
|
||||
class TestFindNeigbors(unittest.TestCase):
|
||||
def test_find_neighbors_horizontal(self):
|
||||
laparams = LAParams()
|
||||
plane = Plane((0, 0, 50, 50))
|
||||
|
||||
line = LTTextLineHorizontal(laparams.word_margin)
|
||||
line.set_bbox((10, 4, 20, 6))
|
||||
plane.add(line)
|
||||
|
||||
left_aligned_above = LTTextLineHorizontal(laparams.word_margin)
|
||||
left_aligned_above.set_bbox((10, 6, 15, 8))
|
||||
plane.add(left_aligned_above)
|
||||
|
||||
right_aligned_below = LTTextLineHorizontal(laparams.word_margin)
|
||||
right_aligned_below.set_bbox((15, 2, 20, 4))
|
||||
plane.add(right_aligned_below)
|
||||
|
||||
centrally_aligned_overlapping = LTTextLineHorizontal(
|
||||
laparams.word_margin)
|
||||
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
|
||||
plane.add(centrally_aligned_overlapping)
|
||||
|
||||
not_aligned = LTTextLineHorizontal(laparams.word_margin)
|
||||
not_aligned.set_bbox((0, 6, 5, 8))
|
||||
plane.add(not_aligned)
|
||||
|
||||
wrong_height = LTTextLineHorizontal(laparams.word_margin)
|
||||
wrong_height.set_bbox((10, 6, 15, 10))
|
||||
plane.add(wrong_height)
|
||||
|
||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||
self.assertCountEqual(
|
||||
neighbors,
|
||||
[
|
||||
line,
|
||||
left_aligned_above,
|
||||
right_aligned_below,
|
||||
centrally_aligned_overlapping,
|
||||
],
|
||||
)
|
||||
|
||||
def test_find_neighbors_vertical(self):
|
||||
laparams = LAParams()
|
||||
plane = Plane((0, 0, 50, 50))
|
||||
|
||||
line = LTTextLineVertical(laparams.word_margin)
|
||||
line.set_bbox((4, 10, 6, 20))
|
||||
plane.add(line)
|
||||
|
||||
bottom_aligned_right = LTTextLineVertical(laparams.word_margin)
|
||||
bottom_aligned_right.set_bbox((6, 10, 8, 15))
|
||||
plane.add(bottom_aligned_right)
|
||||
|
||||
top_aligned_left = LTTextLineVertical(laparams.word_margin)
|
||||
top_aligned_left.set_bbox((2, 15, 4, 20))
|
||||
plane.add(top_aligned_left)
|
||||
|
||||
centrally_aligned_overlapping = LTTextLineVertical(
|
||||
laparams.word_margin)
|
||||
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
|
||||
plane.add(centrally_aligned_overlapping)
|
||||
|
||||
not_aligned = LTTextLineVertical(laparams.word_margin)
|
||||
not_aligned.set_bbox((6, 0, 8, 5))
|
||||
plane.add(not_aligned)
|
||||
|
||||
wrong_width = LTTextLineVertical(laparams.word_margin)
|
||||
wrong_width.set_bbox((6, 10, 10, 15))
|
||||
plane.add(wrong_width)
|
||||
|
||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||
self.assertCountEqual(
|
||||
neighbors,
|
||||
[
|
||||
line,
|
||||
bottom_aligned_right,
|
||||
top_aligned_left,
|
||||
centrally_aligned_overlapping,
|
||||
],
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue