Speedup grouping of textboxes (#315)
Changed: use a heap instead of a SortedList and avoid rebuilding the heap in each iteration. Changed: avoid a potentially huge number of variable assignments in the list comprehension. Changed: avoid repeatedly evaluating `obj is obj` in the list comprehension by storing id(obj). pull/326/head
parent
6cc78ee124
commit
44b223cf0a
|
@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
### Changed
|
||||
- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
|
||||
- Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315))
|
||||
|
||||
### Removed
|
||||
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from sortedcontainers import SortedListWithKey
|
||||
|
||||
import heapq
|
||||
from .utils import INF
|
||||
from .utils import Plane
|
||||
from .utils import get_bound
|
||||
|
@ -603,9 +602,22 @@ class LTLayoutContainer(LTContainer):
|
|||
yield box
|
||||
return
|
||||
|
||||
# group_textboxes: group textboxes hierarchically.
|
||||
def group_textboxes(self, laparams, boxes):
|
||||
assert boxes, str((laparams, boxes))
|
||||
"""Group textboxes hierarchically.
|
||||
|
||||
Get pair-wise distances, via dist func defined below, and then merge from the closest textbox pair. Once
|
||||
obj1 and obj2 are merged / grouped, the resulting group is considered as a new object, and its distances to
|
||||
other objects & groups are added to the process queue.
|
||||
|
||||
For performance reasons, pair-wise distances and object pair info are maintained in a heap of
|
||||
(is_first, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures quick access to the smallest element. Note that
|
||||
since comparison operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to appear before obj in
|
||||
element tuples.
|
||||
|
||||
:param laparams: LAParams object.
|
||||
:param boxes: All textbox objects to be grouped.
|
||||
:return: a list that has only one element, the final top level textbox.
|
||||
"""
|
||||
|
||||
def dist(obj1, obj2):
|
||||
"""A distance function between two TextBoxes.
|
||||
|
@ -626,8 +638,7 @@ class LTLayoutContainer(LTContainer):
|
|||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
|
||||
def isany(obj1, obj2):
|
||||
"""Check if there's any other object between obj1 and obj2.
|
||||
"""
|
||||
"""Check if there's any other object between obj1 and obj2."""
|
||||
x0 = min(obj1.x0, obj2.x0)
|
||||
y0 = min(obj1.y0, obj2.y0)
|
||||
x1 = max(obj1.x1, obj2.x1)
|
||||
|
@ -635,39 +646,36 @@ class LTLayoutContainer(LTContainer):
|
|||
objs = set(plane.find((x0, y0, x1, y1)))
|
||||
return objs.difference((obj1, obj2))
|
||||
|
||||
def key_obj(t):
|
||||
(c,d,_,_) = t
|
||||
return (c,d)
|
||||
|
||||
dists = SortedListWithKey(key=key_obj)
|
||||
dists = []
|
||||
for i in range(len(boxes)):
|
||||
obj1 = boxes[i]
|
||||
for j in range(i+1, len(boxes)):
|
||||
obj2 = boxes[j]
|
||||
dists.add((0, dist(obj1, obj2), obj1, obj2))
|
||||
dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
|
||||
heapq.heapify(dists)
|
||||
|
||||
plane = Plane(self.bbox)
|
||||
plane.extend(boxes)
|
||||
while dists:
|
||||
(c, d, obj1, obj2) = dists.pop(0)
|
||||
if c == 0 and isany(obj1, obj2):
|
||||
dists.add((1, d, obj1, obj2))
|
||||
continue
|
||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
||||
group = LTTextGroupTBRL([obj1, obj2])
|
||||
else:
|
||||
group = LTTextGroupLRTB([obj1, obj2])
|
||||
plane.remove(obj1)
|
||||
plane.remove(obj2)
|
||||
removed = [obj1, obj2]
|
||||
to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
||||
if (obj1 in removed or obj2 in removed) ]
|
||||
for r in to_remove:
|
||||
dists.remove(r)
|
||||
for other in plane:
|
||||
dists.add((0, dist(group, other), group, other))
|
||||
plane.add(group)
|
||||
assert len(plane) == 1, str(len(plane))
|
||||
done = set()
|
||||
while len(dists) > 0:
|
||||
(is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
|
||||
# Skip objects that are already merged
|
||||
if (id1 not in done) and (id2 not in done):
|
||||
if is_first and isany(obj1, obj2):
|
||||
heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
|
||||
continue
|
||||
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
||||
group = LTTextGroupTBRL([obj1, obj2])
|
||||
else:
|
||||
group = LTTextGroupLRTB([obj1, obj2])
|
||||
plane.remove(obj1)
|
||||
plane.remove(obj2)
|
||||
done.update([id1, id2])
|
||||
|
||||
for other in plane:
|
||||
heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other))
|
||||
plane.add(group)
|
||||
return list(plane)
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
|
Loading…
Reference in New Issue