Speedup grouping of textboxes (#315)

Changed: use a heap instead of a SortedList, and avoid rebuilding the heap in each iteration.
Changed: avoid a potentially huge number of variable assignments in the list comprehension.
Changed: avoid repeatedly evaluating `obj is obj` in the list comprehension by storing id(obj).
pull/326/head
Jianfeng 2019-10-31 16:22:58 +08:00 committed by Pieter Marsman
parent 6cc78ee124
commit 44b223cf0a
2 changed files with 42 additions and 33 deletions

View File

@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed
- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
- Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315))
### Removed
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))

View File

@ -1,5 +1,4 @@
from sortedcontainers import SortedListWithKey
import heapq
from .utils import INF
from .utils import Plane
from .utils import get_bound
@ -603,9 +602,22 @@ class LTLayoutContainer(LTContainer):
yield box
return
# group_textboxes: group textboxes hierarchically.
def group_textboxes(self, laparams, boxes):
assert boxes, str((laparams, boxes))
"""Group textboxes hierarchically.
Get pair-wise distances, via dist func defined below, and then merge from the closest textbox pair. Once
obj1 and obj2 are merged / grouped, the resulting group is considered as a new object, and its distances to
other objects & groups are added to the process queue.
For performance reason, pair-wise distances and object pair info are maintained in a heap of
(idx, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures quick access to the smallest element. Note that
since comparison operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to appear before obj in
element tuples.
:param laparams: LAParams object.
:param boxes: All textbox objects to be grouped.
:return: a list that has only one element, the final top level textbox.
"""
def dist(obj1, obj2):
"""A distance function between two TextBoxes.
@ -626,8 +638,7 @@ class LTLayoutContainer(LTContainer):
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2.
"""
"""Check if there's any other object between obj1 and obj2."""
x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
@ -635,39 +646,36 @@ class LTLayoutContainer(LTContainer):
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))
def key_obj(t):
(c,d,_,_) = t
return (c,d)
dists = SortedListWithKey(key=key_obj)
dists = []
for i in range(len(boxes)):
obj1 = boxes[i]
for j in range(i+1, len(boxes)):
obj2 = boxes[j]
dists.add((0, dist(obj1, obj2), obj1, obj2))
dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
heapq.heapify(dists)
plane = Plane(self.bbox)
plane.extend(boxes)
while dists:
(c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2):
dists.add((1, d, obj1, obj2))
done = set()
while len(dists) > 0:
(is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
# Skip objects that are already merged
if (id1 not in done) and (id2 not in done):
if is_first and isany(obj1, obj2):
heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
continue
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
removed = [obj1, obj2]
to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in removed or obj2 in removed) ]
for r in to_remove:
dists.remove(r)
done.update([id1, id2])
for other in plane:
dists.add((0, dist(group, other), group, other))
heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other))
plane.add(group)
assert len(plane) == 1, str(len(plane))
return list(plane)
def analyze(self, laparams):