Speedup grouping of textboxes (#315)
Changed: using a heap instead of a SortedList and avoid rebuilding the heap in each iteration. Changed: avoid potentially huge number of variable assignments in list comprehension. Changed: avoid repeatedly evaluating `obj is obj` in list comprehension by storing id(obj). pull/326/head
parent
6cc78ee124
commit
44b223cf0a
|
@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
|
- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
|
||||||
|
- Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315))
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
|
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from sortedcontainers import SortedListWithKey
|
import heapq
|
||||||
|
|
||||||
from .utils import INF
|
from .utils import INF
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
from .utils import get_bound
|
from .utils import get_bound
|
||||||
|
@ -603,9 +602,22 @@ class LTLayoutContainer(LTContainer):
|
||||||
yield box
|
yield box
|
||||||
return
|
return
|
||||||
|
|
||||||
# group_textboxes: group textboxes hierarchically.
|
|
||||||
def group_textboxes(self, laparams, boxes):
|
def group_textboxes(self, laparams, boxes):
|
||||||
assert boxes, str((laparams, boxes))
|
"""Group textboxes hierarchically.
|
||||||
|
|
||||||
|
Get pair-wise distances, via dist func defined below, and then merge from the closest textbox pair. Once
|
||||||
|
obj1 and obj2 are merged / grouped, the resulting group is considered as a new object, and its distances to
|
||||||
|
other objects & groups are added to the process queue.
|
||||||
|
|
||||||
|
For performance reasons, pair-wise distances and object pair info are maintained in a heap of
|
||||||
|
(idx, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures quick access to the smallest element. Note that
|
||||||
|
since comparison operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to appear before obj in
|
||||||
|
element tuples.
|
||||||
|
|
||||||
|
:param laparams: LAParams object.
|
||||||
|
:param boxes: All textbox objects to be grouped.
|
||||||
|
:return: a list that has only one element, the final top level textbox.
|
||||||
|
"""
|
||||||
|
|
||||||
def dist(obj1, obj2):
|
def dist(obj1, obj2):
|
||||||
"""A distance function between two TextBoxes.
|
"""A distance function between two TextBoxes.
|
||||||
|
@ -626,8 +638,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
|
|
||||||
def isany(obj1, obj2):
|
def isany(obj1, obj2):
|
||||||
"""Check if there's any other object between obj1 and obj2.
|
"""Check if there's any other object between obj1 and obj2."""
|
||||||
"""
|
|
||||||
x0 = min(obj1.x0, obj2.x0)
|
x0 = min(obj1.x0, obj2.x0)
|
||||||
y0 = min(obj1.y0, obj2.y0)
|
y0 = min(obj1.y0, obj2.y0)
|
||||||
x1 = max(obj1.x1, obj2.x1)
|
x1 = max(obj1.x1, obj2.x1)
|
||||||
|
@ -635,39 +646,36 @@ class LTLayoutContainer(LTContainer):
|
||||||
objs = set(plane.find((x0, y0, x1, y1)))
|
objs = set(plane.find((x0, y0, x1, y1)))
|
||||||
return objs.difference((obj1, obj2))
|
return objs.difference((obj1, obj2))
|
||||||
|
|
||||||
def key_obj(t):
|
dists = []
|
||||||
(c,d,_,_) = t
|
|
||||||
return (c,d)
|
|
||||||
|
|
||||||
dists = SortedListWithKey(key=key_obj)
|
|
||||||
for i in range(len(boxes)):
|
for i in range(len(boxes)):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in range(i+1, len(boxes)):
|
for j in range(i+1, len(boxes)):
|
||||||
obj2 = boxes[j]
|
obj2 = boxes[j]
|
||||||
dists.add((0, dist(obj1, obj2), obj1, obj2))
|
dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
|
||||||
|
heapq.heapify(dists)
|
||||||
|
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
while dists:
|
done = set()
|
||||||
(c, d, obj1, obj2) = dists.pop(0)
|
while len(dists) > 0:
|
||||||
if c == 0 and isany(obj1, obj2):
|
(is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
|
||||||
dists.add((1, d, obj1, obj2))
|
# Skip objects that are already merged
|
||||||
continue
|
if (id1 not in done) and (id2 not in done):
|
||||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
if is_first and isany(obj1, obj2):
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
|
||||||
group = LTTextGroupTBRL([obj1, obj2])
|
continue
|
||||||
else:
|
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
||||||
group = LTTextGroupLRTB([obj1, obj2])
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
||||||
plane.remove(obj1)
|
group = LTTextGroupTBRL([obj1, obj2])
|
||||||
plane.remove(obj2)
|
else:
|
||||||
removed = [obj1, obj2]
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
plane.remove(obj1)
|
||||||
if (obj1 in removed or obj2 in removed) ]
|
plane.remove(obj2)
|
||||||
for r in to_remove:
|
done.update([id1, id2])
|
||||||
dists.remove(r)
|
|
||||||
for other in plane:
|
for other in plane:
|
||||||
dists.add((0, dist(group, other), group, other))
|
heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other))
|
||||||
plane.add(group)
|
plane.add(group)
|
||||||
assert len(plane) == 1, str(len(plane))
|
|
||||||
return list(plane)
|
return list(plane)
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
|
|
Loading…
Reference in New Issue