From 44b223cf0a3b29bc1f0c3ece71c483a22b71614d Mon Sep 17 00:00:00 2001 From: Jianfeng Date: Thu, 31 Oct 2019 16:22:58 +0800 Subject: [PATCH] Speedup grouping of textboxes (#315) Changed: using a heap instead of a SortedList and avoid rebuilding the heap in each iteration Changed: avoid potentially huge number of variable assignments in list comprehension. Changed: avoid repeatedly evaluating `obj is obj` in list comprehension by storing id(obj). --- CHANGELOG.md | 1 + pdfminer/layout.py | 74 +++++++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6595543..e79fdcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed - Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321)) +- Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315)) ### Removed - Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314)) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index ba9a2bf..4ae7822 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,5 +1,4 @@ -from sortedcontainers import SortedListWithKey - +import heapq from .utils import INF from .utils import Plane from .utils import get_bound @@ -603,9 +602,22 @@ class LTLayoutContainer(LTContainer): yield box return - # group_textboxes: group textboxes hierarchically. def group_textboxes(self, laparams, boxes): - assert boxes, str((laparams, boxes)) + """Group textboxes hierarchically. + + Get pair-wise distances, via dist func defined below, and then merge from the closest textbox pair. 
Once + obj1 and obj2 are merged / grouped, the resulting group is considered as a new object, and its distances to + other objects & groups are added to the process queue. + + For performance reasons, pair-wise distances and object pair info are maintained in a heap of + (is_first, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures quick access to the smallest element. Note that + since comparison operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to appear before obj in + element tuples. + + :param laparams: LAParams object. + :param boxes: All textbox objects to be grouped. + :return: a list that has only one element, the final top level textbox. + """ def dist(obj1, obj2): """A distance function between two TextBoxes. @@ -626,8 +638,7 @@ class LTLayoutContainer(LTContainer): return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) def isany(obj1, obj2): - """Check if there's any other object between obj1 and obj2. - """ + """Check if there's any other object between obj1 and obj2.""" x0 = min(obj1.x0, obj2.x0) y0 = min(obj1.y0, obj2.y0) x1 = max(obj1.x1, obj2.x1) @@ -635,39 +646,36 @@ class LTLayoutContainer(LTContainer): objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) - def key_obj(t): - (c,d,_,_) = t - return (c,d) - - dists = SortedListWithKey(key=key_obj) + dists = [] for i in range(len(boxes)): obj1 = boxes[i] for j in range(i+1, len(boxes)): obj2 = boxes[j] - dists.add((0, dist(obj1, obj2), obj1, obj2)) + dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2)) + heapq.heapify(dists) + plane = Plane(self.bbox) plane.extend(boxes) - while dists: - (c, d, obj1, obj2) = dists.pop(0) - if c == 0 and isany(obj1, obj2): - dists.add((1, d, obj1, obj2)) - continue - if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or - isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): - group = LTTextGroupTBRL([obj1, obj2]) - else: - group = LTTextGroupLRTB([obj1, obj2]) 
plane.remove(obj1) - plane.remove(obj2) - removed = [obj1, obj2] - to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists - if (obj1 in removed or obj2 in removed) ] - for r in to_remove: - dists.remove(r) - for other in plane: - dists.add((0, dist(group, other), group, other)) - plane.add(group) - assert len(plane) == 1, str(len(plane)) + done = set() + while len(dists) > 0: + (is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists) + # Skip objects that are already merged + if (id1 not in done) and (id2 not in done): + if is_first and isany(obj1, obj2): + heapq.heappush(dists, (False, d, id1, id2, obj1, obj2)) + continue + if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \ + isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)): + group = LTTextGroupTBRL([obj1, obj2]) + else: + group = LTTextGroupLRTB([obj1, obj2]) + plane.remove(obj1) + plane.remove(obj2) + done.update([id1, id2]) + + for other in plane: + heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other)) + plane.add(group) return list(plane) def analyze(self, laparams):