From 45170e718396502c45d13cb44bfde2b6f95e576f Mon Sep 17 00:00:00 2001 From: speedplane Date: Tue, 11 Nov 2014 23:49:13 -0500 Subject: [PATCH] There are a number of relatively complex changes here. Comments are in order of where the change appears. 1. When detecting text in a horizontal line, we already add a space between words if separated by more than word_margin apart. However now, we only do it if there is not already an existing space. This prevents multiple spaces being placed between words. 2. Detect a horizontal line if the line is zero width. This improves our detection of horizonal lines when looking for both horizontal and vertical. 3. Don't detect a vertical line if the previous letter is whitspace. Prevents double spaces being caught as vert lines. 4. Improve upon an unfortunate O(N^2) algorithm which I have seen taking many minutes to execute. Unfortunately, while the "fix" reduces algorithmic complexity, it isn't technically correct, so we only do it when we know things will take a long time. --- pdfminer/layout.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 32b706f..b2dde8c 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -368,9 +368,17 @@ class LTTextLineHorizontal(LTTextLine): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: + # Add a space between words if separated by more than word_margin + # apart. Use the max of obj width and height so narrow letters + # (i, l) are treated like wider letters, reducing extra spaces margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0-margin: - LTContainer.add(self, LTAnno(' ')) + # But only do it if there is not already a space. + last_was_alpha = self._objs and \ + isinstance(self._objs[-1], LTChar) and \ + self._objs[-1]._text == ' ' + if not last_was_alpha: + LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 LTTextLine.add(self, obj) return @@ -513,7 +521,9 @@ class LTLayoutContainer(LTContainer): (min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1)) and (obj0.hdistance(obj1) < - max(obj0.width, obj1.width) * laparams.char_margin)) + max(obj0.width, obj1.width) * laparams.char_margin) or + # If the line is zero width, default to horizontal + (max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0)) # valign: obj0 and obj1 is vertically aligned. # @@ -535,7 +545,10 @@ class LTLayoutContainer(LTContainer): (min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1)) and (obj0.vdistance(obj1) < - max(obj0.height, obj1.height) * laparams.char_margin)) + max(obj0.height, obj1.height) * laparams.char_margin) and + # Don't start a vertical line if the previous letter is + # whitspace. Prevents double spaces being caught as vert lines. + (line or obj0._text.strip())) if ((halign and isinstance(line, LTTextLineHorizontal)) or (valign and isinstance(line, LTTextLineVertical))): @@ -631,8 +644,19 @@ class LTLayoutContainer(LTContainer): (c,d,_,_) = t return (c,d) - # XXX this still takes O(n^2) :( + # The algorithm below still takes O(n^2) :( + # For now, if we have many boxes, split them into two and perform them + # separately. This will cause bugs, but will prevent hanging. + if len(boxes) > 100: + boxes = sorted(boxes, key=lambda obj: obj.y0) + # Divide in two, then perform grouping + # print "Making Recursive Call %d"%len(boxes) + bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2]) + top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:]) + boxes = bot_boxes + top_boxes + dists = [] + # Calculate the distance between each box for i in xrange(len(boxes)): obj1 = boxes[i] for j in xrange(i+1, len(boxes)): @@ -642,16 +666,20 @@ class LTLayoutContainer(LTContainer): dists = csort(dists, key=key_obj) plane = Plane(self.bbox) plane.extend(boxes) + # Start with the two closest objects while dists: (c, d, obj1, obj2) = dists.pop(0) + # If there are any objects in between, then skip these two if c == 0 and isany(obj1, obj2): dists.append((1, d, obj1, obj2)) continue + # Group these two closest objects if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) + # Remove the two individual objects plane.remove(obj1) plane.remove(obj2) dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists