There are a number of relatively complex changes here. Comments are in order of where the change appears.

1.
When detecting text in a horizontal line, we already add a space between words if separated by more than word_margin apart.  However now, we only do it if there is not already an existing space. This prevents multiple spaces being placed between words.

2.
Detect a horizontal line if the line is zero width. This improves our detection of horizonal lines when looking for both horizontal and vertical.

3.
Don't detect a vertical line if the previous letter is whitspace. Prevents double spaces being caught as vert lines.

4.
Improve upon an unfortunate O(N^2) algorithm which I have seen taking many minutes to execute.  Unfortunately, while the "fix" reduces algorithmic complexity, it isn't technically correct, so we only do it when we know things will take a long time.
pull/55/head
speedplane 2014-11-11 23:49:13 -05:00
parent c32550dd4a
commit 45170e7183
1 changed files with 32 additions and 4 deletions

View File

@ -368,9 +368,17 @@ class LTTextLineHorizontal(LTTextLine):
def add(self, obj):
if isinstance(obj, LTChar) and self.word_margin:
# Add a space between words if separated by more than word_margin
# apart. Use the max of obj width and height so narrow letters
# (i, l) are treated like wider letters, reducing extra spaces
margin = self.word_margin * max(obj.width, obj.height)
if self._x1 < obj.x0-margin:
LTContainer.add(self, LTAnno(' '))
# But only do it if there is not already a space.
last_was_alpha = self._objs and \
isinstance(self._objs[-1], LTChar) and \
self._objs[-1]._text == ' '
if not last_was_alpha:
LTContainer.add(self, LTAnno(' '))
self._x1 = obj.x1
LTTextLine.add(self, obj)
return
@ -513,7 +521,9 @@ class LTLayoutContainer(LTContainer):
(min(obj0.height, obj1.height) * laparams.line_overlap <
obj0.voverlap(obj1)) and
(obj0.hdistance(obj1) <
max(obj0.width, obj1.width) * laparams.char_margin))
max(obj0.width, obj1.width) * laparams.char_margin) or
# If the line is zero width, default to horizontal
(max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0))
# valign: obj0 and obj1 is vertically aligned.
#
@ -535,7 +545,10 @@ class LTLayoutContainer(LTContainer):
(min(obj0.width, obj1.width) * laparams.line_overlap <
obj0.hoverlap(obj1)) and
(obj0.vdistance(obj1) <
max(obj0.height, obj1.height) * laparams.char_margin))
max(obj0.height, obj1.height) * laparams.char_margin) and
# Don't start a vertical line if the previous letter is
# whitspace. Prevents double spaces being caught as vert lines.
(line or obj0._text.strip()))
if ((halign and isinstance(line, LTTextLineHorizontal)) or
(valign and isinstance(line, LTTextLineVertical))):
@ -631,8 +644,19 @@ class LTLayoutContainer(LTContainer):
(c,d,_,_) = t
return (c,d)
# XXX this still takes O(n^2) :(
# The algorithm below still takes O(n^2) :(
# For now, if we have many boxes, split them into two and perform them
# separately. This will cause bugs, but will prevent hanging.
if len(boxes) > 100:
boxes = sorted(boxes, key=lambda obj: obj.y0)
# Divide in two, then perform grouping
# print "Making Recursive Call %d"%len(boxes)
bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2])
top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:])
boxes = bot_boxes + top_boxes
dists = []
# Calculate the distance between each box
for i in xrange(len(boxes)):
obj1 = boxes[i]
for j in xrange(i+1, len(boxes)):
@ -642,16 +666,20 @@ class LTLayoutContainer(LTContainer):
dists = csort(dists, key=key_obj)
plane = Plane(self.bbox)
plane.extend(boxes)
# Start with the two closest objects
while dists:
(c, d, obj1, obj2) = dists.pop(0)
# If there are any objects in between, then skip these two
if c == 0 and isany(obj1, obj2):
dists.append((1, d, obj1, obj2))
continue
# Group these two closest objects
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1, obj2])
# Remove the two individual objects
plane.remove(obj1)
plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists