There are a number of relatively complex changes here. Comments are in order of where the change appears.
1. When detecting text in a horizontal line, we already add a space between words if separated by more than word_margin apart. However now, we only do it if there is not already an existing space. This prevents multiple spaces being placed between words. 2. Detect a horizontal line if the line is zero width. This improves our detection of horizonal lines when looking for both horizontal and vertical. 3. Don't detect a vertical line if the previous letter is whitspace. Prevents double spaces being caught as vert lines. 4. Improve upon an unfortunate O(N^2) algorithm which I have seen taking many minutes to execute. Unfortunately, while the "fix" reduces algorithmic complexity, it isn't technically correct, so we only do it when we know things will take a long time.pull/55/head
parent
c32550dd4a
commit
45170e7183
|
@ -368,8 +368,16 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
|
||||
def add(self, obj):
|
||||
if isinstance(obj, LTChar) and self.word_margin:
|
||||
# Add a space between words if separated by more than word_margin
|
||||
# apart. Use the max of obj width and height so narrow letters
|
||||
# (i, l) are treated like wider letters, reducing extra spaces
|
||||
margin = self.word_margin * max(obj.width, obj.height)
|
||||
if self._x1 < obj.x0-margin:
|
||||
# But only do it if there is not already a space.
|
||||
last_was_alpha = self._objs and \
|
||||
isinstance(self._objs[-1], LTChar) and \
|
||||
self._objs[-1]._text == ' '
|
||||
if not last_was_alpha:
|
||||
LTContainer.add(self, LTAnno(' '))
|
||||
self._x1 = obj.x1
|
||||
LTTextLine.add(self, obj)
|
||||
|
@ -513,7 +521,9 @@ class LTLayoutContainer(LTContainer):
|
|||
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
||||
obj0.voverlap(obj1)) and
|
||||
(obj0.hdistance(obj1) <
|
||||
max(obj0.width, obj1.width) * laparams.char_margin))
|
||||
max(obj0.width, obj1.width) * laparams.char_margin) or
|
||||
# If the line is zero width, default to horizontal
|
||||
(max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0))
|
||||
|
||||
# valign: obj0 and obj1 is vertically aligned.
|
||||
#
|
||||
|
@ -535,7 +545,10 @@ class LTLayoutContainer(LTContainer):
|
|||
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
||||
obj0.hoverlap(obj1)) and
|
||||
(obj0.vdistance(obj1) <
|
||||
max(obj0.height, obj1.height) * laparams.char_margin))
|
||||
max(obj0.height, obj1.height) * laparams.char_margin) and
|
||||
# Don't start a vertical line if the previous letter is
|
||||
# whitspace. Prevents double spaces being caught as vert lines.
|
||||
(line or obj0._text.strip()))
|
||||
|
||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||
(valign and isinstance(line, LTTextLineVertical))):
|
||||
|
@ -631,8 +644,19 @@ class LTLayoutContainer(LTContainer):
|
|||
(c,d,_,_) = t
|
||||
return (c,d)
|
||||
|
||||
# XXX this still takes O(n^2) :(
|
||||
# The algorithm below still takes O(n^2) :(
|
||||
# For now, if we have many boxes, split them into two and perform them
|
||||
# separately. This will cause bugs, but will prevent hanging.
|
||||
if len(boxes) > 100:
|
||||
boxes = sorted(boxes, key=lambda obj: obj.y0)
|
||||
# Divide in two, then perform grouping
|
||||
# print "Making Recursive Call %d"%len(boxes)
|
||||
bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2])
|
||||
top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:])
|
||||
boxes = bot_boxes + top_boxes
|
||||
|
||||
dists = []
|
||||
# Calculate the distance between each box
|
||||
for i in xrange(len(boxes)):
|
||||
obj1 = boxes[i]
|
||||
for j in xrange(i+1, len(boxes)):
|
||||
|
@ -642,16 +666,20 @@ class LTLayoutContainer(LTContainer):
|
|||
dists = csort(dists, key=key_obj)
|
||||
plane = Plane(self.bbox)
|
||||
plane.extend(boxes)
|
||||
# Start with the two closest objects
|
||||
while dists:
|
||||
(c, d, obj1, obj2) = dists.pop(0)
|
||||
# If there are any objects in between, then skip these two
|
||||
if c == 0 and isany(obj1, obj2):
|
||||
dists.append((1, d, obj1, obj2))
|
||||
continue
|
||||
# Group these two closest objects
|
||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
||||
group = LTTextGroupTBRL([obj1, obj2])
|
||||
else:
|
||||
group = LTTextGroupLRTB([obj1, obj2])
|
||||
# Remove the two individual objects
|
||||
plane.remove(obj1)
|
||||
plane.remove(obj2)
|
||||
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
||||
|
|
Loading…
Reference in New Issue