There are a number of relatively complex changes here. The comments below are listed in the order in which the changes appear in the diff.
1. When detecting text in a horizontal line, we already add a space between words if they are separated by more than word_margin. Now we only do so if there is not already an existing space, which prevents multiple spaces being placed between words. 2. Detect a horizontal line if the line is zero width. This improves our detection of horizontal lines when looking for both horizontal and vertical lines. 3. Don't detect a vertical line if the previous letter is whitespace. This prevents double spaces being caught as vertical lines. 4. Improve upon an unfortunate O(N^2) algorithm which I have seen take many minutes to execute. Unfortunately, while the "fix" reduces algorithmic complexity, it isn't technically correct, so we only apply it when we know things will take a long time.
pull/55/head
parent
c32550dd4a
commit
45170e7183
|
@ -368,8 +368,16 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and self.word_margin:
|
||||||
|
# Add a space between words if separated by more than word_margin
|
||||||
|
# apart. Use the max of obj width and height so narrow letters
|
||||||
|
# (i, l) are treated like wider letters, reducing extra spaces
|
||||||
margin = self.word_margin * max(obj.width, obj.height)
|
margin = self.word_margin * max(obj.width, obj.height)
|
||||||
if self._x1 < obj.x0-margin:
|
if self._x1 < obj.x0-margin:
|
||||||
|
# But only do it if there is not already a space.
|
||||||
|
last_was_alpha = self._objs and \
|
||||||
|
isinstance(self._objs[-1], LTChar) and \
|
||||||
|
self._objs[-1]._text == ' '
|
||||||
|
if not last_was_alpha:
|
||||||
LTContainer.add(self, LTAnno(' '))
|
LTContainer.add(self, LTAnno(' '))
|
||||||
self._x1 = obj.x1
|
self._x1 = obj.x1
|
||||||
LTTextLine.add(self, obj)
|
LTTextLine.add(self, obj)
|
||||||
|
@ -513,7 +521,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
||||||
obj0.voverlap(obj1)) and
|
obj0.voverlap(obj1)) and
|
||||||
(obj0.hdistance(obj1) <
|
(obj0.hdistance(obj1) <
|
||||||
max(obj0.width, obj1.width) * laparams.char_margin))
|
max(obj0.width, obj1.width) * laparams.char_margin) or
|
||||||
|
# If the line is zero width, default to horizontal
|
||||||
|
(max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0))
|
||||||
|
|
||||||
# valign: obj0 and obj1 is vertically aligned.
|
# valign: obj0 and obj1 is vertically aligned.
|
||||||
#
|
#
|
||||||
|
@ -535,7 +545,10 @@ class LTLayoutContainer(LTContainer):
|
||||||
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
||||||
obj0.hoverlap(obj1)) and
|
obj0.hoverlap(obj1)) and
|
||||||
(obj0.vdistance(obj1) <
|
(obj0.vdistance(obj1) <
|
||||||
max(obj0.height, obj1.height) * laparams.char_margin))
|
max(obj0.height, obj1.height) * laparams.char_margin) and
|
||||||
|
# Don't start a vertical line if the previous letter is
|
||||||
|
# whitspace. Prevents double spaces being caught as vert lines.
|
||||||
|
(line or obj0._text.strip()))
|
||||||
|
|
||||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||||
(valign and isinstance(line, LTTextLineVertical))):
|
(valign and isinstance(line, LTTextLineVertical))):
|
||||||
|
@ -631,8 +644,19 @@ class LTLayoutContainer(LTContainer):
|
||||||
(c,d,_,_) = t
|
(c,d,_,_) = t
|
||||||
return (c,d)
|
return (c,d)
|
||||||
|
|
||||||
# XXX this still takes O(n^2) :(
|
# The algorithm below still takes O(n^2) :(
|
||||||
|
# For now, if we have many boxes, split them into two and perform them
|
||||||
|
# separately. This will cause bugs, but will prevent hanging.
|
||||||
|
if len(boxes) > 100:
|
||||||
|
boxes = sorted(boxes, key=lambda obj: obj.y0)
|
||||||
|
# Divide in two, then perform grouping
|
||||||
|
# print "Making Recursive Call %d"%len(boxes)
|
||||||
|
bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2])
|
||||||
|
top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:])
|
||||||
|
boxes = bot_boxes + top_boxes
|
||||||
|
|
||||||
dists = []
|
dists = []
|
||||||
|
# Calculate the distance between each box
|
||||||
for i in xrange(len(boxes)):
|
for i in xrange(len(boxes)):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in xrange(i+1, len(boxes)):
|
for j in xrange(i+1, len(boxes)):
|
||||||
|
@ -642,16 +666,20 @@ class LTLayoutContainer(LTContainer):
|
||||||
dists = csort(dists, key=key_obj)
|
dists = csort(dists, key=key_obj)
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
|
# Start with the two closest objects
|
||||||
while dists:
|
while dists:
|
||||||
(c, d, obj1, obj2) = dists.pop(0)
|
(c, d, obj1, obj2) = dists.pop(0)
|
||||||
|
# If there are any objects in between, then skip these two
|
||||||
if c == 0 and isany(obj1, obj2):
|
if c == 0 and isany(obj1, obj2):
|
||||||
dists.append((1, d, obj1, obj2))
|
dists.append((1, d, obj1, obj2))
|
||||||
continue
|
continue
|
||||||
|
# Group these two closest objects
|
||||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
||||||
group = LTTextGroupTBRL([obj1, obj2])
|
group = LTTextGroupTBRL([obj1, obj2])
|
||||||
else:
|
else:
|
||||||
group = LTTextGroupLRTB([obj1, obj2])
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
|
# Remove the two individual objects
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
plane.remove(obj2)
|
plane.remove(obj2)
|
||||||
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
||||||
|
|
Loading…
Reference in New Issue