better layout analysis

pull/1/head
Yusuke Shinyama 2011-02-14 23:41:23 +09:00
parent b2d13db29a
commit f00f1dbd04
1 changed files with 20 additions and 18 deletions

View File

@ -27,7 +27,7 @@ class LAParams(object):
char_margin=2.0, char_margin=2.0,
line_margin=0.5, line_margin=0.5,
word_margin=0.1, word_margin=0.1,
boxes_flow=0, boxes_flow=0.5,
detect_vertical=False, detect_vertical=False,
all_texts=False): all_texts=False):
self.line_overlap = line_overlap self.line_overlap = line_overlap
@ -528,9 +528,6 @@ class LTLayoutContainer(LTContainer):
def get_textboxes(self, laparams, lines): def get_textboxes(self, laparams, lines):
plane = Plane(lines) plane = Plane(lines)
for line in lines:
plane.add(line)
plane.finish()
boxes = {} boxes = {}
for line in lines: for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin) neighbors = line.find_neighbors(plane, laparams.line_margin)
@ -556,35 +553,40 @@ class LTLayoutContainer(LTContainer):
return return
def group_textboxes(self, laparams, boxes): def group_textboxes(self, laparams, boxes):
def dist(obj1, obj2): def dist((x0,y0,x1,y1), obj1, obj2):
"""A distance function between two TextBoxes. """A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2. Consider the bounding rectangle for obj1 and obj2.
Return its area less the areas of obj1 and obj2, Return its area less the areas of obj1 and obj2,
shown as 'www' below. This value may be negative. shown as 'www' below. This value may be negative.
+------+..........+ +------+..........+ (x1,y1)
| obj1 |wwwwwwwwww: | obj1 |wwwwwwwwww:
+------+www+------+ +------+www+------+
:wwwwwwwwww| obj2 | :wwwwwwwwww| obj2 |
+..........+------+ (x0,y0) +..........+------+
""" """
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
(obj1.width*obj1.height + obj2.width*obj2.height))
boxes = boxes[:] boxes = boxes[:]
# XXX this is slow when there're many textboxes. # XXX this is slow when there're many textboxes.
while 2 <= len(boxes): while 2 <= len(boxes):
mindist = INF mindist = INF
minpair = None minpair = None
plane = Plane(boxes)
boxes = csort(boxes, key=lambda obj: obj.width*obj.height) boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
for i in xrange(len(boxes)): for i in xrange(len(boxes)):
for j in xrange(i+1, len(boxes)): for j in xrange(i+1, len(boxes)):
(obj1, obj2) = (boxes[i], boxes[j]) (obj1, obj2) = (boxes[i], boxes[j])
d = dist(obj1, obj2) b = (min(obj1.x0,obj2.x0), min(obj1.y0,obj2.y0),
if d < mindist: max(obj1.x1,obj2.x1), max(obj1.y1,obj2.y1))
others = set(plane.find(b)).difference((obj1,obj2))
d = dist(b, obj1, obj2)
# disregard if there's any other object in between.
if 0 < d and others:
d *= 2
if mindist <= d: continue
mindist = d mindist = d
minpair = (obj1, obj2) minpair = (obj1, obj2)
assert minpair assert minpair is not None, boxes
(obj1, obj2) = minpair (obj1, obj2) = minpair
boxes.remove(obj1) boxes.remove(obj1)
boxes.remove(obj2) boxes.remove(obj2)