code cleanup and testcase stabilization

pull/1/head
Yusuke Shinyama 2011-05-15 01:22:19 +09:00
parent e5d02f8653
commit c134596e2f
2 changed files with 42 additions and 37 deletions

View File

@ -459,30 +459,6 @@ class LTLayoutContainer(LTContainer):
self.groups = None self.groups = None
return return
def analyze(self, laparams):
# Analyze the layout of the objects held in self._objs and store the
# resulting text boxes (plus the remaining objects) back into self._objs
# and the box groups into self.groups.
#
# textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page.
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
# Everything that is not a character is analyzed on its own.
for obj in otherobjs:
obj.analyze(laparams)
# No characters: there is no text layout to build, so stop here and
# leave self._objs and self.groups untouched.
if not textobjs: return
textlines = list(self.get_textlines(laparams, textobjs))
# Sanity check: the lines together hold at least as many objects as there
# were characters to begin with.
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
# Set aside the lines reporting is_empty(); they are analyzed here but are
# not passed on to get_textboxes().
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.get_textboxes(laparams, textlines))
# Sanity check: every remaining line ended up in exactly one box.
assert len(textlines) == sum( len(box._objs) for box in textboxes )
groups = self.group_textboxes(laparams, textboxes)
# A single assigner is shared by all groups.
# NOTE(review): presumably IndexAssigner.run() sets the box.index values
# used by the sort below -- confirm against IndexAssigner.
assigner = IndexAssigner()
for group in groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box:box.index)
# Final object order: sorted text boxes, then the non-character objects,
# then the empty lines.
self._objs = textboxes + otherobjs + empties
self.groups = groups
return
def get_textlines(self, laparams, objs): def get_textlines(self, laparams, objs):
obj0 = None obj0 = None
line = None line = None
@ -593,6 +569,15 @@ class LTLayoutContainer(LTContainer):
x1 = max(obj1.x1,obj2.x1) x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1) y1 = max(obj1.y1,obj2.y1)
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2):
    """Check if there's any other object between obj1 and obj2.

    Queries the enclosing scope's plane with the smallest rectangle that
    covers both objects and returns the set of objects found there,
    excluding obj1 and obj2 themselves. An empty (falsy) set means that
    nothing else was found.
    """
    bbox = (min(obj1.x0, obj2.x0),
            min(obj1.y0, obj2.y0),
            max(obj1.x1, obj2.x1),
            max(obj1.y1, obj2.y1))
    found = set(plane.find(bbox))
    return found.difference((obj1, obj2))
# XXX this still takes O(n^2) :( # XXX this still takes O(n^2) :(
dists = [] dists = []
for i in xrange(len(boxes)): for i in xrange(len(boxes)):
@ -604,11 +589,7 @@ class LTLayoutContainer(LTContainer):
plane = Plane(boxes) plane = Plane(boxes)
while dists: while dists:
(c,d,obj1,obj2) = dists.pop(0) (c,d,obj1,obj2) = dists.pop(0)
x0 = min(obj1.x0,obj2.x0) if c == 0 and isany(obj1, obj2):
y0 = min(obj1.y0,obj2.y0)
x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1)
if c == 0 and plane.find((x0,y0,x1,y1)).difference((obj1,obj2)):
dists.append((1,d,obj1,obj2)) dists.append((1,d,obj1,obj2))
continue continue
if (isinstance(obj1, LTTextBoxVertical) or if (isinstance(obj1, LTTextBoxVertical) or
@ -620,7 +601,8 @@ class LTLayoutContainer(LTContainer):
group = LTTextGroupLRTB([obj1,obj2]) group = LTTextGroupLRTB([obj1,obj2])
plane.remove(obj1) plane.remove(obj1)
plane.remove(obj2) plane.remove(obj2)
dists = [ (c,d,o1,o2) for (c,d,o1,o2) in dists if o1 not in (obj1,obj2) and o2 not in (obj1,obj2) ] dists = [ (c,d,o1,o2) for (c,d,o1,o2) in dists
if o1 in plane and o2 in plane ]
for other in plane: for other in plane:
dists.append((0, dist(group,other), group, other)) dists.append((0, dist(group,other), group, other))
dists.sort() dists.sort()
@ -628,6 +610,30 @@ class LTLayoutContainer(LTContainer):
assert len(plane) == 1 assert len(plane) == 1
return list(plane) return list(plane)
def analyze(self, laparams):
    """Analyze the layout of the objects held by this container.

    Characters are turned into text lines, the non-empty lines into text
    boxes, and the boxes into groups. On return self._objs holds the text
    boxes sorted by their index, followed by the non-character objects
    and the empty lines; self.groups holds the groups. When the container
    has no characters, only the non-character objects are analyzed and
    nothing is reassigned.
    """
    # chars holds the LTChar objects, i.e. all the individual
    # characters in the page; others holds everything else.
    (chars, others) = fsplit(lambda o: isinstance(o, LTChar), self._objs)
    for other in others:
        other.analyze(laparams)
    if not chars:
        return

    lines = list(self.get_textlines(laparams, chars))
    assert len(chars) <= sum(len(line._objs) for line in lines)

    # Empty lines are analyzed separately and kept out of the boxes.
    (blank_lines, lines) = fsplit(lambda line: line.is_empty(), lines)
    for blank in blank_lines:
        blank.analyze(laparams)

    boxes = list(self.get_textboxes(laparams, lines))
    assert len(lines) == sum(len(box._objs) for box in boxes)

    groups = self.group_textboxes(laparams, boxes)
    # One assigner is shared across all groups.
    assigner = IndexAssigner()
    for group in groups:
        group.analyze(laparams)
        assigner.run(group)

    boxes.sort(key=lambda box: box.index)
    self._objs = boxes + others + blank_lines
    self.groups = groups
    return
## LTFigure ## LTFigure
## ##

View File

@ -43,10 +43,10 @@ def uniq(objs):
return return
# csort # csort
def csort(objs, key=lambda x: x):
    """Order-preserving sorting function.

    Returns a new list with the items of objs ordered by key(obj). Items
    whose keys compare equal keep their original relative order.

    sorted() is guaranteed to be stable, so no explicit position
    tie-breaker is needed. Relying on it also means that the items do not
    have to be hashable and that objs may be a one-shot iterator, since
    it is traversed only once.
    """
    return sorted(objs, key=key)
# fsplit # fsplit
def fsplit(pred, objs): def fsplit(pred, objs):
@ -204,7 +204,7 @@ class ObjIdRange(object):
class Plane(object): class Plane(object):
def __init__(self, objs=None, gridsize=50): def __init__(self, objs=None, gridsize=50):
self._objs = set() self._objs = []
self._grid = {} self._grid = {}
self.gridsize = gridsize self.gridsize = gridsize
if objs is not None: if objs is not None:
@ -239,7 +239,7 @@ class Plane(object):
else: else:
r = self._grid[k] r = self._grid[k]
r.append(obj) r.append(obj)
self._objs.add(obj) self._objs.append(obj)
return return
# remove(obj): displace an object. # remove(obj): displace an object.
@ -254,7 +254,6 @@ class Plane(object):
# find(): finds objects that are in a certain area. # find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)): def find(self, (x0,y0,x1,y1)):
r = set()
done = set() done = set()
for k in self._getrange((x0,y0,x1,y1)): for k in self._getrange((x0,y0,x1,y1)):
if k not in self._grid: continue if k not in self._grid: continue
@ -263,8 +262,8 @@ class Plane(object):
done.add(obj) done.add(obj)
if (obj.x1 <= x0 or x1 <= obj.x0 or if (obj.x1 <= x0 or x1 <= obj.x0 or
obj.y1 <= y0 or y1 <= obj.y0): continue obj.y1 <= y0 or y1 <= obj.y0): continue
r.add(obj) yield obj
return r return
# create_bmp # create_bmp