code cleanup and testcase stabilization
parent
e5d02f8653
commit
c134596e2f
|
@ -459,30 +459,6 @@ class LTLayoutContainer(LTContainer):
|
||||||
self.groups = None
|
self.groups = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def analyze(self, laparams):
|
|
||||||
# textobjs is a list of LTChar objects, i.e.
|
|
||||||
# it has all the individual characters in the page.
|
|
||||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
|
||||||
for obj in otherobjs:
|
|
||||||
obj.analyze(laparams)
|
|
||||||
if not textobjs: return
|
|
||||||
textlines = list(self.get_textlines(laparams, textobjs))
|
|
||||||
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
|
||||||
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
|
||||||
for obj in empties:
|
|
||||||
obj.analyze(laparams)
|
|
||||||
textboxes = list(self.get_textboxes(laparams, textlines))
|
|
||||||
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
|
||||||
groups = self.group_textboxes(laparams, textboxes)
|
|
||||||
assigner = IndexAssigner()
|
|
||||||
for group in groups:
|
|
||||||
group.analyze(laparams)
|
|
||||||
assigner.run(group)
|
|
||||||
textboxes.sort(key=lambda box:box.index)
|
|
||||||
self._objs = textboxes + otherobjs + empties
|
|
||||||
self.groups = groups
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_textlines(self, laparams, objs):
|
def get_textlines(self, laparams, objs):
|
||||||
obj0 = None
|
obj0 = None
|
||||||
line = None
|
line = None
|
||||||
|
@ -593,6 +569,15 @@ class LTLayoutContainer(LTContainer):
|
||||||
x1 = max(obj1.x1,obj2.x1)
|
x1 = max(obj1.x1,obj2.x1)
|
||||||
y1 = max(obj1.y1,obj2.y1)
|
y1 = max(obj1.y1,obj2.y1)
|
||||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
|
def isany(obj1, obj2):
|
||||||
|
"""Check if there's any other object between obj1 and obj2.
|
||||||
|
"""
|
||||||
|
x0 = min(obj1.x0,obj2.x0)
|
||||||
|
y0 = min(obj1.y0,obj2.y0)
|
||||||
|
x1 = max(obj1.x1,obj2.x1)
|
||||||
|
y1 = max(obj1.y1,obj2.y1)
|
||||||
|
objs = set(plane.find((x0,y0,x1,y1)))
|
||||||
|
return objs.difference((obj1,obj2))
|
||||||
# XXX this still takes O(n^2) :(
|
# XXX this still takes O(n^2) :(
|
||||||
dists = []
|
dists = []
|
||||||
for i in xrange(len(boxes)):
|
for i in xrange(len(boxes)):
|
||||||
|
@ -604,11 +589,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
plane = Plane(boxes)
|
plane = Plane(boxes)
|
||||||
while dists:
|
while dists:
|
||||||
(c,d,obj1,obj2) = dists.pop(0)
|
(c,d,obj1,obj2) = dists.pop(0)
|
||||||
x0 = min(obj1.x0,obj2.x0)
|
if c == 0 and isany(obj1, obj2):
|
||||||
y0 = min(obj1.y0,obj2.y0)
|
|
||||||
x1 = max(obj1.x1,obj2.x1)
|
|
||||||
y1 = max(obj1.y1,obj2.y1)
|
|
||||||
if c == 0 and plane.find((x0,y0,x1,y1)).difference((obj1,obj2)):
|
|
||||||
dists.append((1,d,obj1,obj2))
|
dists.append((1,d,obj1,obj2))
|
||||||
continue
|
continue
|
||||||
if (isinstance(obj1, LTTextBoxVertical) or
|
if (isinstance(obj1, LTTextBoxVertical) or
|
||||||
|
@ -620,7 +601,8 @@ class LTLayoutContainer(LTContainer):
|
||||||
group = LTTextGroupLRTB([obj1,obj2])
|
group = LTTextGroupLRTB([obj1,obj2])
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
plane.remove(obj2)
|
plane.remove(obj2)
|
||||||
dists = [ (c,d,o1,o2) for (c,d,o1,o2) in dists if o1 not in (obj1,obj2) and o2 not in (obj1,obj2) ]
|
dists = [ (c,d,o1,o2) for (c,d,o1,o2) in dists
|
||||||
|
if o1 in plane and o2 in plane ]
|
||||||
for other in plane:
|
for other in plane:
|
||||||
dists.append((0, dist(group,other), group, other))
|
dists.append((0, dist(group,other), group, other))
|
||||||
dists.sort()
|
dists.sort()
|
||||||
|
@ -628,6 +610,30 @@ class LTLayoutContainer(LTContainer):
|
||||||
assert len(plane) == 1
|
assert len(plane) == 1
|
||||||
return list(plane)
|
return list(plane)
|
||||||
|
|
||||||
|
def analyze(self, laparams):
|
||||||
|
# textobjs is a list of LTChar objects, i.e.
|
||||||
|
# it has all the individual characters in the page.
|
||||||
|
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
||||||
|
for obj in otherobjs:
|
||||||
|
obj.analyze(laparams)
|
||||||
|
if not textobjs: return
|
||||||
|
textlines = list(self.get_textlines(laparams, textobjs))
|
||||||
|
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
||||||
|
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
||||||
|
for obj in empties:
|
||||||
|
obj.analyze(laparams)
|
||||||
|
textboxes = list(self.get_textboxes(laparams, textlines))
|
||||||
|
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
||||||
|
groups = self.group_textboxes(laparams, textboxes)
|
||||||
|
assigner = IndexAssigner()
|
||||||
|
for group in groups:
|
||||||
|
group.analyze(laparams)
|
||||||
|
assigner.run(group)
|
||||||
|
textboxes.sort(key=lambda box:box.index)
|
||||||
|
self._objs = textboxes + otherobjs + empties
|
||||||
|
self.groups = groups
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTFigure
|
## LTFigure
|
||||||
##
|
##
|
||||||
|
|
|
@ -43,10 +43,10 @@ def uniq(objs):
|
||||||
return
|
return
|
||||||
|
|
||||||
# csort
|
# csort
|
||||||
def csort(objs, key):
|
def csort(objs, key=lambda x:x):
|
||||||
"""Order-preserving sorting function."""
|
"""Order-preserving sorting function."""
|
||||||
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||||
return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
|
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
||||||
|
|
||||||
# fsplit
|
# fsplit
|
||||||
def fsplit(pred, objs):
|
def fsplit(pred, objs):
|
||||||
|
@ -204,7 +204,7 @@ class ObjIdRange(object):
|
||||||
class Plane(object):
|
class Plane(object):
|
||||||
|
|
||||||
def __init__(self, objs=None, gridsize=50):
|
def __init__(self, objs=None, gridsize=50):
|
||||||
self._objs = set()
|
self._objs = []
|
||||||
self._grid = {}
|
self._grid = {}
|
||||||
self.gridsize = gridsize
|
self.gridsize = gridsize
|
||||||
if objs is not None:
|
if objs is not None:
|
||||||
|
@ -239,7 +239,7 @@ class Plane(object):
|
||||||
else:
|
else:
|
||||||
r = self._grid[k]
|
r = self._grid[k]
|
||||||
r.append(obj)
|
r.append(obj)
|
||||||
self._objs.add(obj)
|
self._objs.append(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
# remove(obj): displace an object.
|
# remove(obj): displace an object.
|
||||||
|
@ -254,7 +254,6 @@ class Plane(object):
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
r = set()
|
|
||||||
done = set()
|
done = set()
|
||||||
for k in self._getrange((x0,y0,x1,y1)):
|
for k in self._getrange((x0,y0,x1,y1)):
|
||||||
if k not in self._grid: continue
|
if k not in self._grid: continue
|
||||||
|
@ -263,8 +262,8 @@ class Plane(object):
|
||||||
done.add(obj)
|
done.add(obj)
|
||||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||||
r.add(obj)
|
yield obj
|
||||||
return r
|
return
|
||||||
|
|
||||||
|
|
||||||
# create_bmp
|
# create_bmp
|
||||||
|
|
Loading…
Reference in New Issue