layout analysis improvement

pull/1/head
Yusuke Shinyama 2011-02-27 12:56:28 +09:00
parent 7dbb664db3
commit cabaa10e4f
2 changed files with 55 additions and 66 deletions

View File

@ -1,23 +1,10 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import sys import sys
from utils import apply_matrix_pt, get_bound, INF from utils import apply_matrix_pt, get_bound, INF
from utils import bsearch, bbox2str, matrix2str, Plane from utils import bbox2str, matrix2str, uniq, csort, Plane
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
def uniq(objs):
    '''Yields each distinct element of objs once, in first-seen order.

    Elements must be hashable, because the ones already produced are
    tracked in a set.
    '''
    emitted = set()
    for element in objs:
        if element not in emitted:
            emitted.add(element)
            yield element
def csort(objs, key):
    '''Sorts objs by key(obj), breaking ties by position in objs.

    Each object's position is recorded in a dict first, so the objects
    must be hashable; if an object occurs more than once, the position
    of its last occurrence is the one used for tie-breaking.
    '''
    position = {}
    for (index, item) in enumerate(objs):
        position[item] = index

    def rank(item):
        return (key(item), position[item])

    return sorted(objs, key=rank)
## LAParams ## LAParams
## ##
class LAParams(object): class LAParams(object):
@ -567,9 +554,9 @@ class LTLayoutContainer(LTContainer):
""" """
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
boxes = boxes[:] boxes = boxes[:]
# XXX this is slow when there're many textboxes. # XXX this is very slow when there're many textboxes.
while 2 <= len(boxes): while 2 <= len(boxes):
mindist = INF mindist = (INF,0)
minpair = None minpair = None
plane = Plane(boxes) plane = Plane(boxes)
boxes = csort(boxes, key=lambda obj: obj.width*obj.height) boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
@ -582,7 +569,9 @@ class LTLayoutContainer(LTContainer):
d = dist(b, obj1, obj2) d = dist(b, obj1, obj2)
# disregard if there's any other object in between. # disregard if there's any other object in between.
if 0 < d and others: if 0 < d and others:
d *= 2 d = (1,d)
else:
d = (0,d)
if mindist <= d: continue if mindist <= d: continue
mindist = d mindist = d
minpair = (obj1, obj2) minpair = (obj1, obj2)

View File

@ -32,6 +32,28 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
## Utility functions ## Utility functions
## ##
# uniq
def uniq(objs):
    '''Lazily drops repeated elements, keeping the first occurrence.'''
    seen = set()
    for item in objs:
        # Only hashable items are supported: membership is tested in a set.
        first_time = item not in seen
        if first_time:
            seen.add(item)
            yield item
# csort
def csort(objs, key):
    '''Returns objs sorted by key(obj), with ties ordered by position.

    The secondary sort value is each object's index in objs, looked up
    through a dict (so objects must be hashable, and a repeated object
    is ranked by its last index).
    '''
    order = {}
    for (pos, member) in enumerate(objs):
        order[member] = pos
    decorated = [((key(member), order[member]), member) for member in objs]
    # Sort on the (key, index) pair only; never compare the objects.
    decorated.sort(key=lambda pair: pair[0])
    return [member for (_, member) in decorated]
# drange
def drange(v0, v1, d):
    '''Returns the indices of the grid cells of size d that span v0..v1.

    Cell k covers the half-open interval [k*d, (k+1)*d).

    v0, v1: lower and upper bound of the span (v0 <= v1); may be
            fractional and/or negative.
    d:      positive cell size.

    Returns an xrange over the cell indices; it always contains at
    least one index, so a zero-extent span still maps to the cell it
    lies in.
    '''
    # Function-scope import keeps this block self-contained.
    from math import ceil, floor
    assert v0 <= v1
    step = float(d)
    # floor/ceil on the true quotient: truncating with int() first rounds
    # negative fractional bounds toward zero and loses a fractional upper
    # bound that pokes just past a cell edge.
    first = int(floor(v0 / step))
    last = int(ceil(v1 / step))
    if last <= first:
        # Zero-extent span sitting exactly on a cell edge.
        last = first + 1
    return xrange(first, last)
# get_bound # get_bound
def get_bound(pts): def get_bound(pts):
'''Compute a minimal rectangle that covers all the points.''' '''Compute a minimal rectangle that covers all the points.'''
@ -53,28 +75,6 @@ def pick(seq, func, maxobj=None):
(maxscore,maxobj) = (score,obj) (maxscore,maxobj) = (score,obj)
return maxobj return maxobj
# bsearch
def bsearch(objs, v0):
    '''Locates the run of entries whose value equals v0.

    objs: a sequence of (value, obj) pairs sorted by value.
    v0:   the value to look for.

    Returns (i0, i1) such that objs[i0:i1] are exactly the entries
    whose value equals v0.  When no entry matches, i0 == i1 is the
    index at which v0 would have to be inserted to keep objs sorted.
    '''
    nb_objs = len(objs)
    i0 = 0
    i1 = nb_objs
    while i0 < i1:
        # Floor division keeps the midpoint an integer index.
        i = (i0+i1)//2
        v = objs[i][0]
        if v0 == v:
            # Found one match; widen to cover every equal neighbour.
            (i0,i1) = (i,i+1)
            while 0 < i0 and objs[i0-1][0] == v0:
                i0 -= 1
            # i1 is an exclusive bound, so it may grow up to nb_objs;
            # stopping at nb_objs-1 would drop a matching last entry.
            while i1 < nb_objs and objs[i1][0] == v0:
                i1 += 1
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i+1
    return (i0,i1)
# choplist # choplist
def choplist(n, seq): def choplist(n, seq):
'''Groups every n elements of the list.''' '''Groups every n elements of the list.'''
@ -191,47 +191,46 @@ class ObjIdRange(object):
## ##
class Plane(object): class Plane(object):
def __init__(self, objs=None): def __init__(self, objs=None, gridsize=50):
self._idxs = {} self._objs = {}
self._xobjs = [] self.gridsize = gridsize
self._yobjs = []
if objs is not None: if objs is not None:
for obj in objs: for obj in objs:
self.add(obj) self.add(obj)
self.finish()
return return
def __repr__(self): def __repr__(self):
return ('<Plane objs=%r>' % list(self)) return ('<Plane objs=%r>' % list(self))
def __iter__(self): def _getrange(self, (x0,y0,x1,y1)):
return self._idxs.iterkeys() for y in drange(y0, y1, self.gridsize):
for x in drange(x0, x1, self.gridsize):
yield (x,y)
return
# add(obj): place an object in a certain area. # add(obj): place an object in a certain area.
def add(self, obj): def add(self, obj):
self._idxs[obj] = len(self._idxs) for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
self._xobjs.append((obj.x0, obj)) if k not in self._objs:
self._xobjs.append((obj.x1, obj)) r = []
self._yobjs.append((obj.y0, obj)) self._objs[k] = r
self._yobjs.append((obj.y1, obj)) else:
return r = self._objs[k]
r.append(obj)
# finish()
def finish(self):
self._xobjs.sort()
self._yobjs.sort()
return return
# find(): finds objects that are in a certain area. # find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)): def find(self, (x0,y0,x1,y1)):
i0 = bsearch(self._xobjs, x0)[0] r = set()
i1 = bsearch(self._xobjs, x1)[1] for k in self._getrange((x0,y0,x1,y1)):
xobjs = set( obj for (_,obj) in self._xobjs[i0:i1] ) if k not in self._objs: continue
i0 = bsearch(self._yobjs, y0)[0] for obj in self._objs[k]:
i1 = bsearch(self._yobjs, y1)[1] if obj in r: continue
yobjs = set( obj for (_,obj) in self._yobjs[i0:i1] ) r.add(obj)
xobjs.intersection_update(yobjs) if (obj.x1 <= x0 or x1 <= obj.x0 or
return sorted(xobjs, key=lambda obj: self._idxs[obj]) obj.y1 <= y0 or y1 <= obj.y0): continue
yield obj
return
# create_bmp # create_bmp
@ -239,5 +238,6 @@ def create_bmp(data, bits, width, height):
info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0) info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
assert len(info) == 40, len(info) assert len(info) == 40, len(info)
header = pack('<ccIHHI', 'B', 'M', 14+40+len(data), 0, 0, 14+40) header = pack('<ccIHHI', 'B', 'M', 14+40+len(data), 0, 0, 14+40)
assert len(header) == 14, len(header)
# XXX re-rasterize every line # XXX re-rasterize every line
return header+info+data return header+info+data