layout analysis improvement
parent
7dbb664db3
commit
cabaa10e4f
|
@ -1,23 +1,10 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
import sys
|
import sys
|
||||||
from utils import apply_matrix_pt, get_bound, INF
|
from utils import apply_matrix_pt, get_bound, INF
|
||||||
from utils import bsearch, bbox2str, matrix2str, Plane
|
from utils import bbox2str, matrix2str, uniq, csort, Plane
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
|
|
||||||
|
|
||||||
def uniq(objs):
|
|
||||||
done = set()
|
|
||||||
for obj in objs:
|
|
||||||
if obj in done: continue
|
|
||||||
done.add(obj)
|
|
||||||
yield obj
|
|
||||||
return
|
|
||||||
|
|
||||||
def csort(objs, key):
|
|
||||||
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
|
||||||
return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
|
|
||||||
|
|
||||||
|
|
||||||
## LAParams
|
## LAParams
|
||||||
##
|
##
|
||||||
class LAParams(object):
|
class LAParams(object):
|
||||||
|
@ -567,9 +554,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
"""
|
"""
|
||||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
boxes = boxes[:]
|
boxes = boxes[:]
|
||||||
# XXX this is slow when there're many textboxes.
|
# XXX this is very slow when there're many textboxes.
|
||||||
while 2 <= len(boxes):
|
while 2 <= len(boxes):
|
||||||
mindist = INF
|
mindist = (INF,0)
|
||||||
minpair = None
|
minpair = None
|
||||||
plane = Plane(boxes)
|
plane = Plane(boxes)
|
||||||
boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
|
boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
|
||||||
|
@ -582,7 +569,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
d = dist(b, obj1, obj2)
|
d = dist(b, obj1, obj2)
|
||||||
# disregard if there's any other object in between.
|
# disregard if there's any other object in between.
|
||||||
if 0 < d and others:
|
if 0 < d and others:
|
||||||
d *= 2
|
d = (1,d)
|
||||||
|
else:
|
||||||
|
d = (0,d)
|
||||||
if mindist <= d: continue
|
if mindist <= d: continue
|
||||||
mindist = d
|
mindist = d
|
||||||
minpair = (obj1, obj2)
|
minpair = (obj1, obj2)
|
||||||
|
|
|
@ -32,6 +32,28 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
## Utility functions
|
## Utility functions
|
||||||
##
|
##
|
||||||
|
|
||||||
|
# uniq
|
||||||
|
def uniq(objs):
|
||||||
|
'''Eliminates duplicated elements.'''
|
||||||
|
done = set()
|
||||||
|
for obj in objs:
|
||||||
|
if obj in done: continue
|
||||||
|
done.add(obj)
|
||||||
|
yield obj
|
||||||
|
return
|
||||||
|
|
||||||
|
# csort
|
||||||
|
def csort(objs, key):
|
||||||
|
'''Order-preserving sorting function.'''
|
||||||
|
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||||
|
return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
|
||||||
|
|
||||||
|
# drange
|
||||||
|
def drange(v0, v1, d):
|
||||||
|
'''Returns a discrete range.'''
|
||||||
|
assert v0 < v1
|
||||||
|
return xrange(int(v0)/d, int(v1+d-1)/d)
|
||||||
|
|
||||||
# get_bound
|
# get_bound
|
||||||
def get_bound(pts):
|
def get_bound(pts):
|
||||||
'''Compute a minimal rectangle that covers all the points.'''
|
'''Compute a minimal rectangle that covers all the points.'''
|
||||||
|
@ -53,28 +75,6 @@ def pick(seq, func, maxobj=None):
|
||||||
(maxscore,maxobj) = (score,obj)
|
(maxscore,maxobj) = (score,obj)
|
||||||
return maxobj
|
return maxobj
|
||||||
|
|
||||||
# bsearch
|
|
||||||
def bsearch(objs, v0):
|
|
||||||
'''Tries to find the closest value to v0.'''
|
|
||||||
nb_objs = len(objs)
|
|
||||||
i0 = 0
|
|
||||||
i1 = nb_objs
|
|
||||||
while i0 < i1:
|
|
||||||
i = (i0+i1)/2
|
|
||||||
(v, obj) = objs[i]
|
|
||||||
if v0 == v:
|
|
||||||
(i0,i1) = (i,i+1)
|
|
||||||
while 0 < i0 and objs[i0-1][0] == v0:
|
|
||||||
i0 -= 1
|
|
||||||
while i1 < nb_objs-1 and objs[i1][0] == v0:
|
|
||||||
i1 += 1
|
|
||||||
break
|
|
||||||
elif v0 < v:
|
|
||||||
i1 = i
|
|
||||||
else:
|
|
||||||
i0 = i+1
|
|
||||||
return (i0,i1)
|
|
||||||
|
|
||||||
# choplist
|
# choplist
|
||||||
def choplist(n, seq):
|
def choplist(n, seq):
|
||||||
'''Groups every n elements of the list.'''
|
'''Groups every n elements of the list.'''
|
||||||
|
@ -191,47 +191,46 @@ class ObjIdRange(object):
|
||||||
##
|
##
|
||||||
class Plane(object):
|
class Plane(object):
|
||||||
|
|
||||||
def __init__(self, objs=None):
|
def __init__(self, objs=None, gridsize=50):
|
||||||
self._idxs = {}
|
self._objs = {}
|
||||||
self._xobjs = []
|
self.gridsize = gridsize
|
||||||
self._yobjs = []
|
|
||||||
if objs is not None:
|
if objs is not None:
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
self.add(obj)
|
self.add(obj)
|
||||||
self.finish()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<Plane objs=%r>' % list(self))
|
return ('<Plane objs=%r>' % list(self))
|
||||||
|
|
||||||
def __iter__(self):
|
def _getrange(self, (x0,y0,x1,y1)):
|
||||||
return self._idxs.iterkeys()
|
for y in drange(y0, y1, self.gridsize):
|
||||||
|
for x in drange(x0, x1, self.gridsize):
|
||||||
|
yield (x,y)
|
||||||
|
return
|
||||||
|
|
||||||
# add(obj): place an object in a certain area.
|
# add(obj): place an object in a certain area.
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
self._idxs[obj] = len(self._idxs)
|
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||||
self._xobjs.append((obj.x0, obj))
|
if k not in self._objs:
|
||||||
self._xobjs.append((obj.x1, obj))
|
r = []
|
||||||
self._yobjs.append((obj.y0, obj))
|
self._objs[k] = r
|
||||||
self._yobjs.append((obj.y1, obj))
|
else:
|
||||||
return
|
r = self._objs[k]
|
||||||
|
r.append(obj)
|
||||||
# finish()
|
|
||||||
def finish(self):
|
|
||||||
self._xobjs.sort()
|
|
||||||
self._yobjs.sort()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
i0 = bsearch(self._xobjs, x0)[0]
|
r = set()
|
||||||
i1 = bsearch(self._xobjs, x1)[1]
|
for k in self._getrange((x0,y0,x1,y1)):
|
||||||
xobjs = set( obj for (_,obj) in self._xobjs[i0:i1] )
|
if k not in self._objs: continue
|
||||||
i0 = bsearch(self._yobjs, y0)[0]
|
for obj in self._objs[k]:
|
||||||
i1 = bsearch(self._yobjs, y1)[1]
|
if obj in r: continue
|
||||||
yobjs = set( obj for (_,obj) in self._yobjs[i0:i1] )
|
r.add(obj)
|
||||||
xobjs.intersection_update(yobjs)
|
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||||
return sorted(xobjs, key=lambda obj: self._idxs[obj])
|
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||||
|
yield obj
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
# create_bmp
|
# create_bmp
|
||||||
|
@ -239,5 +238,6 @@ def create_bmp(data, bits, width, height):
|
||||||
info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
|
info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
|
||||||
assert len(info) == 40, len(info)
|
assert len(info) == 40, len(info)
|
||||||
header = pack('<ccIHHI', 'B', 'M', 14+40+len(data), 0, 0, 14+40)
|
header = pack('<ccIHHI', 'B', 'M', 14+40+len(data), 0, 0, 14+40)
|
||||||
|
assert len(header) == 14, len(header)
|
||||||
# XXX re-rasterize every line
|
# XXX re-rasterize every line
|
||||||
return header+info+data
|
return header+info+data
|
||||||
|
|
Loading…
Reference in New Issue