#!/usr/bin/env python
"""Group page objects that lie close to each other into clusters.

The code is written to run unchanged on both Python 2 and Python 3.
"""
import sys


##  binary search
##
def bsearch(objs, v0, v1):
    """Return the payloads of all entries whose key lies in [v0, v1].

    objs -- list of (key, payload) pairs, sorted by key in ascending order.
    v0, v1 -- inclusive lower and upper bound of the key range (v0 <= v1).

    The payloads are returned in list order; an empty list is returned
    when no key falls inside the range.
    """
    assert v0 <= v1
    # Lower bound: index of the first entry whose key is >= v0.
    lo = 0
    hi = len(objs)
    while lo < hi:
        # Floor division keeps the index an int under true division as well.
        mid = (lo + hi) // 2
        if objs[mid][0] < v0:
            lo = mid + 1
        else:
            hi = mid
    start = lo
    # Upper bound: index of the first entry whose key is > v1.
    hi = len(objs)
    while lo < hi:
        mid = (lo + hi) // 2
        if v1 < objs[mid][0]:
            hi = mid
        else:
            lo = mid + 1
    return [obj for (_, obj) in objs[start:lo]]


##  Plane
##
class Plane(object):
    """Index of objects by the x and y coordinates of their bounding boxes.

    Usage: call add() for every object, then finish() once, then find().
    """

    def __init__(self):
        # Lists of (coordinate, obj); each object contributes two entries
        # per axis, one for each edge of its bounding box.
        self.xobjs = []
        self.yobjs = []

    def add(self, bbox, obj):
        """Register obj under the four edges of bbox = (x0, y0, x1, y1)."""
        (x0, y0, x1, y1) = bbox
        self.xobjs.append((x0, obj))
        self.xobjs.append((x1, obj))
        self.yobjs.append((y0, obj))
        self.yobjs.append((y1, obj))

    def finish(self):
        """Sort the indexes; must be called after the last add()."""
        # Sort on the coordinate only, so that entries with equal
        # coordinates never fall back to comparing the objects themselves
        # (arbitrary objects are not orderable on Python 3).
        self.xobjs.sort(key=lambda entry: entry[0])
        self.yobjs.sort(key=lambda entry: entry[0])

    def find(self, bbox):
        """Return the set of objects near bbox = (x0, y0, x1, y1).

        An object is returned when at least one of its x edges lies in
        [x0, x1] and at least one of its y edges lies in [y0, y1].
        Objects whose edges all fall outside the range on an axis (for
        example one that spans the whole range) are not reported.
        """
        (x0, y0, x1, y1) = bbox
        xobjs = set(bsearch(self.xobjs, x0, x1))
        yobjs = set(bsearch(self.yobjs, y0, y1))
        return xobjs.intersection(yobjs)


##  ClusterSet
##
class ClusterSet(object):
    """Mapping of objects to clusters, where a cluster is a tuple of objects.

    Every member of a cluster maps to the same tuple.
    """

    def __init__(self):
        # obj -> tuple holding all members of the cluster obj belongs to.
        self.clusters = {}

    def add(self, obj):
        """Put obj into a cluster of its own, replacing its previous entry."""
        self.clusters[obj] = (obj,)

    def merge(self, objs):
        """Join objs and every cluster they already belong to into one."""
        # Snapshot first so that one-shot iterables can be walked twice.
        seeds = list(objs)
        allobjs = set(seeds)
        for obj in seeds:
            if obj in self.clusters:
                allobjs.update(self.clusters[obj])
        c = tuple(allobjs)
        for obj in allobjs:
            self.clusters[obj] = c

    def finish(self):
        """Return the set of distinct cluster tuples."""
        return set(self.clusters.values())


def cluster_pageobjs(objs, ratio):
    """Cluster neighbouring page objects and describe every cluster.

    objs -- sequence of hashable objects providing the attributes
            bbox (x0, y0, x1, y1), fontsize (a number) and text
            (anything supporting len()).
    ratio -- factor applied to an object's fontsize to obtain the margin
             by which its bounding box is grown when looking for
             neighbours.

    Returns a list of (bbox, vertical, members) tuples, one per cluster:
    bbox is the box accumulated from the members' bboxes, vertical is
    True when more steps between consecutive members were counted as
    vertical than as horizontal, and members is the list of objects in
    their original order.  Clusters are ordered by the position of their
    earliest member in objs.
    """
    idx = dict((obj, i) for (i, obj) in enumerate(objs))
    plane = Plane()
    for obj in objs:
        plane.add(obj.bbox, obj)
    plane.finish()

    cset = ClusterSet()
    for obj in objs:
        (bx0, by0, bx1, by1) = obj.bbox
        margin = abs(obj.fontsize * ratio)
        # The box may be given with its corners in either order.
        x0 = min(bx0, bx1)
        y0 = min(by0, by1)
        x1 = max(bx0, bx1)
        y1 = max(by0, by1)
        found = plane.find((x0 - margin, y0 - margin,
                            x1 + margin, y1 + margin))
        # Always merge, even when only the object itself was found:
        # resetting it to a singleton would detach it from a cluster that
        # an earlier object with a larger margin already put it into and
        # make it show up in two clusters.
        cset.merge(found)

    # Order the members first so that "first member" is well defined and
    # the cluster order does not depend on set iteration order.
    clusters = [sorted(members, key=lambda obj: idx[obj])
                for members in cset.finish()]
    clusters.sort(key=lambda members: idx[members[0]])

    r = []
    for members in clusters:
        h = v = 0
        # NOTE(review): the accumulated box takes the members' bbox
        # corners as given (x0 <= x1, y0 <= y1 presumably) -- confirm.
        (bx0, by0, bx1, by1) = members[0].bbox
        (lx0, ly0, _, _) = members[0].bbox
        for obj in members[1:]:
            (x0, y0, x1, y1) = obj.bbox
            # A single-character object that moved further along y than
            # along x relative to its predecessor counts as a vertical
            # step; everything else counts as horizontal.
            if len(obj.text) == 1 and abs(lx0 - x0) < abs(ly0 - y0):
                v += 1
            else:
                h += 1
            (lx0, ly0) = (x0, y0)
            bx0 = min(bx0, x0)
            bx1 = max(bx1, x1)
            by0 = min(by0, y0)
            by1 = max(by1, y1)
        r.append(((bx0, by0, bx1, by1), h < v, members))
    return r