diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 73e707a..aa9a049 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,23 +1,10 @@ #!/usr/bin/env python2 import sys from utils import apply_matrix_pt, get_bound, INF -from utils import bsearch, bbox2str, matrix2str, Plane +from utils import bbox2str, matrix2str, uniq, csort, Plane from pdffont import PDFUnicodeNotDefined -def uniq(objs): - done = set() - for obj in objs: - if obj in done: continue - done.add(obj) - yield obj - return - -def csort(objs, key): - idxs = dict( (obj,i) for (i,obj) in enumerate(objs) ) - return sorted(objs, key=lambda obj:(key(obj), idxs[obj])) - - ## LAParams ## class LAParams(object): @@ -567,9 +554,9 @@ class LTLayoutContainer(LTContainer): """ return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) boxes = boxes[:] - # XXX this is slow when there're many textboxes. + # XXX this is very slow when there're many textboxes. while 2 <= len(boxes): - mindist = INF + mindist = (INF,0) minpair = None plane = Plane(boxes) boxes = csort(boxes, key=lambda obj: obj.width*obj.height) @@ -582,7 +569,9 @@ class LTLayoutContainer(LTContainer): d = dist(b, obj1, obj2) # disregard if there's any other object in between. if 0 < d and others: - d *= 2 + d = (1,d) + else: + d = (0,d) if mindist <= d: continue mindist = d minpair = (obj1, obj2) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 0eee82d..fcb21a0 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -32,6 +32,28 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)): ## Utility functions ## +# uniq +def uniq(objs): + '''Eliminates duplicated elements.''' + done = set() + for obj in objs: + if obj in done: continue + done.add(obj) + yield obj + return + +# csort +def csort(objs, key): + '''Order-preserving sorting function.''' + idxs = dict( (obj,i) for (i,obj) in enumerate(objs) ) + return sorted(objs, key=lambda obj:(key(obj), idxs[obj])) + +# drange +def drange(v0, v1, d): + '''Returns a discrete range.''' + assert v0 < v1 + return xrange(int(v0)/d, int(v1+d-1)/d) + # get_bound def get_bound(pts): '''Compute a minimal rectangle that covers all the points.''' @@ -53,28 +75,6 @@ def pick(seq, func, maxobj=None): (maxscore,maxobj) = (score,obj) return maxobj -# bsearch -def bsearch(objs, v0): - '''Tries to find the closest value to v0.''' - nb_objs = len(objs) - i0 = 0 - i1 = nb_objs - while i0 < i1: - i = (i0+i1)/2 - (v, obj) = objs[i] - if v0 == v: - (i0,i1) = (i,i+1) - while 0 < i0 and objs[i0-1][0] == v0: - i0 -= 1 - while i1 < nb_objs-1 and objs[i1][0] == v0: - i1 += 1 - break - elif v0 < v: - i1 = i - else: - i0 = i+1 - return (i0,i1) - # choplist def choplist(n, seq): '''Groups every n elements of the list.''' @@ -191,47 +191,46 @@ class ObjIdRange(object): ## class Plane(object): - def __init__(self, objs=None): - self._idxs = {} - self._xobjs = [] - self._yobjs = [] + def __init__(self, objs=None, gridsize=50): + self._objs = {} + self.gridsize = gridsize if objs is not None: for obj in objs: self.add(obj) - self.finish() return def __repr__(self): return ('' % list(self)) - def __iter__(self): - return self._idxs.iterkeys() - + def _getrange(self, (x0,y0,x1,y1)): + for y in drange(y0, y1, self.gridsize): + for x in drange(x0, x1, self.gridsize): + yield (x,y) + return + # add(obj): place an object in a certain area. def add(self, obj): - self._idxs[obj] = len(self._idxs) - self._xobjs.append((obj.x0, obj)) - self._xobjs.append((obj.x1, obj)) - self._yobjs.append((obj.y0, obj)) - self._yobjs.append((obj.y1, obj)) - return - - # finish() - def finish(self): - self._xobjs.sort() - self._yobjs.sort() + for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): + if k not in self._objs: + r = [] + self._objs[k] = r + else: + r = self._objs[k] + r.append(obj) return # find(): finds objects that are in a certain area. def find(self, (x0,y0,x1,y1)): - i0 = bsearch(self._xobjs, x0)[0] - i1 = bsearch(self._xobjs, x1)[1] - xobjs = set( obj for (_,obj) in self._xobjs[i0:i1] ) - i0 = bsearch(self._yobjs, y0)[0] - i1 = bsearch(self._yobjs, y1)[1] - yobjs = set( obj for (_,obj) in self._yobjs[i0:i1] ) - xobjs.intersection_update(yobjs) - return sorted(xobjs, key=lambda obj: self._idxs[obj]) + r = set() + for k in self._getrange((x0,y0,x1,y1)): + if k not in self._objs: continue + for obj in self._objs[k]: + if obj in r: continue + r.add(obj) + if (obj.x1 <= x0 or x1 <= obj.x0 or + obj.y1 <= y0 or y1 <= obj.y0): continue + yield obj + return # create_bmp @@ -239,5 +238,6 @@ def create_bmp(data, bits, width, height): info = pack('