# Patch summary (leading commentary; patch tools skip text before the first
# "diff --git" header).
#
# pdfminer/layout.py, inside class LTLayoutContainer:
#   * The Plane index is now built once, before the `while 2 <= len(boxes)`
#     loop, instead of being rebuilt from `boxes` on every pass of that loop.
#   * To keep the single Plane in step with `boxes`, the two objects of the
#     chosen pair are removed from the plane when they are removed from
#     `boxes`, and the newly created group is added to the plane when it is
#     appended to `boxes`.
#   * `boxes.append(group.analyze(laparams))` becomes a call to
#     `group.analyze(laparams)` followed by `boxes.append(group)`, so the
#     group object itself is stored rather than analyze()'s return value.
#     NOTE(review): equivalent only if analyze() returned the group before --
#     confirm against LTTextGroup.analyze.
#
# pdfminer/utils.py, inside class Plane:
#   * New method remove(obj): for every grid key produced by _getrange() for
#     the object's (x0, y0, x1, y1), it removes obj from the list stored under
#     that key. KeyError and ValueError are swallowed, so removing an object
#     that is not stored under a key is a silent no-op.
#     NOTE(review): the keys are recomputed from the object's current
#     coordinates; presumably these do not change between add() and remove()
#     -- verify against callers.
#   * The comment above add() is shortened.
#
# NOTE(review): this patch arrived with its line breaks collapsed. Line
# boundaries and blank context lines below were reconstructed from the hunk
# headers' line counts, and indentation from the Python block structure;
# verify against the original commit before applying.
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index a9237a8..6b8e911 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -546,11 +546,11 @@ class LTLayoutContainer(LTContainer):
             """
             return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
         boxes = boxes[:]
+        plane = Plane(boxes)
         # XXX this is very slow when there're many textboxes.
         while 2 <= len(boxes):
             mindist = (INF,0)
             minpair = None
-            plane = Plane(boxes)
             boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
             for i in xrange(len(boxes)):
                 for j in xrange(i+1, len(boxes)):
@@ -571,6 +571,8 @@ class LTLayoutContainer(LTContainer):
             (obj1, obj2) = minpair
             boxes.remove(obj1)
             boxes.remove(obj2)
+            plane.remove(obj1)
+            plane.remove(obj2)
             if (isinstance(obj1, LTTextBoxVertical) or
                 isinstance(obj2, LTTextBoxVertical) or
                 isinstance(obj1, LTTextGroupTBRL) or
@@ -578,7 +580,9 @@ class LTLayoutContainer(LTContainer):
                 group = LTTextGroupTBRL([obj1, obj2])
             else:
                 group = LTTextGroupLRTB([obj1, obj2])
-            boxes.append(group.analyze(laparams))
+            group.analyze(laparams)
+            boxes.append(group)
+            plane.add(group)
         assert len(boxes) == 1
         return boxes.pop()
 
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index d3d0b76..0d531c3 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -220,7 +220,7 @@ class Plane(object):
                 yield (x,y)
         return
 
-    # add(obj): place an object in a certain area.
+    # add(obj): place an object.
     def add(self, obj):
         for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
             if k not in self._objs:
@@ -231,6 +231,15 @@ class Plane(object):
             r.append(obj)
         return
 
+    # remove(obj): displace an object.
+    def remove(self, obj):
+        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
+            try:
+                self._objs[k].remove(obj)
+            except (KeyError, ValueError):
+                pass
+        return
+
     # find(): finds objects that are in a certain area.
     def find(self, (x0,y0,x1,y1)):
         r = set()