From 2e900e5d1064058bb54f217efcc65c9eb0a9dafe Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Thu, 26 Jun 2014 17:41:31 +0900 Subject: [PATCH] Fixed for consistent test results. (hopefully...) --- pdfminer/layout.py | 21 ++++++++++++++++++--- pdfminer/utils.py | 2 +- samples/nonfree/f1040nr.html.ref | 4 ++-- samples/nonfree/f1040nr.xml.ref | 24 ++++++++++++------------ 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 7037fe7..2b34a4b 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -81,6 +81,16 @@ class LTComponent(LTItem): return ('<%s %s>' % (self.__class__.__name__, bbox2str(self.bbox))) + # Disable comparison. + def __lt__(self, _): + raise ValueError + def __le__(self, _): + raise ValueError + def __gt__(self, _): + raise ValueError + def __ge__(self, _): + raise ValueError + def set_bbox(self, bbox): (x0, y0, x1, y1) = bbox self.x0 = x0 @@ -609,6 +619,10 @@ class LTLayoutContainer(LTContainer): y1 = max(obj1.y1, obj2.y1) objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) + + def key_obj(t): + (c,d,_,_) = t + return (c,d) # XXX this still takes O(n^2) :( dists = [] @@ -618,7 +632,7 @@ class LTLayoutContainer(LTContainer): obj2 = boxes[j] dists.append((0, dist(obj1, obj2), obj1, obj2)) # We could use dists.sort(), but it would randomize the test result. - dists = csort(dists) + dists = csort(dists, key=key_obj) plane = Plane(self.bbox) plane.extend(boxes) while dists: @@ -633,10 +647,11 @@ class LTLayoutContainer(LTContainer): group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) - dists = [ n for n in dists if (n[2] in plane and n[3] in plane) ] + dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists + if (obj1 in plane and obj2 in plane) ] for other in plane: dists.append((0, dist(group, other), group, other)) - dists = csort(dists) + dists = csort(dists, key=key_obj) plane.add(group) assert len(plane) == 1 return list(plane) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 80c735c..91e19c0 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -104,7 +104,7 @@ def uniq(objs): # csort -def csort(objs, key=lambda x: x): +def csort(objs, key): """Order-preserving sorting function.""" idxs = dict((obj, i) for (i, obj) in enumerate(objs)) return sorted(objs, key=lambda obj: (key(obj), idxs[obj])) diff --git a/samples/nonfree/f1040nr.html.ref b/samples/nonfree/f1040nr.html.ref index 53f117c..e0d71fb 100644 --- a/samples/nonfree/f1040nr.html.ref +++ b/samples/nonfree/f1040nr.html.ref @@ -269,9 +269,9 @@
Taxable amount (see page 13)

22 -
-

+
+



diff --git a/samples/nonfree/f1040nr.xml.ref b/samples/nonfree/f1040nr.xml.ref index 533d951..914dab6 100644 --- a/samples/nonfree/f1040nr.xml.ref +++ b/samples/nonfree/f1040nr.xml.ref @@ -4976,9 +4976,9 @@ - - - + + + @@ -4990,9 +4990,9 @@ - - - + + + @@ -5876,8 +5876,8 @@ - + @@ -5891,8 +5891,8 @@ - + @@ -5932,11 +5932,11 @@ - + @@ -5964,16 +5964,16 @@ - + - - + +