From 0298e26acc4ec917782f2ba2314a464e84f7f63b Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 14 Nov 2009 11:29:40 +0000
Subject: [PATCH] speed-tweak.diff from Yannick Gingras

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@158 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 docs/index.html      |  9 ++++----
 pdfminer/layout.py   | 38 ++++++++++++++++++++++-----------
 pdfminer/psparser.py | 51 +++++++++++++++++++++++++++-----------------
 pdfminer/utils.py    | 10 +++++----
 4 files changed, 68 insertions(+), 40 deletions(-)
diff --git a/docs/index.html b/docs/index.html
index 1bf6fdb..02b5509 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Nov  7 18:11:40 JST 2009
+Last Modified: Fri Nov 13 19:12:36 JST 2009
 <!-- hhmts end -->
 </div>
 
@@ -41,8 +41,9 @@ Last Modified: Sat Nov  7 18:11:40 JST 2009
 <h2>What's It?</h2>
 <p>
 PDFMiner is a suite of programs that help
-extracting and analyzing text data of PDF documents.
-Unlike other PDF-related tools, it allows to obtain
+extracting some meaningful information out of PDF documents.
+Unlike other PDF-related tools, it focuses entirely on getting 
+and analyzing text data from PDFs. PDFMiner allows one to obtain
 the exact location of texts in a page, as well as 
 other extra information such as font information or ruled lines.
 It includes a PDF converter that can transform PDF files
@@ -59,7 +60,7 @@ PDF parser that can be used for other purposes instead of text analysis.
 <li> PDF to HTML conversion (with a sample converter web app).
 <li> Outline (TOC) extraction.
 <li> Tagged contents extraction.
-<li> Infer text running by using clustering technique.
+<li> Reconstruct the original layout by grouping text chunks.
 </ul>
 
 <a name="source"></a>
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 218856c..7a15ce7 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -57,14 +57,14 @@ class Plane(object):
 
     # find(): finds objects that are in a certain area.
     def find(self, (x0,y0,x1,y1)):
-        (i0,_) = bsearch(self.xobjs, x0)
-        (_,i1) = bsearch(self.xobjs, x1)
-        xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
-        (i0,_) = bsearch(self.yobjs, y0)
-        (_,i1) = bsearch(self.yobjs, y1)
-        yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
-        objs = xobjs.intersection(yobjs)
-        return objs
+        i0 = bsearch(self.xobjs, x0)[0]
+        i1 = bsearch(self.xobjs, x1)[1]
+        xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] )
+        i0 = bsearch(self.yobjs, y0)[0]
+        i1 = bsearch(self.yobjs, y1)[1]
+        yobjs = [pair[1] for pair in self.yobjs[i0:i1]]
+        xobjs.intersection_update(yobjs)
+        return xobjs
 
 
 ##  ClusterSet
@@ -139,6 +139,13 @@ class LayoutItem(object):
     def get_bbox(self):
         return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
 
+    def is_hoverlap(self, obj):
+        assert isinstance(obj, LayoutItem)
+        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
+            return False
+        else:
+            return True
+
     def hoverlap(self, obj):
         assert isinstance(obj, LayoutItem)
         if self.x1 <= obj.x0 or obj.x1 <= self.x0:
@@ -146,6 +153,13 @@ class LayoutItem(object):
         else:
             return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
 
+    def is_voverlap(self, obj):
+        assert isinstance(obj, LayoutItem)
+        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
+            return False
+        else:
+            return True
+
     def voverlap(self, obj):
         assert isinstance(obj, LayoutItem)
         if self.y1 <= obj.y0 or obj.y1 <= self.y0:
@@ -473,9 +487,9 @@ class LTPage(LayoutContainer):
             def vline(obj1, obj2):
                 return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
             def vorder(obj1, obj2):
-                if obj1.voverlap(obj2):
+                if obj1.is_voverlap(obj2):
                     return obj2.x1 < obj1.x0
-                elif obj1.hoverlap(obj2):
+                elif obj1.is_hoverlap(obj2):
                     return obj2.y1 < obj1.y0
                 else:
                     return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
@@ -489,9 +503,9 @@ class LTPage(LayoutContainer):
             def hline(obj1, obj2):
                 return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
             def horder(obj1, obj2):
-                if obj1.hoverlap(obj2):
+                if obj1.is_hoverlap(obj2):
                     return obj2.y1 < obj1.y0
-                elif obj1.voverlap(obj2):
+                elif obj1.is_voverlap(obj2):
                     return obj1.x1 < obj2.x0
                 else:
                     return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index aa69192..a2d98cf 100644
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -21,23 +21,28 @@ class PSValueError(PSException): pass
 
 ##  PSObject
 ##
-##  Base class for all PS or PDF-related data types.
-##
-class PSObject(object): pass
+class PSObject(object):
+
+    """Base class for all PS or PDF-related data types."""
+
+    pass
 
 
 ##  PSLiteral
 ##
-##  Postscript literals are used as identifiers, such as
-##  variable names, property names and dictionary keys.
-##  Literals are case sensitive and denoted by a preceding
-##  slash sign (e.g. "/Name")
-##
-##  Note: Never create an instance of PSLiteral by hand.
-##  Always use PSLiteralTable.intern().
-##
 class PSLiteral(PSObject):
 
+    """A class that represents a PostScript literal.
+    
+    Postscript literals are used as identifiers, such as
+    variable names, property names and dictionary keys.
+    Literals are case sensitive and denoted by a preceding
+    slash sign (e.g. "/Name")
+
+    Note: Do not create an instance of PSLiteral directly.
+    Always use PSLiteralTable.intern().
+    """
+
     def __init__(self, name):
         self.name = name
         return
@@ -48,11 +53,18 @@ class PSLiteral(PSObject):
 
 ##  PSKeyword
 ##
-##  Note: Never create an instance of PSLiteral by hand.
-##  Always use PSKeywordTable.intern().
-##
 class PSKeyword(PSObject):
 
+    """A class that represents a PostScript keyword.
+    
+    PostScript keywords are a dozen or so predefined words.
+    Commands and directives in PostScript are expressed by keywords.
+    They are also used to denote the content boundaries.
+    
+    Note: Do not create an instance of PSKeyword directly.
+    Always use PSKeywordTable.intern().
+    """
+
     def __init__(self, name):
         self.name = name
         return
@@ -63,14 +75,13 @@ class PSKeyword(PSObject):
 
 ##  PSSymbolTable
 ##
-##  A dictionary-like object that is used for
-##  storing PSLiteral/PSKeyword objects so that
-##  an object that has the same name can never be defined
-##  twice and it is always assured that the same name is
-##  referred to as the same PSLiteral/PSKeyword object.
-##
 class PSSymbolTable(object):
 
+    """A utility class for storing PSLiteral/PSKeyword objects.
+
+    The identity of interned objects can be checked with the "is" operator.
+    """
+    
     def __init__(self, klass):
         self.dic = {}
         self.klass = klass
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index e390f84..19799cc 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -13,6 +13,7 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
             a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
 
 def translate_matrix((a,b,c,d,e,f), (x,y)):
+    '''Translates a matrix by (x,y).'''
     return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
 
 def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
@@ -29,7 +30,7 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
 
 # pick
 def pick(seq, func, maxobj=None):
-    '''Picks the object that has the highest value of func(obj).'''
+    '''Picks the object obj where func(obj) has the highest value.'''
     maxscore = None
     for obj in seq:
         score = func(obj)
@@ -40,8 +41,9 @@ def pick(seq, func, maxobj=None):
 # bsearch
 def bsearch(objs, v0):
     '''Tries to find the closest value to v0.'''
+    nb_objs = len(objs)
     i0 = 0
-    i1 = len(objs)
+    i1 = nb_objs
     while i0 < i1:
         i = (i0+i1)/2
         (v, obj) = objs[i]
@@ -49,7 +51,7 @@ def bsearch(objs, v0):
             (i0,i1) = (i,i+1)
             while 0 < i0 and objs[i0-1][0] == v0:
                 i0 -= 1
-            while i1 < len(objs)-1 and objs[i1][0] == v0:
+            while i1 < nb_objs-1 and objs[i1][0] == v0:
                 i1 += 1
             break
         elif v0 < v:
@@ -71,7 +73,7 @@ def choplist(n, seq):
 
 # nunpack
 def nunpack(s, default=0):
-    '''Unpacks up to 4 bytes big endian.'''
+    '''Unpacks 1 to 4 byte integers (big endian).'''
     l = len(s)
     if not l:
         return default