From 787ae4f81418953f04b1a2f03d744b6244c754ed Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 11 Jul 2009 12:42:12 +0000
Subject: [PATCH] documentation fix

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 README.html           |  52 +++++++++-----
 pdfminer/converter.py |  31 +++++----
 pdfminer/layout.py    | 156 ++++++++++++++++++++----------------------
 setup.py              |   3 +-
 tools/pdf2txt.py      |  25 ++++---
 5 files changed, 146 insertions(+), 121 deletions(-)
diff --git a/README.html b/README.html
index 53ad545..da23724 100644
--- a/README.html
+++ b/README.html
@@ -18,7 +18,7 @@ Python PDF parser and analyzer
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat May 23 10:06:04 JST 2009
+Last Modified: Sat Jun 20 19:51:02 JST 2009
 <!-- hhmts end -->
 </div>
 
@@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>pdf2txt.py samples/simple1.pdf</strong>
-&lt;html&gt;&lt;head&gt;
-&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
-&lt;/head&gt;&lt;body&gt;
-&lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
-&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
-&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
-&lt;/body&gt;&lt;/html&gt;
+
+
+Hello
+
+World
+
+ Hello  World
 </pre></blockquote>
 <li> Done!
 </ol>
@@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
 <p>
 Examples:
 <blockquote><pre>
-$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf &gt; output.html</strong>
+$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
 (extract text as an HTML file whose filename is output.html)
 
-$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf &gt; output.html</strong>
+$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
 (extract a Japanese HTML file in vertical writing, CMap is required)
 
-$ <strong>pdf2txt.py -P mypassword -t text secret.pdf &gt; output.txt</strong>
+$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
 (extract a text from an encrypted PDF file)
 </pre></blockquote>
 
@@ -175,7 +170,7 @@ Options:
 <dl>
 <dt> <code>-o <em>filename</em></code> 
 <dd> Specifies the output file name.
-By default, it prints the extracted contents to stdout.
+By default, it prints the extracted contents to stdout in text format.
 <p>
 <dt> <code>-p <em>pageno[,pageno,...]</em></code> 
 <dd> Specifies the comma-separated list of the page numbers to be extracted. 
@@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
 Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
+<dt> <code>-M <em>char_margin</em></code>, <code>-L <em>line_margin</em></code> 
+<dd> 
+<p>
+<dt> <code>-W <em>word_margin</em></code> 
+<dd> 
+<p>
+<dt> <code>-s <em>scale</em></code> 
+<dd> 
+<p>
+<dt> <code>-m <em>maxpages</em></code> 
+<dd> 
+<p>
 <dt> <code>-P <em>password</em></code> 
 <dd> Provides the user password to open the PDF file.
 <p>
+<dt> <code>-C <em>CMap directory</em></code> 
+<dd> 
+<p>
 <dt> <code>-d</code> 
 <dd> Increases the debug level.
 </dl>
@@ -231,7 +241,10 @@ Options:
 <dd> Instructs to dump all the objects.
 By default, it only prints the document trailer (like a header).
 <p>
-<dt> <code>-p <em>pageno</em></code> 
+<dt> <code>-i <em>objno,objno, ...</em></code> 
+<dd> 
+<p>
+<dt> <code>-p <em>pageno,pageno, ...</em></code> 
 <dd> Specifies the page number to be extracted.
 Multiple <code>-p</code> options are allowed.
 Note that page numbers start from one.
@@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
 <dt> <code>-P <em>password</em></code> 
 <dd> Provides the user password to open the PDF file.
 <p>
+<dt> <code>-T</code> 
+<dd> 
+<p>
 <dt> <code>-d</code> 
 <dd> Increases the debug level.
 </dl>
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 666c10d..9cf86d5 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
 ##
 class PDFPageAggregator(PDFDevice):
 
-  def __init__(self, rsrc, pageno=1, cluster_margin=None):
+  def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
     PDFDevice.__init__(self, rsrc)
-    self.cluster_margin = cluster_margin
+    self.char_margin = char_margin
+    self.line_margin = line_margin
     self.undefined_char = '?'
     self.pageno = pageno
     self.stack = []
@@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
     assert isinstance(self.cur_item, LTPage)
     self.cur_item.fixate()
     self.pageno += 1
-    if self.cluster_margin:
-      self.cur_item.group_text(self.cluster_margin)
+    if self.char_margin != None and self.line_margin != None:
+      self.cur_item.group_text(self.char_margin, self.line_margin)
     return self.cur_item
 
   def begin_figure(self, name, bbox, matrix):
@@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
 ##
 class PDFConverter(PDFPageAggregator):
   
-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
-    PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None):
+    PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
+                               char_margin=char_margin, line_margin=line_margin)
     self.outfp = outfp
     self.codec = codec
     self.word_margin = word_margin
@@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
 ##
 class HTMLConverter(PDFConverter):
 
-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None, 
                scale=1, showpageno=True, pagepad=50):
-    PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+                          char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
     self.showpageno = showpageno
     self.pagepad = pagepad
     self.scale = scale
@@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
         if self.debug:
           self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
       elif isinstance(item, LTAnon):
-        self.write(item.text)
+        pass
       elif isinstance(item, LTLine) or isinstance(item, LTRect):
         self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
       elif isinstance(item, LTTextBox):
@@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):
 
-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
-               showpageno=False, word_margin=None):
-    PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None, 
+               showpageno=False):
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+                          char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
     self.showpageno = showpageno
     return
   
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index cb2598c..7e575cf 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
 ##  It performs binary search so that the processing time
 ##  should be around O(log n).
 ##
-def bsearch(objs, v0, v1):
-  if v1 <= v0: return []
+def bsearch(objs, v0):
   i0 = 0
-  i1 = len(objs)-1
-  while i0 <= i1:
+  i1 = len(objs)
+  while i0 < i1:
     i = (i0+i1)/2
-    assert 0 <= i and i < len(objs)
     (v, obj) = objs[i]
-    if v < v0:
-      i0 = i+1
-    elif v1 < v:
-      i1 = i-1
-    else:
-      i0 = i
-      while 0 < i0:
-        (v,_) = objs[i0-1]
-        if v < v0: break
-        i0 -= 1
+    if v0 == v:
+      (i0,i1) = (i,i+1)
+      break
+    elif v0 < v:
       i1 = i
-      while i1 < len(objs)-1:
-        (v,_) = objs[i1+1]
-        if v1 < v: break
-        i1 += 1
-      return [ obj for (_,obj) in objs[i0:i1+1] ]
-  return []
+    else:
+      i0 = i+1
+  return (i0,i1)
 
 
 ##  reorder_hv, reorder_vh
@@ -63,10 +52,12 @@ def reorder_vh(objs, hdir):
   r = []
   line = []
   for obj in sorted(objs, key=vkey):
-    if line and not line[-1].voverlap(obj):
-      line.sort(key=hkey)
-      r.append(line)
-      line = []
+    if line:
+      v = line[-1].voverlap(obj) * 2
+      if v < obj.height or v < line[-1].height:
+        line.sort(key=hkey)
+        r.append(line)
+        line = []
     line.append(obj)
   line.sort(key=hkey)
   r.append(line)
@@ -106,7 +97,8 @@ class Plane(object):
     self.yobjs = []
     for obj in objs:
       self.place(obj)
-    self.fixate()
+    self.xobjs.sort()
+    self.yobjs.sort()
     return
 
   # place(obj): place an object in a certain area.
@@ -118,16 +110,14 @@ class Plane(object):
     self.yobjs.append((obj.y1, obj))
     return
 
-  # fixate(): you must call this after adding all objects.
-  def fixate(self):
-    self.xobjs.sort()
-    self.yobjs.sort()
-    return
-
   # find(): finds objects that are in a certain area.
   def find(self, (x0,y0,x1,y1)):
-    xobjs = set(bsearch(self.xobjs, x0, x1))
-    yobjs = set(bsearch(self.yobjs, y0, y1))
+    (i0,_) = bsearch(self.xobjs, x0)
+    (_,i1) = bsearch(self.xobjs, x1)
+    xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
+    (i0,_) = bsearch(self.yobjs, y0)
+    (_,i1) = bsearch(self.yobjs, y1)
+    yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
     objs = xobjs.intersection(yobjs)
     return objs
 
@@ -166,12 +156,14 @@ class ClusterSet(object):
       group.fixate()
     return list(r)
 
-def group_objs(objs, ratio, klass):
+def group_objs(objs, hratio, vratio, klass):
   plane = Plane(objs)
   cset = ClusterSet(klass)
   for obj in objs:
-    margin = abs(obj.get_margin(ratio))
-    neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
+    margin = obj.get_margin()
+    hmargin = hratio * margin
+    vmargin = vratio * margin
+    neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
     cset.add(neighbors)
   return cset.finish()
 
@@ -214,7 +206,7 @@ class LayoutItem(object):
   def get_bbox(self):
     return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
   
-  def get_margin(self, ratio):
+  def get_margin(self):
     return 0
 
   def get_weight(self):
@@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
     return
 
   # fixate(): determines its boundery and writing direction.
-  def fixate(self):
+  def fixate(self, direction=None):
     if not self.width and self.objs:
       (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
       for obj in self.objs:
@@ -354,8 +346,8 @@ class LTText(LayoutItem):
              '(%.1f, %.1f)' % self.adv,
              self.text))
 
-  def get_margin(self, ratio):
-    return self.fontsize * ratio
+  def get_margin(self):
+    return abs(self.fontsize)
 
   def get_weight(self):
     return len(self.text)
@@ -392,24 +384,25 @@ class LTTextBox(LayoutContainer):
   def __repr__(self):
     return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
 
-  def fixate(self):
-    LayoutContainer.fixate(self)
-    self.direction = 'H'
-    for obj in self.objs:
-      if obj.is_vertical():
-        self.direction = 'V'
-      break
-    if 2 <= len(self.objs):
-      objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
-      if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
-        h = objs[0].voverlap(objs[1])
-        v = objs[0].hoverlap(objs[1])
-        if h < v:
-          self.direction = 'V'
-    if self.direction == 'H':
-      self.lines = reorder_vh(self.objs, +1)
-    else:
+  def fixate(self, direction='H'):
+    LayoutContainer.fixate(self, direction=direction)
+    if not direction:
+      for obj in self.objs:
+        if obj.is_vertical():
+          direction = 'V'
+        break
+      if 2 <= len(self.objs):
+        objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
+        if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
+          h = objs[0].voverlap(objs[1])
+          v = objs[0].hoverlap(objs[1])
+          if h < v:
+            direction = 'V'
+    self.direction = direction
+    if self.direction == 'V':
       self.lines = reorder_hv(self.objs, -1)
+    else:
+      self.lines = reorder_vh(self.objs, +1)
     self.objs = []
     for line in self.lines:
       self.objs.extend(line)
@@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
   def get_direction(self):
     return self.direction
 
-  def get_lines(self, ratio):
-    if self.get_direction() == 'H':
-      for line in self.lines:
-        x1 = INF
-        for obj in line:
-          if not isinstance(obj, LTText): continue
-          if ratio:
-            margin = obj.get_margin(ratio)
-            if x1 < obj.x0-margin:
-              yield LTAnon(' ')
-          yield obj
-          x1 = obj.x1
-        yield LTAnon('\n')
-    else:
+  def get_lines(self, word_margin):
+    if self.get_direction() == 'V':
       for line in self.lines:
         y0 = -INF
         for obj in line:
           if not isinstance(obj, LTText): continue
-          if ratio:
-            margin = obj.get_margin(ratio)
+          if word_margin:
+            margin = word_margin * obj.get_margin()
             if obj.y1+margin < y0:
               yield LTAnon(' ')
           yield obj
           y0 = obj.y0
         yield LTAnon('\n')
+    else:
+      for line in self.lines:
+        x1 = INF
+        for obj in line:
+          if not isinstance(obj, LTText): continue
+          if word_margin:
+            margin = word_margin * obj.get_margin()
+            if x1 < obj.x0-margin:
+              yield LTAnon(' ')
+          yield obj
+          x1 = obj.x1
+        yield LTAnon('\n')
     return
 
 
@@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
   def __repr__(self):
     return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
 
-  def fixate(self):
+  def fixate(self, direction='H'):
     return
 
-  def group_text(self, ratio):
+  def group_text(self, char_margin, line_margin):
     textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
-    otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
-    self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
-    if self.get_direction() == 'H':
-      lines = reorder_vh(self.objs, +1)
+    objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
+    if self.get_direction() == 'V':
+      objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
+      lines = reorder_hv(objs, -1)
     else:
-      lines = reorder_hv(self.objs, -1)
+      objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
+      lines = reorder_vh(objs, +1)
     self.objs = []
     for line in lines:
       self.objs.extend(line)
diff --git a/setup.py b/setup.py
index 3ea37f9..8ab539f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python
 from distutils.core import setup
+from pdfminer import __version__
 
 setup(name='pdfminer',
-      version='20090330',
+      version=__version__,
       description='PDF parser and analyzer',
       license='MIT/X',
       author='Yusuke Shinyama',
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index c44eb01..6ad95e6 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
 def main(argv):
   import getopt
   def usage():
-    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
+    print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
+           '[-M char_margin] [-L line_margin] [-W word_margin] '
+           '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
     return 100
   try:
-    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
+    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:s:C:D:m:')
   except getopt.GetoptError:
     return usage()
   if not args: return usage()
@@ -29,7 +31,8 @@ def main(argv):
   outfile = None
   outtype = None
   codec = 'utf-8'
-  cluster_margin = 0.5
+  char_margin = 1.0
+  line_margin = 0.3
   word_margin = 0.2
   pageno = 1
   scale = 1
@@ -44,7 +47,8 @@ def main(argv):
     elif k == '-c': codec = v
     elif k == '-o': outfile = v
     elif k == '-s': scale = float(v)
-    elif k == '-T': cluster_margin = float(v)
+    elif k == '-M': char_margin = float(v)
+    elif k == '-L': line_margin = float(v)
     elif k == '-W': word_margin = float(v)
   #
   CMapDB.debug = debug
@@ -69,12 +73,15 @@ def main(argv):
     outfp = file(outfile, 'w')
   else:
     outfp = sys.stdout
-  if outtype == 'sgml':
-    device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+  if outtype == 'text':
+    device = TextConverter(rsrc, outfp, codec=codec, 
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+  elif outtype == 'sgml':
+    device = SGMLConverter(rsrc, outfp, codec=codec,
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
   elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
-  elif outtype == 'text':
-    device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+    device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
   elif outtype == 'tag':
     device = TagExtractor(rsrc, outfp, codec=codec)
   else: