documentation fix

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-07-11 12:42:12 +00:00 · 2009-07-11 12:42:12 +00:00 · 787ae4f814
parent 97dd4dda5e
commit 787ae4f814
5 changed files with 146 additions and 121 deletions
--- a/README.html
+++ b/README.html
@ -18,7 +18,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat May 23 10:06:04 JST 2009
+Last Modified: Sat Jun 20 19:51:02 JST 2009
 <!-- hhmts end -->
 </div>

@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>pdf2txt.py samples/simple1.pdf</strong>
-&lt;html&gt;&lt;head&gt;
-&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
-&lt;/head&gt;&lt;body&gt;
-&lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
-&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
-&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
-&lt;/body&gt;&lt;/html&gt;
+
+
+Hello
+
+World
+
+ Hello  World
 </pre></blockquote>
 <li> Done!
 </ol>
@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
 <p>
 Examples:
 <blockquote><pre>
-$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf &gt; output.html</strong>
+$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
 (extract text as an HTML file whose filename is output.html)

-$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf &gt; output.html</strong>
+$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
 (extract a Japanese HTML file in vertical writing, CMap is required)

-$ <strong>pdf2txt.py -P mypassword -t text secret.pdf &gt; output.txt</strong>
+$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
 (extract text from an encrypted PDF file)
 </pre></blockquote>

@ -175,7 +170,7 @@ Options:
 <dl>
 <dt> <code>-o <em>filename</em></code> 
 <dd> Specifies the output file name.
-By default, it prints the extracted contents to stdout.
+By default, it prints the extracted contents to stdout in text format.
 <p>
 <dt> <code>-p <em>pageno[,pageno,...]</em></code> 
 <dd> Specifies the comma-separated list of the page numbers to be extracted. 
@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
 Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
+<dt> <code>-T <em>cluster_margin</em></code> 
+<dd> 
+<p>
+<dt> <code>-W <em>word_margin</em></code> 
+<dd> 
+<p>
+<dt> <code>-s <em>scale</em></code> 
+<dd> 
+<p>
+<dt> <code>-m <em>maxpages</em></code> 
+<dd> 
+<p>
 <dt> <code>-P <em>password</em></code> 
 <dd> Provides the user password to open the PDF file.
 <p>
+<dt> <code>-C <em>CMap directory</em></code> 
+<dd> 
+<p>
 <dt> <code>-d</code> 
 <dd> Increases the debug level.
 </dl>
@ -231,7 +241,10 @@ Options:
 <dd> Instructs to dump all the objects.
 By default, it only prints the document trailer (like a header).
 <p>
-<dt> <code>-p <em>pageno</em></code> 
+<dt> <code>-i <em>objno,objno, ...</em></code> 
+<dd> 
+<p>
+<dt> <code>-p <em>pageno,pageno, ...</em></code> 
 <dd> Specifies the page number to be extracted.
 Multiple <code>-p</code> options are allowed.
 Note that page numbers start from one.
@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
 <dt> <code>-P <em>password</em></code> 
 <dd> Provides the user password to open the PDF file.
 <p>
+<dt> <code>-T</code> 
+<dd> 
+<p>
 <dt> <code>-d</code> 
 <dd> Increases the debug level.
 </dl>
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
 ##
 class PDFPageAggregator(PDFDevice):

-  def __init__(self, rsrc, pageno=1, cluster_margin=None):
+  def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
    PDFDevice.__init__(self, rsrc)
-    self.cluster_margin = cluster_margin
+    self.char_margin = char_margin
+    self.line_margin = line_margin
    self.undefined_char = '?'
    self.pageno = pageno
    self.stack = []
@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
    assert isinstance(self.cur_item, LTPage)
    self.cur_item.fixate()
    self.pageno += 1
-    if self.cluster_margin:
-      self.cur_item.group_text(self.cluster_margin)
+    if self.char_margin != None and self.line_margin != None:
+      self.cur_item.group_text(self.char_margin, self.line_margin)
    return self.cur_item

  def begin_figure(self, name, bbox, matrix):
@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
 ##
 class PDFConverter(PDFPageAggregator):
  
-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
-    PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None):
+    PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
+                               char_margin=char_margin, line_margin=line_margin)
    self.outfp = outfp
    self.codec = codec
    self.word_margin = word_margin
@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
 ##
 class HTMLConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None, 
               scale=1, showpageno=True, pagepad=50):
-    PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+                          char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
    self.showpageno = showpageno
    self.pagepad = pagepad
    self.scale = scale
@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
        if self.debug:
          self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
      elif isinstance(item, LTAnon):
-        self.write(item.text)
+        pass
      elif isinstance(item, LTLine) or isinstance(item, LTRect):
        self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
      elif isinstance(item, LTTextBox):
@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
-               showpageno=False, word_margin=None):
-    PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+  def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+               char_margin=None, line_margin=None, word_margin=None, 
+               showpageno=False):
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+                          char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
    self.showpageno = showpageno
    return
  
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
 ##  It performs binary search so that the processing time
 ##  should be around O(log n).
 ##
-def bsearch(objs, v0, v1):
-  if v1 <= v0: return []
+def bsearch(objs, v0):
  i0 = 0
-  i1 = len(objs)-1
-  while i0 <= i1:
+  i1 = len(objs)
+  while i0 < i1:
    i = (i0+i1)/2
-    assert 0 <= i and i < len(objs)
    (v, obj) = objs[i]
-    if v < v0:
-      i0 = i+1
-    elif v1 < v:
-      i1 = i-1
-    else:
-      i0 = i
-      while 0 < i0:
-        (v,_) = objs[i0-1]
-        if v < v0: break
-        i0 -= 1
+    if v0 == v:
+      (i0,i1) = (i,i+1)
+      break
+    elif v0 < v:
      i1 = i
-      while i1 < len(objs)-1:
-        (v,_) = objs[i1+1]
-        if v1 < v: break
-        i1 += 1
-      return [ obj for (_,obj) in objs[i0:i1+1] ]
-  return []
+    else:
+      i0 = i+1
+  return (i0,i1)


 ##  reorder_hv, reorder_vh
@ -63,7 +52,9 @@ def reorder_vh(objs, hdir):
  r = []
  line = []
  for obj in sorted(objs, key=vkey):
-    if line and not line[-1].voverlap(obj):
+    if line:
+      v = line[-1].voverlap(obj) * 2
+      if v < obj.height or v < line[-1].height:
        line.sort(key=hkey)
        r.append(line)
        line = []
@ -106,7 +97,8 @@ class Plane(object):
    self.yobjs = []
    for obj in objs:
      self.place(obj)
-    self.fixate()
+    self.xobjs.sort()
+    self.yobjs.sort()
    return

  # place(obj): place an object in a certain area.
@ -118,16 +110,14 @@ class Plane(object):
    self.yobjs.append((obj.y1, obj))
    return

-  # fixate(): you must call this after adding all objects.
-  def fixate(self):
-    self.xobjs.sort()
-    self.yobjs.sort()
-    return
-
  # find(): finds objects that are in a certain area.
  def find(self, (x0,y0,x1,y1)):
-    xobjs = set(bsearch(self.xobjs, x0, x1))
-    yobjs = set(bsearch(self.yobjs, y0, y1))
+    (i0,_) = bsearch(self.xobjs, x0)
+    (_,i1) = bsearch(self.xobjs, x1)
+    xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
+    (i0,_) = bsearch(self.yobjs, y0)
+    (_,i1) = bsearch(self.yobjs, y1)
+    yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
    objs = xobjs.intersection(yobjs)
    return objs

@ -166,12 +156,14 @@ class ClusterSet(object):
      group.fixate()
    return list(r)

-def group_objs(objs, ratio, klass):
+def group_objs(objs, hratio, vratio, klass):
  plane = Plane(objs)
  cset = ClusterSet(klass)
  for obj in objs:
-    margin = abs(obj.get_margin(ratio))
-    neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
+    margin = obj.get_margin()
+    hmargin = hratio * margin
+    vmargin = vratio * margin
+    neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
    cset.add(neighbors)
  return cset.finish()

@ -214,7 +206,7 @@ class LayoutItem(object):
  def get_bbox(self):
    return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
  
-  def get_margin(self, ratio):
+  def get_margin(self):
    return 0

  def get_weight(self):
@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
    return

   # fixate(): determines its boundary and writing direction.
-  def fixate(self):
+  def fixate(self, direction=None):
    if not self.width and self.objs:
      (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
      for obj in self.objs:
@ -354,8 +346,8 @@ class LTText(LayoutItem):
             '(%.1f, %.1f)' % self.adv,
             self.text))

-  def get_margin(self, ratio):
-    return self.fontsize * ratio
+  def get_margin(self):
+    return abs(self.fontsize)

  def get_weight(self):
    return len(self.text)
@ -392,12 +384,12 @@ class LTTextBox(LayoutContainer):
  def __repr__(self):
    return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))

-  def fixate(self):
-    LayoutContainer.fixate(self)
-    self.direction = 'H'
+  def fixate(self, direction='H'):
+    LayoutContainer.fixate(self, direction=direction)
+    if not direction:
      for obj in self.objs:
        if obj.is_vertical():
-        self.direction = 'V'
+          direction = 'V'
        break
      if 2 <= len(self.objs):
        objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
@ -405,11 +397,12 @@ class LTTextBox(LayoutContainer):
          h = objs[0].voverlap(objs[1])
          v = objs[0].hoverlap(objs[1])
          if h < v:
-          self.direction = 'V'
-    if self.direction == 'H':
-      self.lines = reorder_vh(self.objs, +1)
-    else:
+            direction = 'V'
+    self.direction = direction
+    if self.direction == 'V':
      self.lines = reorder_hv(self.objs, -1)
+    else:
+      self.lines = reorder_vh(self.objs, +1)
    self.objs = []
    for line in self.lines:
      self.objs.extend(line)
@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
  def get_direction(self):
    return self.direction

-  def get_lines(self, ratio):
-    if self.get_direction() == 'H':
-      for line in self.lines:
-        x1 = INF
-        for obj in line:
-          if not isinstance(obj, LTText): continue
-          if ratio:
-            margin = obj.get_margin(ratio)
-            if x1 < obj.x0-margin:
-              yield LTAnon(' ')
-          yield obj
-          x1 = obj.x1
-        yield LTAnon('\n')
-    else:
+  def get_lines(self, word_margin):
+    if self.get_direction() == 'V':
      for line in self.lines:
        y0 = -INF
        for obj in line:
          if not isinstance(obj, LTText): continue
-          if ratio:
-            margin = obj.get_margin(ratio)
+          if word_margin:
+            margin = word_margin * obj.get_margin()
            if obj.y1+margin < y0:
              yield LTAnon(' ')
          yield obj
          y0 = obj.y0
        yield LTAnon('\n')
+    else:
+      for line in self.lines:
+        x1 = INF
+        for obj in line:
+          if not isinstance(obj, LTText): continue
+          if word_margin:
+            margin = word_margin * obj.get_margin()
+            if x1 < obj.x0-margin:
+              yield LTAnon(' ')
+          yield obj
+          x1 = obj.x1
+        yield LTAnon('\n')
    return


@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
  def __repr__(self):
    return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))

-  def fixate(self):
+  # fixate(): does nothing for a page; the `direction` keyword is accepted
+  # only so the signature matches LayoutContainer.fixate() and
+  # LTTextBox.fixate(), which both name this parameter `direction`.
+  def fixate(self, direction='H'):
     return

-  def group_text(self, ratio):
+  def group_text(self, char_margin, line_margin):
    textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
-    otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
-    self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
-    if self.get_direction() == 'H':
-      lines = reorder_vh(self.objs, +1)
+    objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
+    if self.get_direction() == 'V':
+      objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
+      lines = reorder_hv(objs, -1)
    else:
-      lines = reorder_hv(self.objs, -1)
+      objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
+      lines = reorder_vh(objs, +1)
    self.objs = []
    for line in lines:
      self.objs.extend(line)
--- a/setup.py
+++ b/setup.py
@ -1,8 +1,9 @@
 #!/usr/bin/env python
 from distutils.core import setup
+from pdfminer import __version__

 setup(name='pdfminer',
-      version='20090330',
+      version=__version__,
      description='PDF parser and analyzer',
      license='MIT/X',
      author='Yusuke Shinyama',
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
 def main(argv):
  import getopt
  def usage():
-    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
+    print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
+           '[-M char_margin] [-L line_margin] [-W word_margin] '
+           '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
    return 100
  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
+    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
@ -29,7 +31,8 @@ def main(argv):
  outfile = None
  outtype = None
  codec = 'utf-8'
-  cluster_margin = 0.5
+  char_margin = 1.0
+  line_margin = 0.3
  word_margin = 0.2
  pageno = 1
  scale = 1
@ -44,7 +47,8 @@ def main(argv):
    elif k == '-c': codec = v
    elif k == '-o': outfile = v
    elif k == '-s': scale = float(v)
-    elif k == '-T': cluster_margin = float(v)
+    elif k == '-M': char_margin = float(v)
+    elif k == '-L': line_margin = float(v)
    elif k == '-W': word_margin = float(v)
  #
  CMapDB.debug = debug
@ -69,12 +73,15 @@ def main(argv):
    outfp = file(outfile, 'w')
  else:
    outfp = sys.stdout
-  if outtype == 'sgml':
-    device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+  if outtype == 'text':
+    device = TextConverter(rsrc, outfp, codec=codec, 
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+  elif outtype == 'sgml':
+    device = SGMLConverter(rsrc, outfp, codec=codec,
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
  elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
-  elif outtype == 'text':
-    device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+    device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
+                           char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
  elif outtype == 'tag':
    device = TagExtractor(rsrc, outfp, codec=codec)
  else: