20090330 release

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@80 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-03-29 15:14:23 +00:00 · 2009-03-29 15:14:23 +00:00 · 37a6a0450d
parent 68cc99379d
commit 37a6a0450d
3 changed files with 18 additions and 10 deletions
--- a/Makefile
+++ b/Makefile
@ -1,7 +1,7 @@
 # Makefile for pdfminer

 PACKAGE=pdfminer
-VERSION=20090325
+VERSION=20090330
 GNUTAR=tar
 SVN=svn
 PYTHON=python
--- a/README.html
+++ b/README.html
@ -17,7 +17,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Mar 29 19:09:46 JST 2009
+Last Modified: Mon Mar 30 00:13:34 JST 2009
 <!-- hhmts end -->
 </div>

@ -50,8 +50,8 @@ which could be useful for analyzing the document.
 <a name="source"></a>
 <p>
 <strong>Download:</strong><br>
-<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz">
-http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz
+<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
+http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
 </a>
 (1.8Mbytes)

@ -274,6 +274,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2009/03/30: Text output mode added.
 <li> 2009/03/25: Encoding problems fixed. Word splitting option added.
 <li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
 <li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -66,7 +66,7 @@ class SGMLConverter(PDFConverter):
 ##
 class HTMLConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
    self.pagenum = pagenum
    self.pagepad = pagepad
@ -124,16 +124,19 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
    self.pagenum = pagenum
+    if cluster_margin == None:
+      cluster_margin = 0.5
    self.cluster_margin = cluster_margin
-    self.hyphenation = hyphenation
    return
  
  def end_page(self, page):
    from cluster import cluster_pageobjs
    page = PDFConverter.end_page(self, page)
+    if self.pagenum:
+      self.outfp.write('Page %d\n' % page.id)
    if self.cluster_margin:
      textobjs = get_textobjs(page)
      clusters = cluster_pageobjs(textobjs, self.cluster_margin)
@ -152,6 +155,7 @@ class TextConverter(PDFConverter):
        if isinstance(item, TextItem):
          self.outfp.write(item.text.encode(self.codec, 'replace'))
          self.outfp.write('\n')
+    self.outfp.write('\f')
    return

  def close(self):
@ -246,7 +250,7 @@ def main(argv):
    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
    return 100
  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
+    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
@ -258,7 +262,9 @@ def main(argv):
  maxpages = 0
  outtype = 'html'
  password = ''
+  pagenum = True
  splitwords = False
+  cluster_margin = None
  outfp = sys.stdout
  for (k, v) in opts:
    if k == '-d': debug += 1
@ -268,6 +274,7 @@ def main(argv):
    elif k == '-m': maxpages = int(v)
    elif k == '-C': cmapdir = v
    elif k == '-D': cdbcmapdir = v
+    elif k == '-T': cluster_margin = float(v)
    elif k == '-t': outtype = v
    elif k == '-o': outfp = file(v, 'wb')
    elif k == '-w': splitwords = True
@ -283,9 +290,9 @@ def main(argv):
  if outtype == 'sgml':
    device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
  elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
+    device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
  elif outtype == 'text':
-    device = TextConverter(rsrc, outfp, codec=codec)
+    device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
  elif outtype == 'tag':
    device = TagExtractor(rsrc, outfp, codec=codec)
  else: