bugfix: guard the target index in the PDFContentParser scan loop; default TextConverter pagenum to False

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@82 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-04-02 14:22:19 +00:00 · 2009-04-02 14:22:19 +00:00 · 548cdf9443
parent 8b1e38295d
commit 548cdf9443
3 changed files with 6 additions and 4 deletions
--- a/README.html
+++ b/README.html
@@ -14,10 +14,11 @@ Python PDF parser and analyzer

 <p>
 <a href="http://www.unixuser.org/~euske/python/pdfminer/index.html">Homepage</a>
+<a href="#changes">Recent Changes</a>

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Mon Mar 30 00:30:36 JST 2009
+Last Modified: Thu Apr  2 08:21:56 JST 2009
 <!-- hhmts end -->
 </div>

@@ -45,6 +46,7 @@ which could be useful for analyzing the document.
 <li> PDF to HTML conversion (with a sample converter web app).
 <li> Outline (TOC) extraction.
 <li> Tagged contents extraction.
+<li> Infer the text flow by using a clustering technique.
 </ul>

 <a name="source"></a>
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -124,7 +124,7 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
    self.pagenum = pagenum
    if cluster_margin == None:
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -153,9 +153,9 @@ class PDFContentParser(PSStackParser):
        c = self.buf[self.charpos]
        data += c
        self.charpos += 1
-        if i >= len(target) and c.isspace():
+        if len(target) <= i and c.isspace():
          i += 1
-        elif c == target[i]:
+        elif i < len(target) and c == target[i]:
          i += 1
        else:
          i = 0