bugfix
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@82 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
8b1e38295d
commit
548cdf9443
|
@ -14,10 +14,11 @@ Python PDF parser and analyzer
|
|||
|
||||
<p>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/index.html">Homepage</a>
|
||||
<a href="#changes">Recent Changes</a>
|
||||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Mon Mar 30 00:30:36 JST 2009
|
||||
Last Modified: Thu Apr 2 08:21:56 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -45,6 +46,7 @@ which could be useful for analyzing the document.
|
|||
<li> PDF to HTML conversion (with a sample converter web app).
|
||||
<li> Outline (TOC) extraction.
|
||||
<li> Tagged contents extraction.
|
||||
<li> Infer the text running order by using a clustering technique.
|
||||
</ul>
|
||||
|
||||
<a name="source"></a>
|
||||
|
|
|
@ -124,7 +124,7 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||
self.pagenum = pagenum
|
||||
if cluster_margin == None:
|
||||
|
|
|
@ -153,9 +153,9 @@ class PDFContentParser(PSStackParser):
|
|||
c = self.buf[self.charpos]
|
||||
data += c
|
||||
self.charpos += 1
|
||||
if i >= len(target) and c.isspace():
|
||||
if len(target) <= i and c.isspace():
|
||||
i += 1
|
||||
elif c == target[i]:
|
||||
elif i < len(target) and c == target[i]:
|
||||
i += 1
|
||||
else:
|
||||
i = 0
|
||||
|
|
Loading…
Reference in New Issue