diff --git a/README.html b/README.html index 6501741..b56092b 100644 --- a/README.html +++ b/README.html @@ -14,10 +14,11 @@ Python PDF parser and analyzer

Homepage +Recent Changes

-Last Modified: Mon Mar 30 00:30:36 JST 2009 +Last Modified: Thu Apr 2 08:21:56 JST 2009
@@ -45,6 +46,7 @@ which could be useful for analyzing the document.
  • PDF to HTML conversion (with a sample converter web app).
  • Outline (TOC) extraction.
  • Tagged contents extraction. +
  • Infer text running by using clustering technique. diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 243c7e4..6fefb9f 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -124,7 +124,7 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False): + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False): PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) self.pagenum = pagenum if cluster_margin == None: diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 9681c38..cd147ef 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -153,9 +153,9 @@ class PDFContentParser(PSStackParser): c = self.buf[self.charpos] data += c self.charpos += 1 - if i >= len(target) and c.isspace(): + if len(target) <= i and c.isspace(): i += 1 - elif c == target[i]: + elif i < len(target) and c == target[i]: i += 1 else: i = 0