From 548cdf94433b5fd09eaaa5fe304b415d036fa0c9 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Thu, 2 Apr 2009 14:22:19 +0000 Subject: [PATCH] bugfix git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@82 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 4 +++- pdflib/pdf2txt.py | 2 +- pdflib/pdfinterp.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.html b/README.html index 6501741..b56092b 100644 --- a/README.html +++ b/README.html @@ -14,10 +14,11 @@ Python PDF parser and analyzer

Homepage +Recent Changes

-Last Modified: Mon Mar 30 00:30:36 JST 2009 +Last Modified: Thu Apr 2 08:21:56 JST 2009
@@ -45,6 +46,7 @@ which could be useful for analyzing the document.
  • PDF to HTML conversion (with a sample converter web app).
  • Outline (TOC) extraction.
  • Tagged contents extraction. +
  • Infer the text flow by using a clustering technique. diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 243c7e4..6fefb9f 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -124,7 +124,7 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False): + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False): PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) self.pagenum = pagenum if cluster_margin == None: diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 9681c38..cd147ef 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -153,9 +153,9 @@ class PDFContentParser(PSStackParser): c = self.buf[self.charpos] data += c self.charpos += 1 - if i >= len(target) and c.isspace(): + if len(target) <= i and c.isspace(): i += 1 - elif c == target[i]: + elif i < len(target) and c == target[i]: i += 1 else: i = 0