From 77d7c9ae552420a93032047ae7b4c54b24cd7ef4 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 3 May 2008 04:10:59 +0000
Subject: [PATCH] PSEOF check

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 README.html  | 16 +++++++--------
 pdfparser.py | 58 +++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/README.html b/README.html
index 9d93c09..24669d3 100644
--- a/README.html
+++ b/README.html
@@ -11,13 +11,13 @@ blockquote { background: #eeeeee; }
 <h1>PDFMiner</h1>
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Apr 29 20:16:22 JST 2008
+Last Modified: Wed Apr 30 19:15:12 JST 2008
 <!-- hhmts end -->
 </div>
 
 <a name="intro"></a>
 <hr noshade>
-<h2>What's it?</h2>
+<h2>What's It?</h2>
 <p>
 PDFMiner is a suite of programs that aims to help
 extracting or analyzing text data from PDF documents.
@@ -42,6 +42,7 @@ It can be also used as a basis for a full-fledged PDF interpreter.
 http://www.unixuser.org/~euske/python/pdfminer/index.html
 </a>
 
+<a name="source"></a>
 <p>
 <strong>Download (source):</strong><br>
 <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
@@ -57,11 +58,10 @@ http://pdfminerr.googlecode.com/svn/
 
 <a name="install"></a>
 <hr noshade>
-<h2>Installation</h2>
-<p>
-Prerequisite: <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
+<h2>How to Install</h2>
 <ol>
-<li> Download the source.
+<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
+<li> Download the <a href="#source">PDFMiner source</a>.
 <li> Extract it.
 <li> Go to the <code>pdfminer</code> directory.
 <li> Do the following test:<br>
@@ -102,7 +102,7 @@ $ <strong>make cdbcmap</strong>
 
 <a name="usage"></a>
 <hr noshade>
-<h2>Usage</h2>
+<h2>How to Use</h2>
 
 <p>
 PDFMiner comes with two programs:
@@ -234,7 +234,7 @@ no stream header is displayed for the ease of saving it to a file.
 
 <a name="license"></a>
 <hr noshade>
-<h2>Terms and conditions</h2>
+<h2>Terms and Conditions</h2>
 <p>
 <small>
 Copyright (c) 2004-2008  Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
diff --git a/pdfparser.py b/pdfparser.py
index a2cbbd1..6eb308f 100755
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -280,7 +280,12 @@ class PDFXRef:
 
   def __init__(self, parser):
     while 1:
-      (pos, line) = parser.nextline()
+      try:
+        (pos, line) = parser.nextline()
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        break
       if not line:
         if STRICT:
           raise PDFSyntaxError('premature eof: %r' % parser)
@@ -293,12 +298,19 @@ class PDFXRef:
         if STRICT:
           raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
         continue
-      (start, nobjs) = map(long, f)
+      try:
+        (start, nobjs) = map(long, f)
+      except ValueError:
+        if STRICT:
+          raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
+        continue
       self.objid0 = start
-      self.objid1 = start+nobjs
       self.offsets = []
       for objid in xrange(start, start+nobjs):
-        (_, line) = parser.nextline()
+        try:
+          (_, line) = parser.nextline()
+        except PSEOF:
+          break
         f = line.strip().split(' ')
         if len(f) != 3:
           if STRICT:
@@ -307,14 +319,19 @@ class PDFXRef:
         (pos, genno, use) = f
         self.offsets.append((int(genno), long(pos), use))
     # read trailer
-    (_,kwd) = parser.nexttoken()
-    assert kwd == KEYWORD_TRAILER
-    (_,dic) = parser.nextobject()
-    self.trailer = dict_value(dic)
+    try:
+      (_,kwd) = parser.nexttoken()
+      assert kwd == KEYWORD_TRAILER
+      (_,dic) = parser.nextobject()
+      self.trailer = dict_value(dic)
+    except PSEOF:
+      if STRICT:
+        raise PDFSyntaxError('Unexpected EOF')
+      self.trailer = None
     return
 
   def getpos(self, objid):
-    if objid < self.objid0 or self.objid1 <= objid:
+    if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
       raise IndexError(objid)
     (genno, pos, use) = self.offsets[objid-self.objid0]
     if use != 'n':
@@ -386,6 +403,7 @@ class PDFDocument:
     self.xrefs = list(parser.read_xref())
     for xref in self.xrefs:
       trailer = xref.trailer
+      if not trailer: continue
       if 'Encrypt' in trailer:
         self.encryption = (list_value(trailer['ID']),
                            dict_value(trailer['Encrypt']))
@@ -600,13 +618,23 @@ class PDFParser(PSStackParser):
           raise PDFValueError('/Length is undefined: %r' % dic)
         objlen = 0
       self.seek(pos)
-      (_, line) = self.nextline()  # 'stream'
+      try:
+        (_, line) = self.nextline()  # 'stream'
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        return
       pos += len(line)
       self.fp.seek(pos)
       data = self.fp.read(objlen)
       self.seek(pos+objlen)
       while 1:
-        (linepos, line) = self.nextline()
+        try:
+          (linepos, line) = self.nextline()
+        except PSEOF:
+          if STRICT:
+            raise PDFSyntaxError('Unexpected EOF')
+          break
         if 'endstream' in line:
           i = line.index('endstream')
           objlen += i
@@ -649,7 +677,12 @@ class PDFParser(PSStackParser):
     self.find_xref()
     while 1:
       # read xref table
-      (pos, token) = self.nexttoken()
+      try:
+        (pos, token) = self.nexttoken()
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        break
       if 2 <= self.debug:
         print >>stderr, 'read_xref: %r' % token
       if isinstance(token, int):
@@ -665,6 +698,7 @@ class PDFParser(PSStackParser):
         xref = PDFXRef(self)
       yield xref
       trailer = xref.trailer
+      if not trailer: continue
       if 1 <= self.debug:
         print >>stderr, 'trailer: %r' % trailer
       if 'XRefStm' in trailer: