Add PSEOF checks: handle unexpected end-of-file while parsing xref tables, trailers, and streams

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-05-03 04:10:59 +00:00 · 2008-05-03 04:10:59 +00:00 · 77d7c9ae55
parent 43436a4abb
commit 77d7c9ae55
2 changed files with 54 additions and 20 deletions
--- a/README.html
+++ b/README.html
@@ -11,13 +11,13 @@ blockquote { background: #eeeeee; }
 <h1>PDFMiner</h1>
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Apr 29 20:16:22 JST 2008
+Last Modified: Wed Apr 30 19:15:12 JST 2008
 <!-- hhmts end -->
 </div>

 <a name="intro"></a>
 <hr noshade>
-<h2>What's it?</h2>
+<h2>What's It?</h2>
 <p>
 PDFMiner is a suite of programs that aims to help
 extract or analyze text data from PDF documents.
@@ -42,6 +42,7 @@ It can be also used as a basis for a full-fledged PDF interpreter.
 http://www.unixuser.org/~euske/python/pdfminer/index.html
 </a>

+<a name="source"></a>
 <p>
 <strong>Download (source):</strong><br>
 <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
@@ -57,11 +58,10 @@ http://pdfminerr.googlecode.com/svn/

 <a name="install"></a>
 <hr noshade>
-<h2>Installation</h2>
-<p>
-Prerequisite: <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
+<h2>How to Install</h2>
 <ol>
-<li> Download the source.
+<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
+<li> Download the <a href="#source">PDFMiner source</a>.
 <li> Extract it.
 <li> Go to the <code>pdfminer</code> directory.
 <li> Do the following test:<br>
@@ -102,7 +102,7 @@ $ <strong>make cdbcmap</strong>

 <a name="usage"></a>
 <hr noshade>
-<h2>Usage</h2>
+<h2>How to Use</h2>

 <p>
 PDFMiner comes with two programs:
@@ -234,7 +234,7 @@ no stream header is displayed for the ease of saving it to a file.

 <a name="license"></a>
 <hr noshade>
-<h2>Terms and conditions</h2>
+<h2>Terms and Conditions</h2>
 <p>
 <small>
 Copyright (c) 2004-2008  Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -280,7 +280,12 @@ class PDFXRef:

  def __init__(self, parser):
    while 1:
-      (pos, line) = parser.nextline()
+      try:
+        (pos, line) = parser.nextline()
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        break
      if not line:
        if STRICT:
          raise PDFSyntaxError('premature eof: %r' % parser)
@@ -293,12 +298,19 @@ class PDFXRef:
        if STRICT:
          raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
        continue
-      (start, nobjs) = map(long, f)
+      try:
+        (start, nobjs) = map(long, f)
+      except ValueError:
+        if STRICT:
+          raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
+        continue
      self.objid0 = start
-      self.objid1 = start+nobjs
      self.offsets = []
      for objid in xrange(start, start+nobjs):
-        (_, line) = parser.nextline()
+        try:
+          (_, line) = parser.nextline()
+        except PSEOF:
+          break
        f = line.strip().split(' ')
        if len(f) != 3:
          if STRICT:
@@ -307,14 +319,19 @@ class PDFXRef:
        (pos, genno, use) = f
        self.offsets.append((int(genno), long(pos), use))
    # read trailer
-    (_,kwd) = parser.nexttoken()
-    assert kwd == KEYWORD_TRAILER
-    (_,dic) = parser.nextobject()
-    self.trailer = dict_value(dic)
+    try:
+      (_,kwd) = parser.nexttoken()
+      assert kwd == KEYWORD_TRAILER
+      (_,dic) = parser.nextobject()
+      self.trailer = dict_value(dic)
+    except PSEOF:
+      if STRICT:
+        raise PDFSyntaxError('Unexpected EOF')
+      self.trailer = None
    return

  def getpos(self, objid):
-    if objid < self.objid0 or self.objid1 <= objid:
+    if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
      raise IndexError(objid)
    (genno, pos, use) = self.offsets[objid-self.objid0]
    if use != 'n':
@@ -386,6 +403,7 @@ class PDFDocument:
    self.xrefs = list(parser.read_xref())
    for xref in self.xrefs:
      trailer = xref.trailer
+      if not trailer: continue
      if 'Encrypt' in trailer:
        self.encryption = (list_value(trailer['ID']),
                           dict_value(trailer['Encrypt']))
@@ -600,13 +618,23 @@ class PDFParser(PSStackParser):
          raise PDFValueError('/Length is undefined: %r' % dic)
        objlen = 0
      self.seek(pos)
-      (_, line) = self.nextline()  # 'stream'
+      try:
+        (_, line) = self.nextline()  # 'stream'
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        return
      pos += len(line)
      self.fp.seek(pos)
      data = self.fp.read(objlen)
      self.seek(pos+objlen)
      while 1:
-        (linepos, line) = self.nextline()
+        try:
+          (linepos, line) = self.nextline()
+        except PSEOF:
+          if STRICT:
+            raise PDFSyntaxError('Unexpected EOF')
+          break
        if 'endstream' in line:
          i = line.index('endstream')
          objlen += i
@@ -649,7 +677,12 @@ class PDFParser(PSStackParser):
    self.find_xref()
    while 1:
      # read xref table
-      (pos, token) = self.nexttoken()
+      try:
+        (pos, token) = self.nexttoken()
+      except PSEOF:
+        if STRICT:
+          raise PDFSyntaxError('Unexpected EOF')
+        break
      if 2 <= self.debug:
        print >>stderr, 'read_xref: %r' % token
      if isinstance(token, int):
@@ -665,6 +698,7 @@ class PDFParser(PSStackParser):
        xref = PDFXRef(self)
      yield xref
      trailer = xref.trailer
+      if not trailer: continue
      if 1 <= self.debug:
        print >>stderr, 'trailer: %r' % trailer
      if 'XRefStm' in trailer: