PSEOF check

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-05-03 04:10:59 +00:00
parent 43436a4abb
commit 77d7c9ae55
2 changed files with 54 additions and 20 deletions

View File

@ -11,13 +11,13 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1>
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Tue Apr 29 20:16:22 JST 2008
Last Modified: Wed Apr 30 19:15:12 JST 2008
<!-- hhmts end -->
</div>
<a name="intro"></a>
<hr noshade>
<h2>What's it?</h2>
<h2>What's It?</h2>
<p>
PDFMiner is a suite of programs that aims to help
extract or analyze text data from PDF documents.
@ -42,6 +42,7 @@ It can be also used as a basis for a full-fledged PDF interpreter.
http://www.unixuser.org/~euske/python/pdfminer/index.html
</a>
<a name="source"></a>
<p>
<strong>Download (source):</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
@ -57,11 +58,10 @@ http://pdfminerr.googlecode.com/svn/
<a name="install"></a>
<hr noshade>
<h2>Installation</h2>
<p>
Prerequisite: <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
<h2>How to Install</h2>
<ol>
<li> Download the source.
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
<li> Download the <a href="#source">PDFMiner source</a>.
<li> Extract it.
<li> Go to the <code>pdfminer</code> directory.
<li> Do the following test:<br>
@ -102,7 +102,7 @@ $ <strong>make cdbcmap</strong>
<a name="usage"></a>
<hr noshade>
<h2>Usage</h2>
<h2>How to Use</h2>
<p>
PDFMiner comes with two programs:
@ -234,7 +234,7 @@ no stream header is displayed for the ease of saving it to a file.
<a name="license"></a>
<hr noshade>
<h2>Terms and conditions</h2>
<h2>Terms and Conditions</h2>
<p>
<small>
Copyright (c) 2004-2008 Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;

View File

@ -280,7 +280,12 @@ class PDFXRef:
def __init__(self, parser):
while 1:
try:
(pos, line) = parser.nextline()
except PSEOF:
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
break
if not line:
if STRICT:
raise PDFSyntaxError('premature eof: %r' % parser)
@ -293,12 +298,19 @@ class PDFXRef:
if STRICT:
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
continue
try:
(start, nobjs) = map(long, f)
except ValueError:
if STRICT:
raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
continue
self.objid0 = start
self.objid1 = start+nobjs
self.offsets = []
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
except PSEOF:
break
f = line.strip().split(' ')
if len(f) != 3:
if STRICT:
@ -307,14 +319,19 @@ class PDFXRef:
(pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use))
# read trailer
try:
(_,kwd) = parser.nexttoken()
assert kwd == KEYWORD_TRAILER
(_,dic) = parser.nextobject()
self.trailer = dict_value(dic)
except PSEOF:
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
self.trailer = None
return
def getpos(self, objid):
if objid < self.objid0 or self.objid1 <= objid:
if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
raise IndexError(objid)
(genno, pos, use) = self.offsets[objid-self.objid0]
if use != 'n':
@ -386,6 +403,7 @@ class PDFDocument:
self.xrefs = list(parser.read_xref())
for xref in self.xrefs:
trailer = xref.trailer
if not trailer: continue
if 'Encrypt' in trailer:
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
@ -600,13 +618,23 @@ class PDFParser(PSStackParser):
raise PDFValueError('/Length is undefined: %r' % dic)
objlen = 0
self.seek(pos)
try:
(_, line) = self.nextline() # 'stream'
except PSEOF:
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
return
pos += len(line)
self.fp.seek(pos)
data = self.fp.read(objlen)
self.seek(pos+objlen)
while 1:
try:
(linepos, line) = self.nextline()
except PSEOF:
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
break
if 'endstream' in line:
i = line.index('endstream')
objlen += i
@ -649,7 +677,12 @@ class PDFParser(PSStackParser):
self.find_xref()
while 1:
# read xref table
try:
(pos, token) = self.nexttoken()
except PSEOF:
if STRICT:
raise PDFSyntaxError('Unexpected EOF')
break
if 2 <= self.debug:
print >>stderr, 'read_xref: %r' % token
if isinstance(token, int):
@ -665,6 +698,7 @@ class PDFParser(PSStackParser):
xref = PDFXRef(self)
yield xref
trailer = xref.trailer
if not trailer: continue
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % trailer
if 'XRefStm' in trailer: