PSEOF check
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
43436a4abb
commit
77d7c9ae55
16
README.html
16
README.html
|
@ -11,13 +11,13 @@ blockquote { background: #eeeeee; }
|
|||
<h1>PDFMiner</h1>
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Tue Apr 29 20:16:22 JST 2008
|
||||
Last Modified: Wed Apr 30 19:15:12 JST 2008
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
<a name="intro"></a>
|
||||
<hr noshade>
|
||||
<h2>What's it?</h2>
|
||||
<h2>What's It?</h2>
|
||||
<p>
|
||||
PDFMiner is a suite of programs that aims to help
|
||||
extracting or analyzing text data from PDF documents.
|
||||
|
@ -42,6 +42,7 @@ It can be also used as a basis for a full-fledged PDF interpreter.
|
|||
http://www.unixuser.org/~euske/python/pdfminer/index.html
|
||||
</a>
|
||||
|
||||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download (source):</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
|
||||
|
@ -57,11 +58,10 @@ http://pdfminerr.googlecode.com/svn/
|
|||
|
||||
<a name="install"></a>
|
||||
<hr noshade>
|
||||
<h2>Installation</h2>
|
||||
<p>
|
||||
Prerequisite: <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||
<h2>How to Install</h2>
|
||||
<ol>
|
||||
<li> Download the source.
|
||||
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||
<li> Extract it.
|
||||
<li> Go to the <code>pdfminer</code> directory.
|
||||
<li> Do the following test:<br>
|
||||
|
@ -102,7 +102,7 @@ $ <strong>make cdbcmap</strong>
|
|||
|
||||
<a name="usage"></a>
|
||||
<hr noshade>
|
||||
<h2>Usage</h2>
|
||||
<h2>How to Use</h2>
|
||||
|
||||
<p>
|
||||
PDFMiner comes with two programs:
|
||||
|
@ -234,7 +234,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
|
||||
<a name="license"></a>
|
||||
<hr noshade>
|
||||
<h2>Terms and conditions</h2>
|
||||
<h2>Terms and Conditions</h2>
|
||||
<p>
|
||||
<small>
|
||||
Copyright (c) 2004-2008 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
||||
|
|
58
pdfparser.py
58
pdfparser.py
|
@ -280,7 +280,12 @@ class PDFXRef:
|
|||
|
||||
def __init__(self, parser):
|
||||
while 1:
|
||||
(pos, line) = parser.nextline()
|
||||
try:
|
||||
(pos, line) = parser.nextline()
|
||||
except PSEOF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
break
|
||||
if not line:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||
|
@ -293,12 +298,19 @@ class PDFXRef:
|
|||
if STRICT:
|
||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
||||
continue
|
||||
(start, nobjs) = map(long, f)
|
||||
try:
|
||||
(start, nobjs) = map(long, f)
|
||||
except ValueError:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
|
||||
continue
|
||||
self.objid0 = start
|
||||
self.objid1 = start+nobjs
|
||||
self.offsets = []
|
||||
for objid in xrange(start, start+nobjs):
|
||||
(_, line) = parser.nextline()
|
||||
try:
|
||||
(_, line) = parser.nextline()
|
||||
except PSEOF:
|
||||
break
|
||||
f = line.strip().split(' ')
|
||||
if len(f) != 3:
|
||||
if STRICT:
|
||||
|
@ -307,14 +319,19 @@ class PDFXRef:
|
|||
(pos, genno, use) = f
|
||||
self.offsets.append((int(genno), long(pos), use))
|
||||
# read trailer
|
||||
(_,kwd) = parser.nexttoken()
|
||||
assert kwd == KEYWORD_TRAILER
|
||||
(_,dic) = parser.nextobject()
|
||||
self.trailer = dict_value(dic)
|
||||
try:
|
||||
(_,kwd) = parser.nexttoken()
|
||||
assert kwd == KEYWORD_TRAILER
|
||||
(_,dic) = parser.nextobject()
|
||||
self.trailer = dict_value(dic)
|
||||
except PSEOF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
self.trailer = None
|
||||
return
|
||||
|
||||
def getpos(self, objid):
|
||||
if objid < self.objid0 or self.objid1 <= objid:
|
||||
if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
|
||||
raise IndexError(objid)
|
||||
(genno, pos, use) = self.offsets[objid-self.objid0]
|
||||
if use != 'n':
|
||||
|
@ -386,6 +403,7 @@ class PDFDocument:
|
|||
self.xrefs = list(parser.read_xref())
|
||||
for xref in self.xrefs:
|
||||
trailer = xref.trailer
|
||||
if not trailer: continue
|
||||
if 'Encrypt' in trailer:
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
|
@ -600,13 +618,23 @@ class PDFParser(PSStackParser):
|
|||
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||
objlen = 0
|
||||
self.seek(pos)
|
||||
(_, line) = self.nextline() # 'stream'
|
||||
try:
|
||||
(_, line) = self.nextline() # 'stream'
|
||||
except PSEOF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
return
|
||||
pos += len(line)
|
||||
self.fp.seek(pos)
|
||||
data = self.fp.read(objlen)
|
||||
self.seek(pos+objlen)
|
||||
while 1:
|
||||
(linepos, line) = self.nextline()
|
||||
try:
|
||||
(linepos, line) = self.nextline()
|
||||
except PSEOF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
break
|
||||
if 'endstream' in line:
|
||||
i = line.index('endstream')
|
||||
objlen += i
|
||||
|
@ -649,7 +677,12 @@ class PDFParser(PSStackParser):
|
|||
self.find_xref()
|
||||
while 1:
|
||||
# read xref table
|
||||
(pos, token) = self.nexttoken()
|
||||
try:
|
||||
(pos, token) = self.nexttoken()
|
||||
except PSEOF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
break
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'read_xref: %r' % token
|
||||
if isinstance(token, int):
|
||||
|
@ -665,6 +698,7 @@ class PDFParser(PSStackParser):
|
|||
xref = PDFXRef(self)
|
||||
yield xref
|
||||
trailer = xref.trailer
|
||||
if not trailer: continue
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'trailer: %r' % trailer
|
||||
if 'XRefStm' in trailer:
|
||||
|
|
Loading…
Reference in New Issue