PSEOF check
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
43436a4abb
commit
77d7c9ae55
16
README.html
16
README.html
|
@@ -11,13 +11,13 @@ blockquote { background: #eeeeee; }
|
||||||
<h1>PDFMiner</h1>
|
<h1>PDFMiner</h1>
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Tue Apr 29 20:16:22 JST 2008
|
Last Modified: Wed Apr 30 19:15:12 JST 2008
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<a name="intro"></a>
|
<a name="intro"></a>
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>What's it?</h2>
|
<h2>What's It?</h2>
|
||||||
<p>
|
<p>
|
||||||
PDFMiner is a suite of programs that aims to help
|
PDFMiner is a suite of programs that aims to help
|
||||||
extracting or analyzing text data from PDF documents.
|
extracting or analyzing text data from PDF documents.
|
||||||
|
@@ -42,6 +42,7 @@ It can be also used as a basis for a full-fledged PDF interpreter.
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/index.html
|
http://www.unixuser.org/~euske/python/pdfminer/index.html
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
<a name="source"></a>
|
||||||
<p>
|
<p>
|
||||||
<strong>Download (source):</strong><br>
|
<strong>Download (source):</strong><br>
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080429.tar.gz">
|
||||||
|
@@ -57,11 +58,10 @@ http://pdfminerr.googlecode.com/svn/
|
||||||
|
|
||||||
<a name="install"></a>
|
<a name="install"></a>
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Installation</h2>
|
<h2>How to Install</h2>
|
||||||
<p>
|
|
||||||
Prerequisite: <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
|
||||||
<ol>
|
<ol>
|
||||||
<li> Download the source.
|
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||||
|
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||||
<li> Extract it.
|
<li> Extract it.
|
||||||
<li> Go to the <code>pdfminer</code> directory.
|
<li> Go to the <code>pdfminer</code> directory.
|
||||||
<li> Do the following test:<br>
|
<li> Do the following test:<br>
|
||||||
|
@@ -102,7 +102,7 @@ $ <strong>make cdbcmap</strong>
|
||||||
|
|
||||||
<a name="usage"></a>
|
<a name="usage"></a>
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Usage</h2>
|
<h2>How to Use</h2>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
PDFMiner comes with two programs:
|
PDFMiner comes with two programs:
|
||||||
|
@@ -234,7 +234,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
|
|
||||||
<a name="license"></a>
|
<a name="license"></a>
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Terms and conditions</h2>
|
<h2>Terms and Conditions</h2>
|
||||||
<p>
|
<p>
|
||||||
<small>
|
<small>
|
||||||
Copyright (c) 2004-2008 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
Copyright (c) 2004-2008 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
||||||
|
|
38
pdfparser.py
38
pdfparser.py
|
@@ -280,7 +280,12 @@ class PDFXRef:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
|
except PSEOF:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('Unexpected EOF')
|
||||||
|
break
|
||||||
if not line:
|
if not line:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||||
|
@@ -293,12 +298,19 @@ class PDFXRef:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
||||||
continue
|
continue
|
||||||
|
try:
|
||||||
(start, nobjs) = map(long, f)
|
(start, nobjs) = map(long, f)
|
||||||
|
except ValueError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
|
||||||
|
continue
|
||||||
self.objid0 = start
|
self.objid0 = start
|
||||||
self.objid1 = start+nobjs
|
|
||||||
self.offsets = []
|
self.offsets = []
|
||||||
for objid in xrange(start, start+nobjs):
|
for objid in xrange(start, start+nobjs):
|
||||||
|
try:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
|
except PSEOF:
|
||||||
|
break
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if len(f) != 3:
|
if len(f) != 3:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
|
@@ -307,14 +319,19 @@ class PDFXRef:
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
self.offsets.append((int(genno), long(pos), use))
|
self.offsets.append((int(genno), long(pos), use))
|
||||||
# read trailer
|
# read trailer
|
||||||
|
try:
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
assert kwd == KEYWORD_TRAILER
|
assert kwd == KEYWORD_TRAILER
|
||||||
(_,dic) = parser.nextobject()
|
(_,dic) = parser.nextobject()
|
||||||
self.trailer = dict_value(dic)
|
self.trailer = dict_value(dic)
|
||||||
|
except PSEOF:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('Unexpected EOF')
|
||||||
|
self.trailer = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
if objid < self.objid0 or self.objid1 <= objid:
|
if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
|
||||||
raise IndexError(objid)
|
raise IndexError(objid)
|
||||||
(genno, pos, use) = self.offsets[objid-self.objid0]
|
(genno, pos, use) = self.offsets[objid-self.objid0]
|
||||||
if use != 'n':
|
if use != 'n':
|
||||||
|
@@ -386,6 +403,7 @@ class PDFDocument:
|
||||||
self.xrefs = list(parser.read_xref())
|
self.xrefs = list(parser.read_xref())
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
|
if not trailer: continue
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
self.encryption = (list_value(trailer['ID']),
|
self.encryption = (list_value(trailer['ID']),
|
||||||
dict_value(trailer['Encrypt']))
|
dict_value(trailer['Encrypt']))
|
||||||
|
@@ -600,13 +618,23 @@ class PDFParser(PSStackParser):
|
||||||
raise PDFValueError('/Length is undefined: %r' % dic)
|
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||||
objlen = 0
|
objlen = 0
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
|
try:
|
||||||
(_, line) = self.nextline() # 'stream'
|
(_, line) = self.nextline() # 'stream'
|
||||||
|
except PSEOF:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('Unexpected EOF')
|
||||||
|
return
|
||||||
pos += len(line)
|
pos += len(line)
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
data = self.fp.read(objlen)
|
data = self.fp.read(objlen)
|
||||||
self.seek(pos+objlen)
|
self.seek(pos+objlen)
|
||||||
while 1:
|
while 1:
|
||||||
|
try:
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
|
except PSEOF:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('Unexpected EOF')
|
||||||
|
break
|
||||||
if 'endstream' in line:
|
if 'endstream' in line:
|
||||||
i = line.index('endstream')
|
i = line.index('endstream')
|
||||||
objlen += i
|
objlen += i
|
||||||
|
@@ -649,7 +677,12 @@ class PDFParser(PSStackParser):
|
||||||
self.find_xref()
|
self.find_xref()
|
||||||
while 1:
|
while 1:
|
||||||
# read xref table
|
# read xref table
|
||||||
|
try:
|
||||||
(pos, token) = self.nexttoken()
|
(pos, token) = self.nexttoken()
|
||||||
|
except PSEOF:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFSyntaxError('Unexpected EOF')
|
||||||
|
break
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'read_xref: %r' % token
|
print >>stderr, 'read_xref: %r' % token
|
||||||
if isinstance(token, int):
|
if isinstance(token, int):
|
||||||
|
@@ -665,6 +698,7 @@ class PDFParser(PSStackParser):
|
||||||
xref = PDFXRef(self)
|
xref = PDFXRef(self)
|
||||||
yield xref
|
yield xref
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
|
if not trailer: continue
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'trailer: %r' % trailer
|
print >>stderr, 'trailer: %r' % trailer
|
||||||
if 'XRefStm' in trailer:
|
if 'XRefStm' in trailer:
|
||||||
|
|
Loading…
Reference in New Issue