pdfminer.six/pdfminer/pdfparser.py

168 lines
4.8 KiB
Python
Raw Normal View History

2013-10-17 14:05:27 +00:00
#!/usr/bin/env python
import sys
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
2013-10-10 09:29:30 +00:00
from psparser import KWD, STRICT
from pdftypes import PDFException
from pdftypes import PDFStream, PDFObjRef
from pdftypes import int_value
from pdftypes import dict_value
## Exceptions
##
2013-11-07 08:35:04 +00:00
class PDFSyntaxError(PDFException):
    """Signals a syntax-level problem found while parsing a PDF file.

    In this module it is raised for a stream dictionary without a
    /Length entry and for an unexpected end of file inside a stream.
    """
## PDFParser
##
class PDFParser(PSStackParser):

    """
    PDFParser fetches PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by the set_document method.

    Typical usage:
      parser = PDFParser(fp)
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """
    # NOTE(review): earlier versions of this docstring also listed
    # parser.read_xref(); no such method is defined in this class, so it
    # was dropped from the usage example -- confirm against PSStackParser.

    def __init__(self, fp):
        """Creates a parser that reads from the file-like object fp."""
        PSStackParser.__init__(self, fp)
        # Document handed to every PDFObjRef / PDFStream created here;
        # stays None until set_document() is called.
        self.doc = None
        # When true, the /Length entry of a stream dictionary is ignored
        # and the stream end is found by scanning for 'endstream'.
        self.fallback = False

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    KEYWORD_R = KWD('R')
    KEYWORD_NULL = KWD('null')
    KEYWORD_ENDOBJ = KWD('endobj')
    KEYWORD_STREAM = KWD('stream')
    KEYWORD_XREF = KWD('xref')
    KEYWORD_STARTXREF = KWD('startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        pos is the position of the keyword and token is the keyword
        object itself.

          xref, startxref: the top stack entry is emitted as a result.
          endobj: the top four stack entries are emitted as results
                  (presumably objid, genno, 'obj' and the object itself).
          null:   None is pushed.
          R:      the two operands are replaced with a PDFObjRef.
          stream: the preceding dictionary and the stream body are
                  combined into a PDFStream.

        Any other keyword is pushed onto the stack unchanged.
        May raise PDFSyntaxError for a broken stream when STRICT is set.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))
        elif token is self.KEYWORD_R:
            # reference to indirect object
            self._do_reference(pos)
        elif token is self.KEYWORD_STREAM:
            # stream object
            self._do_stream(pos)
        else:
            # others
            self.push((pos, token))

    def _do_reference(self, pos):
        """Replaces the 'objid genno' operands with a PDFObjRef."""
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            (objid, genno) = (int(objid), int(genno))
        except (PSSyntaxError, TypeError, ValueError):
            # Malformed reference: fewer than two operands on the stack
            # (unpacking fails) or operands that are not integers.
            # Skip it rather than aborting the whole parse.
            return
        try:
            obj = PDFObjRef(self.doc, objid, genno)
            self.push((pos, obj))
        except PSSyntaxError:
            pass

    def _do_stream(self, pos):
        """Reads the stream body that follows a dictionary, pushes a PDFStream.

        pos is the position of the 'stream' keyword.  The declared
        /Length bytes are read first; after that, lines are appended
        until one containing 'endstream' is found, so a too-short (or,
        in fallback mode, ignored) /Length is still handled.
        """
        ((_, dic),) = self.pop(1)
        dic = dict_value(dic)
        objlen = 0
        if not self.fallback:
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
        # Re-read the keyword line to find where the data begins.
        self.seek(pos)
        try:
            (_, line) = self.nextline()  # 'stream'
        except PSEOF:
            if STRICT:
                raise PDFSyntaxError('Unexpected EOF')
            return
        pos += len(line)
        # Read the declared number of bytes straight from the file.
        self.fp.seek(pos)
        data = self.fp.read(objlen)
        self.seek(pos+objlen)
        # Append whatever lies between the declared end and 'endstream'.
        while True:
            try:
                (_, line) = self.nextline()
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                break
            i = line.find('endstream')
            if 0 <= i:
                objlen += i
                data += line[:i]
                break
            objlen += len(line)
            data += line
        self.seek(pos+objlen)
        # XXX limit objlen not to exceed object boundary
        if 2 <= self.debug:
            message = 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                (pos, objlen, dic, data[:10])
            sys.stderr.write(message + '\n')
        # self.doc must have been set via set_document() by this point.
        obj = PDFStream(dic, data, self.doc.decipher)
        self.push((pos, obj))
## PDFStreamParser
##
class PDFStreamParser(PDFParser):

    """
    PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data):
        """Creates a parser over the in-memory string data."""
        PDFParser.__init__(self, StringIO(data))

    def flush(self):
        """Emits everything currently on the stack as results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos, token):
        """Handles keywords found in a content stream.

        Only 'R' is treated specially: its two operands are replaced
        with a PDFObjRef.  Every other keyword is pushed unchanged.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
            except (PSSyntaxError, TypeError, ValueError):
                # Malformed reference: fewer than two operands on the
                # stack (unpacking fails) or operands that are not
                # integers.  Skip it rather than aborting the parse.
                return
            try:
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass
            return
        # others
        self.push((pos, token))