2013-10-17 14:05:27 +00:00
|
|
|
#!/usr/bin/env python
|
2014-06-14 03:00:49 +00:00
|
|
|
import logging
|
2014-06-25 10:55:41 +00:00
|
|
|
from io import BytesIO
|
2014-06-26 09:12:39 +00:00
|
|
|
from .psparser import PSStackParser
|
|
|
|
from .psparser import PSSyntaxError
|
|
|
|
from .psparser import PSEOF
|
|
|
|
from .psparser import KWD
|
2016-01-10 17:17:38 +00:00
|
|
|
from . import settings
|
2014-06-26 09:12:39 +00:00
|
|
|
from .pdftypes import PDFException
|
|
|
|
from .pdftypes import PDFStream
|
|
|
|
from .pdftypes import PDFObjRef
|
|
|
|
from .pdftypes import int_value
|
|
|
|
from .pdftypes import dict_value
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
## Exceptions
|
2007-12-30 09:13:51 +00:00
|
|
|
##
|
2013-11-07 08:35:04 +00:00
|
|
|
class PDFSyntaxError(PDFException):
    """Syntax error detected while parsing PDF objects.

    In this module it is raised only when settings.STRICT is true: for a
    stream dictionary without /Length, for an unexpected end of file while
    reading a stream, and for obj/endobj keywords inside a content stream.
    """
|
2010-10-17 05:14:52 +00:00
|
|
|
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
## PDFParser
|
|
|
|
##
|
|
|
|
class PDFParser(PSStackParser):

    """
    PDFParser fetches PDF objects from a file stream.

    On top of the generic PSStackParser it handles the PDF-specific
    keywords: "R" (indirect reference, turned into a PDFObjRef bound to
    the document set by set_document), "stream" (turned into a PDFStream),
    "null", "endobj", "xref" and "startxref".

    Typical usage:
      parser = PDFParser(fp)
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    KEYWORD_R = KWD(b'R')
    KEYWORD_NULL = KWD(b'null')
    KEYWORD_ENDOBJ = KWD(b'endobj')
    KEYWORD_STREAM = KWD(b'stream')
    KEYWORD_XREF = KWD(b'xref')
    KEYWORD_STARTXREF = KWD(b'startxref')

    def __init__(self, fp):
        """Creates a parser reading from the binary file object fp."""
        PSStackParser.__init__(self, fp)
        # Document handed to every PDFObjRef / PDFStream created by this
        # parser; stays None until set_document() is called.
        self.doc = None
        # When true, /Length is ignored and the stream body is collected
        # line by line up to the "endstream" marker instead.
        self.fallback = False

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        pos is the position of the keyword in the input and token is the
        keyword object itself. Keywords that are not PDF-specific are
        pushed onto the stack unchanged.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # Hand the topmost stack entry over as a result.
            self.add_results(*self.pop(1))
        elif token is self.KEYWORD_ENDOBJ:
            # Hand the four topmost stack entries over as a result
            # (presumably objid, genno, "obj" and the object itself).
            self.add_results(*self.pop(4))
        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))
        elif token is self.KEYWORD_R:
            self._push_objref(pos)
        elif token is self.KEYWORD_STREAM:
            self._push_stream(pos)
        else:
            # others
            self.push((pos, token))

    def _push_objref(self, pos):
        """Replaces the two topmost stack entries with a PDFObjRef.

        The entries are expected to be "<objid> <genno>". A malformed
        reference (too few operands or operands that are not integers)
        is dropped silently, exactly like a PSSyntaxError always was.
        """
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            (objid, genno) = (int(objid), int(genno))
        except (PSSyntaxError, ValueError, TypeError):
            # ValueError/TypeError: the unpacking or int() failed because
            # a stray "R" was not preceded by two integer operands.
            return
        try:
            obj = PDFObjRef(self.doc, objid, genno)
            self.push((pos, obj))
        except PSSyntaxError:
            pass

    def _push_stream(self, pos):
        """Reads a stream body and pushes the resulting PDFStream.

        The topmost stack entry must be the stream dictionary. pos is the
        position of the "stream" keyword. self.doc must have been set,
        because self.doc.decipher is passed on to the PDFStream.

        Raises PDFSyntaxError when settings.STRICT is true and /Length is
        missing or the input ends before "endstream" is found. In
        non-strict mode a stream whose "stream" line cannot be read is
        skipped without pushing anything.
        """
        ((_, dic),) = self.pop(1)
        dic = dict_value(dic)
        objlen = 0
        if not self.fallback:
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
        self.seek(pos)
        try:
            (_, line) = self.nextline()  # 'stream'
        except PSEOF:
            if settings.STRICT:
                raise PDFSyntaxError('Unexpected EOF')
            return
        # The stream data starts right after the "stream" line.
        pos += len(line)
        self.fp.seek(pos)
        # Pieces of the stream body; joined once at the end so that the
        # fallback path does not repeatedly copy a growing bytes object.
        chunks = [self.fp.read(objlen)]
        self.seek(pos+objlen)
        # Scan forward for "endstream", extending objlen by whatever lies
        # between the declared length and the marker.
        while True:
            try:
                (_, line) = self.nextline()
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                break
            i = line.find(b'endstream')
            if i >= 0:
                objlen += i
                if self.fallback:
                    chunks.append(line[:i])
                break
            objlen += len(line)
            if self.fallback:
                chunks.append(line)
        data = b''.join(chunks)
        self.seek(pos+objlen)
        # XXX limit objlen not to exceed object boundary
        logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
        obj = PDFStream(dic, data, self.doc.decipher)
        self.push((pos, obj))
|
2009-10-24 04:41:59 +00:00
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
|
2010-03-25 11:38:47 +00:00
|
|
|
## PDFStreamParser
|
2008-02-03 09:36:34 +00:00
|
|
|
##
|
2010-03-25 11:38:47 +00:00
|
|
|
class PDFStreamParser(PDFParser):

    """
    PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    KEYWORD_OBJ = KWD(b'obj')

    def __init__(self, data):
        """Creates a parser over the in-memory bytes data."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self):
        """Hands everything still on the stack over to add_results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos, token):
        """Handles the keywords that may appear in a content stream.

        Indirect references are handled by PDFParser.do_keyword. The obj
        and endobj keywords are skipped, or rejected with PDFSyntaxError
        when settings.STRICT is true. Every other keyword is pushed onto
        the stack unchanged.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object: same handling as in PDFParser
            PDFParser.do_keyword(self, pos, token)
            return
        if token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                name = 'obj' if token is self.KEYWORD_OBJ else 'endobj'
                raise PDFSyntaxError('Keyword %s found in stream' % name)
            return
        # others
        self.push((pos, token))
|