2007-12-30 09:13:51 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
# pdfparser.py, Yusuke Shinyama
|
|
|
|
# ver 0.1, Dec 24 2004-
|
|
|
|
# ver 0.2, Dec 24 2007
|
|
|
|
|
2008-06-21 17:22:44 +00:00
|
|
|
import sys, re
|
2008-04-26 06:47:56 +00:00
|
|
|
import md5, struct
|
2007-12-30 09:13:51 +00:00
|
|
|
stderr = sys.stderr
|
2009-01-10 09:14:46 +00:00
|
|
|
from pdflib.utils import choplist, nunpack
|
|
|
|
from pdflib.arcfour import Arcfour
|
|
|
|
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
|
|
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
|
|
|
STRICT
|
|
|
|
from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
|
|
|
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
|
|
|
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
## Exceptions
##

# Raised when the PDF file structure cannot be parsed.
class PDFSyntaxError(PDFException): pass

# Raised when no usable cross-reference table or stream can be located.
class PDFNoValidXRef(PDFSyntaxError): pass

# Raised for problems with the document's encryption dictionary.
class PDFEncryptionError(PDFException): pass

# Raised when the supplied password fails authentication.
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
# some predefined literals and keywords.
# Interned PSLiteral objects for /Type values; interning allows them to be
# compared by identity ('is') throughout this module.
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
|
|
|
|
|
|
|
|
|
|
|
## XRefs
|
2009-01-10 09:14:46 +00:00
|
|
|
##
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
## PDFXRef
|
|
|
|
##
|
2008-06-21 17:22:44 +00:00
|
|
|
class PDFXRef(object):
    """A classic (pre-PDF-1.5) cross-reference table.

    Maps object ids to their byte offsets in the file.  Populated by
    load(); the trailer dictionary that follows the table is stored in
    self.trailer by load_trailer().
    """

    def __init__(self):
        # objid -> (genno, pos); filled in by load().
        self.offsets = None
        return

    def __repr__(self):
        return '<PDFXRef: objs=%d>' % len(self.offsets)

    def objids(self):
        # All object ids known to this table (Python 2 dict iterator).
        return self.offsets.iterkeys()

    def load(self, parser, debug=0):
        """Parse xref subsections from *parser* until the 'trailer' keyword.

        Each subsection starts with a '<start> <nobjs>' line followed by
        nobjs entries of the form '<pos> <genno> <n|f>'.  Raises
        PDFNoValidXRef on malformed or truncated input.
        """
        self.offsets = {}
        while 1:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith('trailer'):
                # Rewind so load_trailer() sees the 'trailer' keyword.
                parser.seek(pos)
                break
            f = line.strip().split(' ')
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                (start, nobjs) = map(long, f)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in xrange(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(' ')
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                # 'f' entries mark free objects; only 'n' (in-use) are kept.
                if use != 'n': continue
                self.offsets[objid] = (int(genno), long(pos))
        if debug:
            print >>stderr, 'xref objects:', self.offsets
        self.load_trailer(parser)
        return

    KEYWORD_TRAILER = PSKeywordTable.intern('trailer')

    def load_trailer(self, parser):
        """Read the trailer dictionary that follows the xref table."""
        try:
            (_,kwd) = parser.nexttoken()
            assert kwd is self.KEYWORD_TRAILER
            (_,dic) = parser.nextobject()
        except PSEOF:
            # On EOF the dictionary may already be sitting on the
            # parser's result stack; recover it from there.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_,dic) = x[0]
        self.trailer = dict_value(dic)
        return

    def getpos(self, objid):
        """Return (strmid, pos) for *objid*; strmid is always None here.

        Raises KeyError when the object is not in this table.
        """
        try:
            (genno, pos) = self.offsets[objid]
        except KeyError:
            raise
        # genno is ignored by callers of this interface.
        return (None, pos)
|
|
|
|
|
|
|
|
|
|
|
|
## PDFXRefStream
|
|
|
|
##
|
2008-06-21 17:22:44 +00:00
|
|
|
class PDFXRefStream(object):
    """A PDF-1.5 cross-reference stream (/Type /XRef).

    Entries are fixed-width binary records inside the decoded stream
    data; the per-field byte widths come from the stream's /W array.
    """

    def __init__(self):
        self.objid_first = None  # first object id covered
        self.objid_last = None   # last object id covered (inclusive)
        self.data = None         # decoded stream bytes
        self.entlen = None       # bytes per entry (fl1+fl2+fl3)
        self.fl1 = self.fl2 = self.fl3 = None  # /W field widths
        return

    def __repr__(self):
        return '<PDFXRef: objid=%d-%d>' % (self.objid_first, self.objid_last)

    def objids(self):
        # All object ids covered by this stream.
        return xrange(self.objid_first, self.objid_last+1)

    def load(self, parser, debug=0):
        """Read '<objid> <genno> obj <stream>' from *parser* and decode headers.

        Raises PDFNoValidXRef unless the object is a PDFStream with
        /Type /XRef.  NOTE(review): only a two-element /Index (a single
        subsection) is handled by the tuple unpack below -- confirm
        behavior on files that use multiple /Index subsections.
        """
        (_,objid) = parser.nexttoken() # ignored
        (_,genno) = parser.nexttoken() # ignored
        (_,kwd) = parser.nexttoken()
        (_,stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream.dic['Size']
        (start, nobjs) = stream.dic.get('Index', (0,size))
        self.objid_first = start
        self.objid_last = start+nobjs-1
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        # For xref streams the stream dictionary doubles as the trailer.
        self.trailer = stream.dic
        if debug:
            print >>stderr, ('xref stream: objid=%d-%d, fields=%d,%d,%d' %
                             (self.objid_first, self.objid_last, self.fl1, self.fl2, self.fl3))
        return

    def getpos(self, objid):
        """Return (strmid, index) for *objid*.

        Type-1 entries yield (None, file_offset); type-2 entries yield
        (containing_stream_objid, index_within_stream).  Out-of-range
        and free entries raise KeyError.
        """
        if objid < self.objid_first or self.objid_last < objid:
            raise KeyError(objid)
        i = self.entlen * (objid-self.objid_first)
        ent = self.data[i:i+self.entlen]
        # A missing first field defaults to type 1 (nunpack default).
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            # Uncompressed object: second field is its byte offset.
            pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
            genno = nunpack(ent[self.fl1+self.fl2:])
            return (None, pos)
        elif f1 == 2:
            # Compressed object: lives inside an object stream.
            objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
            index = nunpack(ent[self.fl1+self.fl2:])
            return (objid, index)
        # this is a free object
        raise KeyError(objid)
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
## PDFPage
|
|
|
|
##
|
|
|
|
class PDFPage(object):
    """One page of a PDFDocument.

    Pulls the commonly used entries out of the page's attribute
    dictionary: resources, media/crop boxes, rotation, annotations,
    article beads and the content stream list.  Inheritable attributes
    are expected to have been merged into *attrs* already by the caller
    (see PDFDocument.get_pages).
    """

    def __init__(self, doc, pageid, attrs):
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        page = self.attrs
        self.lastmod = resolve1(page.get('LastModified'))
        # /Resources and /MediaBox are required entries; a missing key
        # raises KeyError just as the original lookup would.
        self.resources = resolve1(page['Resources'])
        self.mediabox = resolve1(page['MediaBox'])
        # /CropBox falls back to the media box when absent.
        if 'CropBox' in page:
            self.cropbox = resolve1(page['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = page.get('Rotate', 0)
        self.annots = page.get('Annots')
        self.beads = page.get('B')
        # Normalize /Contents to a list: it may be absent or a single stream.
        streams = resolve1(page['Contents']) if 'Contents' in page else []
        if not isinstance(streams, list):
            streams = [ streams ]
        self.contents = streams
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
|
|
|
|
|
|
|
|
2007-12-30 09:13:51 +00:00
|
|
|
## PDFDocument
|
|
|
|
##
|
2008-06-21 17:22:44 +00:00
|
|
|
## A PDFDocument object represents a PDF document.
|
|
|
|
## Since a PDF file is usually pretty big, normally it is not loaded
|
|
|
|
## at once. Rather it is parsed dynamically as processing goes.
|
|
|
|
## A PDF parser is associated with the document.
|
|
|
|
##
|
2008-07-09 15:15:32 +00:00
|
|
|
class PDFDocument(object):
    """A PDF document.

    Since a PDF file is usually pretty big, it is not loaded at once;
    objects are fetched lazily through the associated PDFParser.

    Typical usage: create the document, attach a parser with
    set_parser(), then call initialize() -- mandatory even when the
    document has no password -- before using getobj()/get_pages().
    """

    debug = 0

    def __init__(self):
        self.xrefs = []          # cross-reference tables/streams found in the file
        self.objs = {}           # objid -> object cache
        self.parsed_objs = {}    # object-stream objid -> list of parsed objects
        self.root = None
        self.catalog = None
        self.parser = None
        self.encryption = None   # (docid-list, /Encrypt dict) when encrypted
        self.decipher = None     # decryption callback, set by initialize()
        self.ready = False
        return

    # set_parser(parser)
    #   Associates the document with an (already initialized) parser object.
    def set_parser(self, parser):
        if self.parser: return
        self.parser = parser
        # The document is set to be temporarily ready during collecting
        # all the basic information about the document, e.g.
        # the header, the encryption information, and the access rights
        # for the document.
        self.ready = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        self.xrefs = parser.read_xref()
        for xref in self.xrefs:
            trailer = xref.trailer
            if not trailer: continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                self.encryption = (list_value(trailer['ID']),
                                   dict_value(trailer['Encrypt']))
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        # The document is set to be non-ready again, until all the
        # proper initialization (asking the password key and
        # verifying the access permission, so on) is finished.
        self.ready = False
        return

    # set_root(root)
    #   Set the Root dictionary of the document.
    #   Each PDF file must have exactly one /Root dictionary.
    def set_root(self, root):
        self.root = root
        self.catalog = dict_value(self.root)
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if STRICT:
                raise PDFSyntaxError('Catalog not found!')
        return

    # initialize(password='')
    #   Perform the initialization with a given password.
    #   This step is mandatory even if there's no password associated
    #   with the document.
    # The standard 32-byte password padding string (PDF Reference, Algorithm 3.2).
    PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'

    def initialize(self, password=''):
        """Authenticate with *password* and derive the decryption key.

        For unencrypted documents this just marks the document ready.
        Raises PDFEncryptionError for unsupported filters/algorithms and
        PDFPasswordIncorrect when authentication fails.
        """
        if not self.encryption:
            self.is_printable = self.is_modifiable = self.is_extractable = True
            self.ready = True
            return
        (docid, param) = self.encryption
        if literal_name(param['Filter']) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        V = int_value(param.get('V', 0))
        if not (V == 1 or V == 2):
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        length = int_value(param.get('Length', 40)) # Key length (bits)
        O = str_value(param['O'])
        R = int_value(param['R']) # Revision
        if 5 <= R:
            raise PDFEncryptionError('Unknown revision: %r' % R)
        U = str_value(param['U'])
        P = int_value(param['P'])
        # Permission bits 3-5: print / modify / copy-extract.
        self.is_printable = bool(P & 4)
        self.is_modifiable = bool(P & 8)
        self.is_extractable = bool(P & 16)
        # Algorithm 3.2
        password = (password+self.PASSWORD_PADDING)[:32] # 1
        hash = md5.md5(password) # 2
        hash.update(O) # 3
        hash.update(struct.pack('<l', P)) # 4
        hash.update(docid[0]) # 5
        if 4 <= R:
            # 6
            raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
        if 3 <= R:
            # 8
            for _ in xrange(50):
                hash = md5.md5(hash.digest()[:length/8])
        key = hash.digest()[:length/8]
        if R == 2:
            # Algorithm 3.4
            u1 = Arcfour(key).process(password)
        elif R == 3:
            # Algorithm 3.5
            hash = md5.md5(self.PASSWORD_PADDING) # 2
            hash.update(docid[0]) # 3
            x = Arcfour(key).process(hash.digest()[:16]) # 4
            for i in xrange(1,19+1):
                k = ''.join( chr(ord(c) ^ i) for c in key )
                x = Arcfour(k).process(x)
            u1 = x+x # 32bytes total
        if R == 2:
            is_authenticated = (u1 == U)
        else:
            # Revision 3 compares only the first 16 bytes.
            is_authenticated = (u1[:16] == U[:16])
        if not is_authenticated:
            raise PDFPasswordIncorrect
        self.decrypt_key = key
        self.decipher = self.decrypt_rc4 # XXX may be AES
        self.ready = True
        return

    def decrypt_rc4(self, objid, genno, data):
        """Decrypt *data* of object (objid, genno) with RC4 (Algorithm 3.1)."""
        key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
        hash = md5.md5(key)
        key = hash.digest()[:min(len(key),16)]
        return Arcfour(key).process(data)

    KEYWORD_OBJ = PSKeywordTable.intern('obj')

    def getobj(self, objid):
        """Fetch (and cache) the object with id *objid*.

        Objects may live directly in the file or inside an object
        stream (/ObjStm).  Returns None for unlocatable objects when
        not STRICT.  Raises PDFException if initialize() was not called.
        """
        if not self.ready:
            raise PDFException('PDFDocument not initialized')
        if 2 <= self.debug:
            print >>stderr, 'getobj: objid=%r' % (objid)
        if objid in self.objs:
            genno = 0
            obj = self.objs[objid]
        else:
            # Locate the object in one of the xref tables.
            for xref in self.xrefs:
                try:
                    (strmid, index) = xref.getpos(objid)
                    break
                except KeyError:
                    pass
            else:
                if STRICT:
                    raise PDFSyntaxError('Cannot locate objid=%r' % objid)
                return None
            if strmid:
                # The object lives inside an object stream.
                stream = stream_value(self.getobj(strmid))
                if stream.dic.get('Type') is not LITERAL_OBJSTM:
                    if STRICT:
                        raise PDFSyntaxError('Not a stream object: %r' % stream)
                try:
                    n = stream.dic['N']
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
                # BUGFIX: the parse cache was checked with strmid but
                # read/written with the PDFStream object as key, so it
                # never hit and every lookup reparsed the whole stream.
                # Key consistently on strmid.
                if strmid in self.parsed_objs:
                    objs = self.parsed_objs[strmid]
                else:
                    parser = PDFObjStrmParser(self, stream.get_data())
                    objs = []
                    try:
                        while 1:
                            (_,obj) = parser.nextobject()
                            objs.append(obj)
                    except PSEOF:
                        pass
                    self.parsed_objs[strmid] = objs
                genno = 0
                # The first 2*N parsed tokens are the (objid, offset)
                # header pairs; actual objects follow.
                i = n*2+index
                try:
                    obj = objs[i]
                except IndexError:
                    raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
                if isinstance(obj, PDFStream):
                    obj.set_objid(objid, 0)
            else:
                # Plain indirect object: parse '<objid> <genno> obj ...' in place.
                self.parser.seek(index)
                (_,objid1) = self.parser.nexttoken() # objid
                (_,genno) = self.parser.nexttoken() # genno
                # NOTE: objid1 is not verified against objid here.
                (_,kwd) = self.parser.nexttoken()
                if kwd is not self.KEYWORD_OBJ:
                    raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
                (_,obj) = self.parser.nextobject()
                if isinstance(obj, PDFStream):
                    obj.set_objid(objid, genno)
            if 2 <= self.debug:
                print >>stderr, 'register: objid=%r: %r' % (objid, obj)
            self.objs[objid] = obj
        if self.decipher:
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj

    # Page-tree attributes that children inherit from their parents.
    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])

    def get_pages(self):
        """Yield PDFPage objects by walking the /Pages tree in order."""
        if not self.ready:
            raise PDFException('PDFDocument is not initialized')
        def search(obj, parent):
            # Merge inheritable attributes from the parent node.
            tree = dict_value(obj).copy()
            for (k,v) in parent.iteritems():
                if k in self.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v
            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                if 1 <= self.debug:
                    print >>stderr, 'Pages: Kids=%r' % tree['Kids']
                for c in tree['Kids']:
                    for x in search(c, tree):
                        yield x
            elif tree.get('Type') is LITERAL_PAGE:
                if 1 <= self.debug:
                    print >>stderr, 'Page: %r' % tree
                yield (obj.objid, tree)
        if 'Pages' not in self.catalog: return
        for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
            yield PDFPage(self, pageid, tree)
        return

    def get_outlines(self):
        """Return an iterator of (level, title, dest, action, se) outline entries.

        Raises PDFException when the catalog has no /Outlines.
        """
        if 'Outlines' not in self.catalog:
            raise PDFException('No /Outlines defined!')
        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            if 'First' in entry and 'Last' in entry:
                # Descend into children at the next level.
                for x in search(entry['First'], level+1):
                    yield x
            if 'Next' in entry:
                # Continue with siblings at the same level.
                for x in search(entry['Next'], level):
                    yield x
            return
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up *key* in the name tree under /Names/<cat>.

        Raises KeyError((cat, key)) when the category or key is absent.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat,key))
        # may raise KeyError
        d0 = dict_value(names[cat])
        def lookup(d):
            # /Limits allows pruning subtrees whose range excludes key.
            if 'Limits' in d:
                (k1,k2) = list_value(d['Limits'])
                if key < k1 or k2 < key: return None
            if 'Names' in d:
                # Leaf node: flat [name, value, name, value, ...] array.
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v: return v
            raise KeyError((cat,key))
        return lookup(d0)
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
|
|
|
|
## PDFParser
|
|
|
|
##
|
|
|
|
class PDFParser(PSStackParser):
    """PostScript-style token parser specialized for PDF files.

    Extends PSStackParser with PDF-specific keywords (indirect
    references, streams, xref/startxref) and with cross-reference
    table discovery.
    """

    def __init__(self, doc, fp):
        PSStackParser.__init__(self, fp)
        self.doc = doc
        # Register ourselves with the document (triggers xref reading
        # inside PDFDocument.set_parser).
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser>'

    KEYWORD_R = PSKeywordTable.intern('R')
    KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
    KEYWORD_STREAM = PSKeywordTable.intern('stream')
    KEYWORD_XREF = PSKeywordTable.intern('xref')
    KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')

    def do_keyword(self, pos, token):
        """Handle a PDF keyword token encountered at file offset *pos*."""
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return
        if token is self.KEYWORD_ENDOBJ:
            # Emit the preceding (objid, genno, 'obj', object) quadruple.
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_,objid), (_,genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass
            return

        if token is self.KEYWORD_STREAM:
            # stream object
            ((_,dic),) = self.pop(1)
            dic = dict_value(dic)
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
                objlen = 0
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            pos += len(line)
            # Read /Length raw bytes directly from the underlying file.
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos+objlen)
            # /Length may be wrong; keep consuming lines until 'endstream'.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos+objlen)
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))
            return

        # others
        self.push((pos, token))
        return

    def find_xref(self):
        """Return the byte offset of the last xref table.

        Scans the file backwards for the 'startxref' keyword; the
        non-empty line after it (read in reverse) is the offset.
        Raises PDFNoValidXRef when no 'startxref' is found.
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>stderr, 'find_xref: %r' % line
            if line == 'startxref': break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        if 1 <= self.debug:
            print >>stderr, 'xref found: pos=%r' % prev
        return long(prev)

    # read xref table
    def read_xref_from(self, start, xrefs):
        """Read one xref table/stream at offset *start*, appending to *xrefs*.

        Recurses into /XRefStm and /Prev entries of the trailer so the
        whole update chain is collected.
        """
        self.seek(start)
        self.reset()
        try:
            (pos, token) = self.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        if 2 <= self.debug:
            print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            # A leading integer means '<objid> <genno> obj <<...>> stream'.
            self.seek(pos)
            self.reset()
            xref = PDFXRefStream()
            xref.load(self, debug=self.debug)
        else:
            if token is not self.KEYWORD_XREF:
                raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
                                     (pos, token))
            self.nextline()
            xref = PDFXRef()
            xref.load(self, debug=self.debug)
        xrefs.append(xref)
        trailer = xref.trailer
        if 1 <= self.debug:
            print >>stderr, 'trailer: %r' % trailer
        if 'XRefStm' in trailer:
            # Hybrid-reference file: also read the companion xref stream.
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(pos, xrefs)
        return

    # read xref tables and trailers
    def read_xref(self):
        """Return the list of all xref tables in the file (newest first).

        When no valid xref can be located, falls back to scanning the
        whole file for '<objid> <genno> obj' patterns and builds a
        synthetic table from the match positions.
        """
        xrefs = []
        try:
            pos = self.find_xref()
            self.read_xref_from(pos, xrefs)
        except PDFNoValidXRef:
            # fallback
            if 1 <= self.debug:
                print >>stderr, 'no xref, fallback'
            self.seek(0)
            pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
            offsets = {}
            xref = PDFXRef()
            while 1:
                try:
                    (pos, line) = self.nextline()
                except PSEOF:
                    break
                if line.startswith('trailer'): break
                m = pat.match(line)
                if not m: continue
                (objid, genno) = m.groups()
                # Generation number is recorded as 0 in the fallback table.
                offsets[int(objid)] = (0, pos)
            # Re-raise the original PDFNoValidXRef if nothing was found.
            if not offsets: raise
            xref.offsets = offsets
            self.seek(pos)
            xref.load_trailer(self)
            if 1 <= self.debug:
                print >>stderr, 'trailer: %r' % xref.trailer
            xrefs.append(xref)
        return xrefs
|
2008-02-03 09:36:34 +00:00
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
|
2008-02-03 09:36:34 +00:00
|
|
|
## PDFObjStrmParser
|
|
|
|
##
|
|
|
|
class PDFObjStrmParser(PDFParser):
    """Parser for the decompressed payload of an object stream (/ObjStm).

    Wraps the in-memory stream data in a file-like buffer and reuses
    the full PDFParser machinery on it.
    """

    def __init__(self, doc, data):
        # Prefer the C implementation of StringIO when it is available.
        try:
            from cStringIO import StringIO as make_buffer
        except ImportError:
            from StringIO import StringIO as make_buffer
        PDFParser.__init__(self, doc, make_buffer(data))
        return

    def flush(self):
        # Object streams carry no 'endobj' keywords, so emit everything
        # accumulated on the stack as results.
        self.add_results(*self.popall())
        return
|