2009-01-10 09:14:46 +00:00
|
|
|
#!/usr/bin/env python
|
2009-11-03 01:27:30 +00:00
|
|
|
import sys
|
|
|
|
import zlib
|
|
|
|
from lzw import LZWDecoder
|
|
|
|
from psparser import PSException
|
2009-11-03 13:39:34 +00:00
|
|
|
from psparser import PSObject, PSLiteral, PSKeyword
|
|
|
|
from psparser import PSLiteralTable, PSKeywordTable
|
|
|
|
from psparser import literal_name, keyword_name
|
2009-11-03 01:27:30 +00:00
|
|
|
from psparser import STRICT
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
|
|
|
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
|
|
|
|
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
|
|
|
|
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
|
2009-04-08 10:55:01 +00:00
|
|
|
LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx'))
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
## PDF Objects
|
|
|
|
##
|
|
|
|
class PDFObject(PSObject):
    '''Common base class of the PDF object types defined in this module.'''
|
|
|
|
|
|
|
|
class PDFException(PSException):
    '''Base class of the PDF-specific errors raised by this module.'''
|
|
|
|
class PDFTypeError(PDFException):
    '''Raised when a PDF object does not have the required type.'''
|
|
|
|
class PDFValueError(PDFException):
    '''Raised when a PDF object carries an unacceptable value.'''
|
|
|
|
class PDFNotImplementedError(PDFException):
    '''
    Raised for PDF features this module does not support.

    Derives from PDFException (itself a PSException) so that it is caught
    by handlers for either class, like the other PDF errors here.
    '''
|
|
|
|
|
|
|
|
|
|
|
|
## PDFObjRef
|
|
|
|
##
|
|
|
|
class PDFObjRef(PDFObject):
    '''
    An indirect reference to an object held by a document.
    '''

    def __init__(self, doc, objid, _):
        '''
        DOC is the object later asked for the target through getobj();
        OBJID is the id of the referenced object.  The third argument is
        accepted but ignored (presumably the generation number -- TODO
        confirm against the callers).

        An OBJID of 0 raises PDFValueError when STRICT is set.
        '''
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid

    def __repr__(self):
        return '<PDFObjRef:%d>' % self.objid

    def resolve(self):
        '''Look the referenced object up in the owning document.'''
        return self.doc.getobj(self.objid)
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
# resolve
|
|
|
|
def resolve1(x):
    '''
    Follow X through any chain of indirect references and return the
    first object that is not a PDFObjRef.

    Only the top level is resolved: if the result is an array or a
    dictionary, it may still contain indirect objects inside.
    '''
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve()
    return obj
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def resolve_all(x):
    '''
    Resolve X and, recursively, everything nested inside it, so that no
    indirect reference remains within the returned object.

    Lists are rebuilt as new lists; dictionaries are updated in place
    and returned.  This procedure might be slow.
    '''
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve()
    if isinstance(obj, list):
        return [resolve_all(item) for item in obj]
    if isinstance(obj, dict):
        # Only values of existing keys are replaced, so iterating the
        # dictionary while assigning is safe.
        for key in obj:
            obj[key] = resolve_all(obj[key])
    return obj
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every str found in X is replaced by decipher(objid, genno, s).
    Lists are rebuilt as new lists; dictionaries are updated in place
    and returned.  Any other object is returned unchanged.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    if isinstance(x, dict):
        # Only values of existing keys are replaced, so iterating the
        # dictionary while assigning is safe.
        for key in x:
            x[key] = decipher_all(decipher, objid, genno, x[key])
    return x
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
# Type checking
|
|
|
|
def int_value(x):
    '''
    Resolve X and return it if it is an int.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, int):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('Integer required: %r' % (x,))
    return 0
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def float_value(x):
    '''
    Resolve X and return it if it is a float.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.0.
    '''
    x = resolve1(x)
    if isinstance(x, float):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('Float required: %r' % (x,))
    return 0.0
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, (int, float)):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('Int or Float required: %r' % (x,))
    return 0
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def str_value(x):
    '''
    Resolve X and return it if it is a str.

    Otherwise raise PDFTypeError when STRICT is set, or return ''.
    '''
    x = resolve1(x)
    if isinstance(x, str):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('String required: %r' % (x,))
    return ''
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def list_value(x):
    '''
    Resolve X and return it if it is a list or a tuple.

    Otherwise raise PDFTypeError when STRICT is set, or return a new
    empty list.
    '''
    seq = resolve1(x)
    if isinstance(seq, (list, tuple)):
        return seq
    if STRICT:
        raise PDFTypeError('List required: %r' % seq)
    return []
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.

    Otherwise raise PDFTypeError when STRICT is set, or return a new
    empty dict.
    '''
    x = resolve1(x)
    if isinstance(x, dict):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('Dict required: %r' % (x,))
    return {}
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.

    Otherwise raise PDFTypeError when STRICT is set, or return a new
    PDFStream with an empty dictionary and empty raw data.
    '''
    x = resolve1(x)
    if isinstance(x, PDFStream):
        return x
    if STRICT:
        # Wrap X in a 1-tuple: a bare tuple on the right of % would be
        # unpacked as the format arguments instead of being shown.
        raise PDFTypeError('PDFStream required: %r' % (x,))
    return PDFStream({}, '')
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
## PDFStream type
|
|
|
|
##
|
|
|
|
class PDFStream(PDFObject):
    '''
    A PDF stream: a dictionary plus a payload.

    The payload is kept undecoded in `rawdata` until decode() runs
    (normally through get_data()).  After decoding, `data` holds the
    result and `rawdata` is reset to None.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        '''
        DIC is the stream dictionary (consulted for 'Filter', 'DP' and
        'DecodeParms'), RAWDATA the undecoded payload, and DECIPHER an
        optional callable invoked as decipher(objid, genno, data) before
        any filter is applied.
        '''
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        '''Record the object id and generation number passed to the decipher.'''
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # decode() clears rawdata, so report whichever payload is present
        # instead of failing on len(None).
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        if self.data is not None:
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)
        return '<PDFStream(%r): %r>' % (self.objid, self.dic)

    def decomp(self, data):
        '''
        Return DATA decompressed with zlib.

        Some FlateDecode streams have garbage (newlines, etc) appended to
        the end, so on zlib.error the last byte is dropped and the
        decompression retried.  Once fewer than 8 bytes remain, Exception
        is raised.
        '''
        buf = data
        while 8 <= len(buf):
            try:
                # will get errors if the document is encrypted.
                return zlib.decompressobj().decompress(buf)
            except zlib.error:
                buf = buf[:-1]
        raise Exception("zlib.error while decompressing data")

    def decode(self):
        '''
        Decode `rawdata` into `data` and reset `rawdata` to None.

        The decipher (if any) is applied first, then each filter listed
        under 'Filter' in order, then the predictor.  Without a 'Filter'
        entry the (deciphered) payload is stored as is.

        Raises PDFNotImplementedError for an unsupported filter or
        predictor, and PDFValueError when predictor 12 lacks 'Columns'.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' in self.dic:
            filters = self.dic['Filter']
            if not isinstance(filters, list):
                filters = [filters]
            for f in filters:
                if f in LITERALS_FLATE_DECODE:
                    # will get errors if the document is encrypted.
                    data = self.decomp(data)
                elif f in LITERALS_LZW_DECODE:
                    try:
                        from cStringIO import StringIO
                    except ImportError:
                        from StringIO import StringIO
                    data = ''.join(LZWDecoder(StringIO(data)).run())
                elif f in LITERALS_ASCII85_DECODE:
                    import ascii85
                    data = ascii85.ascii85decode(data)
                elif f in LITERALS_ASCIIHEX_DECODE:
                    import ascii85
                    data = ascii85.asciihexdecode(data)
                elif f == LITERAL_CRYPT:
                    raise PDFNotImplementedError('/Crypt filter is unsupported')
                else:
                    raise PDFNotImplementedError('Unsupported filter: %r' % f)
            data = self._apply_predictor(data)
        self.data = data
        self.rawdata = None
        return

    def _apply_predictor(self, data):
        '''Undo the predictor named by 'DP'/'DecodeParms' and return the result.'''
        if 'DP' in self.dic:
            params = self.dic['DP']
        else:
            params = self.dic.get('DecodeParms', {})
        if 'Predictor' not in params:
            return data
        pred = int_value(params['Predictor'])
        if pred in (0, 1):
            # Predictor 1 means "no prediction"; 0 is what int_value()
            # falls back to for a non-integer when not STRICT.
            return data
        if pred != 12:
            raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
        if 'Columns' not in params:
            raise PDFValueError('Columns undefined for predictor=12')
        columns = int_value(params['Columns'])
        # Each row is one tag byte followed by `columns` payload bytes.
        # NOTE(review): only row tag '\x02' is undone (each byte added to
        # the byte above it); rows with any other tag are copied verbatim.
        rows = []
        prev = '\x00' * columns
        for i in range(0, len(data), columns + 1):
            tag = data[i]
            row = data[i + 1:i + 1 + columns]
            if tag == '\x02':
                row = ''.join(chr((ord(a) + ord(b)) & 255) for (a, b) in zip(prev, row))
            rows.append(row)
            prev = row
        return ''.join(rows)

    def get_data(self):
        '''Return the decoded payload, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the undecoded payload (None once decode() has run).'''
        return self.rawdata
|