#!/usr/bin/env python import sys, zlib from pdfminer.lzw import LZWDecoder from pdfminer.psparser import PSException, PSObject, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, STRICT LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx')) ## PDF Objects ## class PDFObject(PSObject): pass class PDFException(PSException): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass class PDFNotImplementedError(PSException): pass ## PDFObjRef ## class PDFObjRef(PDFObject): def __init__(self, doc, objid, _): if objid == 0: if STRICT: raise PDFValueError('PDF object id cannot be 0.') self.doc = doc self.objid = objid #self.genno = genno # Never used. return def __repr__(self): return '' % (self.objid) def resolve(self): return self.doc.getobj(self.objid) # resolve def resolve1(x): ''' Resolve an object. If this is an array or dictionary, it may still contains some indirect objects inside. ''' while isinstance(x, PDFObjRef): x = x.resolve() return x def resolve_all(x): ''' Recursively resolve X and all the internals. Make sure there is no indirect reference within the nested object. This procedure might be slow. ''' while isinstance(x, PDFObjRef): x = x.resolve() if isinstance(x, list): x = [ resolve_all(v) for v in x ] elif isinstance(x, dict): for (k,v) in x.iteritems(): x[k] = resolve_all(v) return x def decipher_all(decipher, objid, genno, x): ''' Recursively decipher X. ''' if isinstance(x, str): return decipher(objid, genno, x) if isinstance(x, list): x = [ decipher_all(decipher, objid, genno, v) for v in x ] elif isinstance(x, dict): for (k,v) in x.iteritems(): x[k] = decipher_all(decipher, objid, genno, v) return x # Type cheking def int_value(x): x = resolve1(x) if not isinstance(x, int): if STRICT: raise PDFTypeError('Integer required: %r' % x) return 0 return x def float_value(x): x = resolve1(x) if not isinstance(x, float): if STRICT: raise PDFTypeError('Float required: %r' % x) return 0.0 return x def num_value(x): x = resolve1(x) if not (isinstance(x, int) or isinstance(x, float)): if STRICT: raise PDFTypeError('Int or Float required: %r' % x) return 0 return x def str_value(x): x = resolve1(x) if not isinstance(x, str): if STRICT: raise PDFTypeError('String required: %r' % x) return '' return x def list_value(x): x = resolve1(x) if not (isinstance(x, list) or isinstance(x, tuple)): if STRICT: raise PDFTypeError('List required: %r' % x) return [] return x def dict_value(x): x = resolve1(x) if not isinstance(x, dict): if STRICT: raise PDFTypeError('Dict required: %r' % x) return {} return x def stream_value(x): x = resolve1(x) if not isinstance(x, PDFStream): if STRICT: raise PDFTypeError('PDFStream required: %r' % x) return PDFStream({}, '') return x ## PDFStream type ## class PDFStream(PDFObject): def __init__(self, dic, rawdata, decipher=None): self.dic = dic self.rawdata = rawdata self.decipher = decipher self.data = None self.objid = None self.genno = None return def set_objid(self, objid, genno): self.objid = objid self.genno = genno return def __repr__(self): return '' % (self.objid, len(self.rawdata), self.dic) def decomp(self,data): import zlib buf = data # some FlateDecode streams have garbage (newlines, etc) appended to the # end. remove chars from the end to try and decompress the buffer while 8 <= len(buf): try: # will get errors if the document is encrypted. dco = zlib.decompressobj() return dco.decompress(buf) except zlib.error: buf = buf[:-1] raise Exception, "zlib.error while decompressing data" def decode(self): assert self.data == None and self.rawdata != None data = self.rawdata if self.decipher: # Handle encryption data = self.decipher(self.objid, self.genno, data) if 'Filter' not in self.dic: self.data = data self.rawdata = None return filters = self.dic['Filter'] if not isinstance(filters, list): filters = [ filters ] for f in filters: if f in LITERALS_FLATE_DECODE: # will get errors if the document is encrypted. data = self.decomp(data) elif f in LITERALS_LZW_DECODE: try: from cStringIO import StringIO except ImportError: from StringIO import StringIO data = ''.join(LZWDecoder(StringIO(data)).run()) elif f in LITERALS_ASCII85_DECODE: import ascii85 data = ascii85.ascii85decode(data) elif f in LITERALS_ASCIIHEX_DECODE: import ascii85 data = ascii85.asciihexdecode(data) elif f == LITERAL_CRYPT: raise PDFNotImplementedError('/Crypt filter is unsupported') else: raise PDFNotImplementedError('Unsupported filter: %r' % f) # apply predictors if 'DP' in self.dic: params = self.dic['DP'] else: params = self.dic.get('DecodeParms', {}) if 'Predictor' in params: pred = int_value(params['Predictor']) if pred: if pred != 12: raise PDFNotImplementedError('Unsupported predictor: %r' % pred) if 'Columns' not in params: raise PDFValueError('Columns undefined for predictor=12') columns = int_value(params['Columns']) buf = '' ent0 = '\x00' * columns for i in xrange(0, len(data), columns+1): pred = data[i] ent1 = data[i+1:i+1+columns] if pred == '\x02': ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) buf += ent1 ent0 = ent1 data = buf self.data = data self.rawdata = None return def get_data(self): if self.data == None: self.decode() return self.data def get_rawdata(self): return self.rawdata