#!/usr/bin/env python

# pdfparser.py, Yusuke Shinyama
#  ver 0.1, Dec 24 2004-
#  ver 0.2, Dec 24 2007

# TODO:
#   - Code Documentation.
#   - Error handling for invalid type.
#   - Outlines.
#   - Named Objects. (pages)
#   - Writers.
#   - Linearized PDF.
#   - Encryption?

# NOTE: this module is written for Python 2 (print >>, xrange, long,
# dict.iteritems, cStringIO).

import sys
import re

from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
     PSLiteral, PSKeyword, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
     PSStackParser

stderr = sys.stderr


##  PDF Exceptions
##
class PDFException(PSException): pass
class PDFSyntaxError(PDFException): pass
class PDFEncrypted(PDFException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass


# some predefined literals and keywords.
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_EI = PSKeywordTable.intern('EI')


##  PDFObjRef
##
class PDFObjRef:
    '''
    An indirect reference to an object owned by a PDFDocument.
    The referenced object is looked up lazily through doc.getobj().
    '''

    def __init__(self, doc, objid, genno):
        '''
        doc: the object whose getobj() resolves this reference.
        objid: the object id; 0 is rejected with PDFValueError.
        genno: accepted for interface compatibility but not stored.
        '''
        if objid == 0:
            raise PDFValueError('objid cannot be 0.')
        self.doc = doc
        self.objid = objid
        #self.genno = genno # Never used.
        return

    def __repr__(self):
        # The format string must contain a conversion: formatting into an
        # empty string raises TypeError.
        return '<PDFObjRef:%d>' % (self.objid,)

    def resolve(self):
        '''Return the referenced object as returned by doc.getobj().'''
        return self.doc.getobj(self.objid)


# resolve
def resolve1(x):
    '''
    Resolve an object. If this is an array or dictionary,
    it may still contain some indirect objects inside.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    return x

def resolveall(x):
    '''
    Recursively resolve X and all the internals.
    Make sure there is no indirect reference within the nested object.
    This procedure might be slow. Do not use it unless
    you really need it.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        x = [ resolveall(v) for v in x ]
    elif isinstance(x, dict):
        # Only existing keys are reassigned, so the dict size does not
        # change while it is being iterated.
        for (k,v) in x.iteritems():
            x[k] = resolveall(v)
    return x


# Type checking
# Each helper resolves indirect references with resolve1() and then
# raises PDFTypeError unless the result has the expected Python type.
def int_value(x):
    '''Return X resolved; raise PDFTypeError unless it is an int.'''
    x = resolve1(x)
    if not isinstance(x, int):
        raise PDFTypeError('integer required: %r' % x)
    return x

def float_value(x):
    '''Return X resolved; raise PDFTypeError unless it is a float.'''
    x = resolve1(x)
    if not isinstance(x, float):
        raise PDFTypeError('float required: %r' % x)
    return x

def num_value(x):
    '''Return X resolved; raise PDFTypeError unless it is an int or a float.'''
    x = resolve1(x)
    if not (isinstance(x, int) or isinstance(x, float)):
        raise PDFTypeError('int or float required: %r' % x)
    return x

def str_value(x):
    '''Return X resolved; raise PDFTypeError unless it is a str.'''
    x = resolve1(x)
    if not isinstance(x, str):
        raise PDFTypeError('string required: %r' % x)
    return x

def list_value(x):
    '''Return X resolved; raise PDFTypeError unless it is a list.'''
    x = resolve1(x)
    if not isinstance(x, list):
        raise PDFTypeError('list required: %r' % x)
    return x

def dict_value(x):
    '''Return X resolved; raise PDFTypeError unless it is a dict.'''
    x = resolve1(x)
    if not isinstance(x, dict):
        raise PDFTypeError('dict required: %r' % x)
    return x

def stream_value(x):
    '''Return X resolved; raise PDFTypeError unless it is a PDFStream.'''
    x = resolve1(x)
    if not isinstance(x, PDFStream):
        raise PDFTypeError('stream required: %r' % x)
    return x


##  PDFStream type
##
class PDFStream:
    '''
    A stream object: a dictionary plus a blob of data.
    The raw bytes are decoded on the first call to get_data();
    after decoding, rawdata is dropped and data holds the result.
    '''

    def __init__(self, doc, dic, rawdata):
        self.doc = doc
        self.dic = dic
        self.rawdata = rawdata
        self.data = None
        return

    def __repr__(self):
        # A one-element tuple keeps the dict from being taken as a
        # mapping of format arguments.
        return '<PDFStream: %r>' % (self.dic,)

    def decode(self):
        '''
        Decode rawdata into data according to the /Filter entry.

        Only FlateDecode is supported, optionally followed by
        predictor 12 (taken from /DecodeParms).
        Raises NotImplementedError if the document is encrypted and
        PDFValueError for an unsupported filter or predictor.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.doc.crypt:
            # Decryption is not implemented yet.
            raise NotImplementedError
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f == LITERAL_FLATE_DECODE:
                import zlib
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
                # apply predictors
                params = self.dic.get('DecodeParms', {})
                if 'Predictor' in params:
                    pred = int_value(params['Predictor'])
                    # 1 means "no prediction"; 0 is tolerated the same
                    # way (the original code skipped any falsy value).
                    if pred not in (0, 1):
                        if pred != 12:
                            raise PDFValueError('Unsupported predictor: %r' % pred)
                        if 'Columns' not in params:
                            raise PDFValueError('Columns undefined for predictor=12')
                        columns = int_value(params['Columns'])
                        # Each encoded row is one tag byte followed by
                        # `columns` bytes of data.
                        rows = []
                        prev_row = '\x00' * columns
                        for i in xrange(0, len(data), columns+1):
                            tag = data[i]
                            row = data[i+1:i+1+columns]
                            if tag == '\x02':
                                # Add the previous row byte-wise (mod 256).
                                row = ''.join( chr((ord(a)+ord(b)) & 255)
                                               for (a,b) in zip(prev_row, row) )
                            # Rows with any other tag are copied unchanged.
                            rows.append(row)
                            prev_row = row
                        data = ''.join(rows)
            else:
                raise PDFValueError('Invalid filter spec: %r' % f)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def parse_data(self, inline=False, debug=0):
        '''
        Parse the decoded data with a new PDFParser and return
        whatever its parse() method returns.
        '''
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        return PDFParser(self.doc, StringIO(self.get_data()),
                         inline=inline, debug=debug).parse()


##  PDFPage
##
class PDFPage:
    '''
    A single page.

    Attributes:
      doc: the owning document.
      pageid: the index passed as pageidx.
      attrs: the page dictionary.
      parent_attrs: the dictionary used as a fallback for missing keys.
      resources, mediabox: values looked up through get_attr().
      contents: a list built from /Contents (empty if it is absent).
    '''

    def __init__(self, doc, pageidx, attrs, parent_attrs):
        self.doc = doc
        self.pageid = pageidx
        self.attrs = dict_value(attrs)
        self.parent_attrs = parent_attrs
        self.resources = self.get_attr('Resources')
        self.mediabox = self.get_attr('MediaBox')
        # A page without /Contents is treated as having no content
        # streams instead of failing with KeyError.
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
            if not isinstance(contents, list):
                contents = [ contents ]
        else:
            contents = []
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    def get_attr(self, k):
        '''
        Return the value of K from the page dictionary (resolved),
        falling back to parent_attrs.get(K), which is returned as is.
        '''
        if k in self.attrs:
            return resolve1(self.attrs[k])
        return self.parent_attrs.get(k)


##  XRefs

##  PDFXRef
##
class PDFXRef:
    '''
    A classic cross-reference table followed by its trailer dictionary.
    The parser must be positioned just after the "xref" line.
    '''

    def __init__(self, parser):
        # NOTE(review): objid0/objid1/offsets are reassigned for every
        # subsection, so only the last subsection of a table is kept.
        while 1:
            line = parser.nextline()
            if not line:
                raise PDFSyntaxError('premature eof: %r' % parser)
            line = line.strip()
            f = line.split(' ')
            if len(f) != 2:
                # Anything that is not a "start count" header must be
                # the trailer keyword.
                if line != 'trailer':
                    raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
                break
            (start, nobjs) = map(long, f)
            self.objid0 = start
            self.objid1 = start+nobjs
            self.offsets = []
            for objid in xrange(start, start+nobjs):
                line = parser.nextline()
                f = line.strip().split(' ')
                if len(f) != 3:
                    raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                self.offsets.append((int(genno), long(pos), use))
        # read trailer
        self.trailer = dict_value(parser.parse()[0])
        return

    def getpos(self, objid):
        '''
        Return (None, pos) for OBJID.
        Raises IndexError if OBJID is outside this table and
        PDFValueError if its entry is not marked 'n' (in use).
        '''
        if objid < self.objid0 or self.objid1 <= objid:
            raise IndexError
        (genno, pos, use) = self.offsets[objid-self.objid0]
        if use != 'n':
            raise PDFValueError('unused objid=%r' % objid)
        return (None, pos)


##  PDFXRefStream
##
class PDFXRefStream:
    '''
    A cross-reference stream. The stream dictionary doubles as the trailer.
    The parser must be positioned at the start of the stream object.
    '''

    def __init__(self, parser):
        (objid, genno, _, stream) = list_value(parser.parse())
        assert stream.dic['Type'] == LITERAL_XREF
        size = stream.dic['Size']
        (start, nobjs) = stream.dic.get('Index', (0,size))
        self.objid0 = start
        self.objid1 = start+nobjs
        # Byte widths of the three fields of each entry.
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic
        return

    def getpos(self, objid):
        '''
        Return (None, pos) for a type-1 entry, or (strmid, index) for a
        type-2 entry (an object stored inside another stream).
        Raises IndexError if OBJID is outside this section and
        PDFValueError for any other entry type, mirroring PDFXRef.getpos.
        '''
        if objid < self.objid0 or self.objid1 <= objid:
            raise IndexError
        i = self.entlen * (objid-self.objid0)
        ent = self.data[i:i+self.entlen]
        # presumably the second argument is the default used when the
        # field is empty (fl1 == 0) -- TODO confirm against utils.nunpack
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
            return (None, pos)
        elif f1 == 2:
            strmid = nunpack(ent[self.fl1:self.fl1+self.fl2])
            index = nunpack(ent[self.fl1+self.fl2:])
            return (strmid, index)
        # Previously this fell through and returned None, which made
        # the caller fail while unpacking the result.
        raise PDFValueError('unused objid=%r' % objid)


##  PDFDocument
##
class PDFDocument:
    '''
    Holds the cross-reference sections, the catalog and a cache of
    the objects loaded so far.
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.xrefs = []
        self.objs = {}          # objid -> loaded object
        self.parsed_objs = {}   # object stream id -> parsed contents
        self.crypt = None
        self.root = None
        self.catalog = None
        self.outline = None
        self.parser = None
        return

    def set_parser(self, parser):
        '''
        Attach PARSER and read the xref sections and the root object.
        Does nothing if a parser is already attached.
        Raises PDFValueError if no trailer has a /Root entry.
        '''
        if self.parser: return
        self.parser = parser
        self.xrefs = list(parser.read_xref())
        for xref in self.xrefs:
            trailer = xref.trailer
            if 'Encrypt' in trailer:
                self.crypt = dict_value(trailer['Encrypt'])
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
        else:
            raise PDFValueError('no /Root object!')
        return

    def getobj(self, objid):
        '''
        Return the object with id OBJID, loading and caching it if needed.
        Raises PDFValueError if no xref section knows OBJID and
        PDFSyntaxError if the data found does not look as expected.
        '''
        assert self.xrefs
        if objid in self.objs:
            obj = self.objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index) = xref.getpos(objid)
                    break
                except IndexError:
                    # Not covered by this section; try the next one.
                    pass
            else:
                raise PDFValueError('Cannot locate objid=%r' % objid)
            if strmid:
                # The object lives inside an object stream.
                stream = stream_value(self.getobj(strmid))
                if stream.dic['Type'] != LITERAL_OBJSTM:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
                if 'N' not in stream.dic:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                # The cache is keyed on the stream id for both lookup
                # and store, so a stream is parsed only once.
                if strmid in self.parsed_objs:
                    objs = self.parsed_objs[strmid]
                else:
                    objs = stream.parse_data(debug=self.debug)
                    self.parsed_objs[strmid] = objs
                # Skip the N (id, offset) pairs at the head of the stream.
                obj = objs[stream.dic['N']*2+index]
            else:
                # The object is stored directly in the file: parse it
                # at its offset, then restore the parser position.
                pos0 = self.parser.linepos
                self.parser.seek(index)
                seq = list_value(self.parser.parse())
                if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
                    raise PDFSyntaxError('invalid stream spec: %r' % seq)
                obj = seq[3]
                self.parser.seek(pos0)
            if 2 <= self.debug:
                print >>stderr, 'register: objid=%r: %r' % (objid, obj)
            self.objs[objid] = obj
        return obj

    def get_pages(self, debug=0):
        '''
        Generate a PDFPage for every /Page node found by walking the
        tree under catalog['Pages'] depth-first.
        '''
        assert self.xrefs
        def search(obj, parent):
            tree = dict_value(obj)
            if tree['Type'] == LITERAL_PAGES:
                if 1 <= debug:
                    print >>stderr, 'Pages: Kids=%r' % tree['Kids']
                for c in tree['Kids']:
                    for x in search(c, tree):
                        yield x
            elif tree['Type'] == LITERAL_PAGE:
                if 1 <= debug:
                    print >>stderr, 'Page: %r' % tree
                yield (tree, parent)
        for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
            yield PDFPage(self, i, tree, parent)
        return

    def set_root(self, root):
        '''
        Install ROOT as the document catalog.
        Raises PDFValueError if its /Type is not /Catalog.
        '''
        self.root = root
        self.catalog = dict_value(self.root)
        if self.catalog['Type'] != LITERAL_CATALOG:
            raise PDFValueError('Catalog not found!')
        # NOTE(review): the PDF catalog entry is named /Outlines, so this
        # lookup probably always yields None -- confirm before relying on it.
        self.outline = self.catalog.get('Outline')
        return


##  PDFParser
##
class PDFParser(PSStackParser):
    '''
    A PSStackParser that understands indirect references, stream
    objects and (when inline is true) inline images.
    '''

    def __init__(self, doc, fp, inline=False, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.inline = inline
        self.doc = doc
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser: linepos=%d>' % self.linepos

    # Marks the end of inline image data.
    EOIPAT = re.compile(r'\nEI\W')

    def do_token(self, pos, token):
        '''
        Handle one token found at position POS.
        Returns True for the keywords xref, trailer, startxref and
        endobj, and False otherwise.
        '''
        name = keyword_name(token)
        if name in ('xref', 'trailer', 'startxref', 'endobj'):
            return True

        if name == 'R':
            # reference to indirect object
            try:
                (objid, genno) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push(obj)
                if 2 <= self.debug:
                    print >>stderr, 'refer obj: %r' % obj
            except PSSyntaxError:
                # NOTE(review): the error is dropped on purpose here
                # (best effort); the reference is simply not pushed.
                pass

        elif name == 'stream':
            # stream object
            (dic,) = self.pop(1)
            dic = dict_value(dic)
            if 'Length' not in dic:
                raise PDFValueError('/Length is undefined: %r' % dic)
            objlen = int_value(dic['Length'])
            self.seek(pos)
            line = self.nextline()  # 'stream'
            # Read the payload straight from the file, then resume
            # line-based parsing right after it.
            self.fp.seek(pos+len(line))
            data = self.fp.read(objlen)
            self.seek(pos+len(line)+objlen)
            while 1:
                line = self.nextline()
                if not line:
                    raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
                                         (self.linepos, line))
                if line.strip():
                    # The first non-blank line must be endstream.
                    if not line.startswith('endstream'):
                        raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
                                             (self.linepos, line))
                    break
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(self.doc, dic, data)
            self.push(obj)

        elif self.inline and name == 'BI':
            # inline image within a content stream
            self.context.append(('BI', self.partobj))
            self.partobj = []

        elif self.inline and name == 'ID':
            # The objects collected since BI form the image dictionary.
            objs = self.partobj
            (type0, self.partobj) = self.context.pop()
            if len(objs) % 2 != 0:
                raise PSTypeError('invalid dictionary construct: %r' % objs)
            dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
            pos += len('ID ')
            self.fp.seek(pos)
            data = self.fp.read(8192) # XXX how do we know the real length other than scanning?
            m = self.EOIPAT.search(data)
            if m is None:
                # Explicit error instead of an assert, which disappears
                # when Python runs with -O.
                raise PDFSyntaxError('inline image: EI not found: pos=%d' % pos)
            objlen = m.start(0)
            obj = PDFStream(self.doc, dic, data[:objlen])
            self.push(obj)
            self.seek(pos+objlen+len('\nEI'))
            self.push(KEYWORD_EI)

        else:
            self.push(token)

        return False

    def find_xref(self):
        '''
        Scan the file backwards for "startxref" and seek to the offset
        given on the non-blank line that follows it in the file.
        Raises PDFSyntaxError if "startxref" is not found.
        '''
        # find the first xref table
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if line == 'startxref': break
            if line:
                prev = line
        else:
            raise PDFSyntaxError('startxref not found!')
        if 1 <= self.debug:
            print >>stderr, 'xref found: pos=%r' % prev
        self.seek(long(prev))
        return

    # read xref tables and trailers
    def read_xref(self):
        '''
        Generate the xref sections (PDFXRef or PDFXRefStream), starting
        from the one located by find_xref() and following /Prev.
        Raises PDFSyntaxError if a section does not start as expected.
        '''
        self.find_xref()
        while 1:
            # read xref table
            pos0 = self.linepos
            line = self.nextline()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if not line:
                # Indexing an empty line below would raise IndexError.
                raise PDFSyntaxError('premature eof: %r' % self)
            if line[0].isdigit():
                # XRefStream: PDF-1.5
                self.seek(pos0)
                xref = PDFXRefStream(self)
            elif line.strip() != 'xref':
                raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % (self.linepos, line))
            else:
                xref = PDFXRef(self)
            yield xref
            trailer = xref.trailer
            if 1 <= self.debug:
                print >>stderr, 'trailer: %r' % trailer
            # NOTE(review): this seek is overridden by /Prev below, or
            # followed by a break, so the /XRefStm section is never read.
            if 'XRefStm' in trailer:
                self.seek(int_value(trailer['XRefStm']))
            if 'Prev' in trailer:
                # find previous xref
                pos0 = int_value(trailer['Prev'])
                self.seek(pos0)
                if 1 <= self.debug:
                    print >>stderr, 'prev trailer: pos=%d' % pos0
            else:
                break
        return