#!/usr/bin/env python # pdfparser.py, Yusuke Shinyama # ver 0.1, Dec 24 2004- # ver 0.2, Dec 24 2007 # TODO: # - Code Documentation. # - Error handling for invalid type. # - Outlines. # - Named Objects. (pages) # - Writers. # - Linearized PDF. # - Encryption? import sys, re from struct import pack, unpack try: from cStringIO import StringIO except ImportError: from StringIO import StringIO try: import cdb except ImportError: import pycdb as cdb stderr = sys.stderr ## Utilities ## def choplist(n, seq): '''Groups every n elements of the list.''' r = [] for x in seq: r.append(x) if len(r) == n: yield tuple(r) r = [] return def nunpack(s, default=0): '''Unpacks up to 4 bytes.''' l = len(s) if not l: return default elif l == 1: return ord(s) elif l == 2: return unpack('>H', s)[0] elif l == 3: return unpack('>L', '\x00'+s)[0] elif l == 4: return unpack('>L', s)[0] else: return TypeError('invalid length: %d' % l) def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): '''Multiplies two matrices.''' return (a0*a1+c0*b1, b0*a1+d0*b1, a0*c1+c0*d1, b0*c1+d0*d1, a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) def apply_matrix((a,b,c,d,e,f), (x,y)): '''Applies a matrix to a coordination.''' return (a*x+c*y+e, b*x+d*y+f) ## Exceptions ## class PSException(Exception): pass class PSSyntaxError(PSException): pass class PSTypeError(PSException): pass class PSValueError(PSException): pass class PDFException(PSException): pass class PDFSyntaxError(PDFException): pass class PDFEncrypted(PDFException): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass class PDFResourceError(PDFException): pass class PDFInterpreterError(PDFException): pass class PDFFontError(PDFException): pass class PDFUnicodeNotDefined(PDFFontError): pass ## PostScript Types ## class PSLiteral: ''' PS literals (e.g. "/Name"). Caution: Never create these objects directly. Use PSLiteralTable.intern() instead. 
''' def __init__(self, name): self.name = name return def __repr__(self): return '/%s' % self.name class PSKeyword: ''' PS keywords (e.g. "showpage"). Caution: Never create these objects directly. Use PSKeywordTable.intern() instead. ''' def __init__(self, name): self.name = name return def __repr__(self): return self.name class PSSymbolTable: ''' Symbol table that stores PSLiteral or PSKeyword. ''' def __init__(self, classe): self.dic = {} self.classe = classe return def intern(self, name): if name in self.dic: lit = self.dic[name] else: lit = self.classe(name) self.dic[name] = lit return lit PSLiteralTable = PSSymbolTable(PSLiteral) PSKeywordTable = PSSymbolTable(PSKeyword) # some predefined literals and keywords. LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') LITERAL_PDF = PSLiteralTable.intern('PDF') LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_XREF = PSLiteralTable.intern('XRef') LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_EI = PSKeywordTable.intern('EI') ## CMap ## class CMap: def __init__(self, debug=0): self.debug = 0 self.code2cid = {} self.cid2code = {} self.attrs = {} return def __repr__(self): return '' % self.attrs.get('CMapName') def update(self, code2cid=None, cid2code=None): if code2cid: self.code2cid.update(code2cid) if cid2code: self.cid2code.update(cid2code) return self def copycmap(self, cmap): self.code2cid.update(cmap.getall_code2cid()) self.cid2code.update(cmap.getall_cid2code()) return self def register_code2cid(self, code, cid): assert isinstance(code, str) assert isinstance(cid, int) self.code2cid[code] = cid return self def register_cid2code(self, cid, code): from 
glyphlist import charname2unicode assert isinstance(cid, int) if isinstance(code, PSLiteral): code = pack('>H', charname2unicode[code.name]) self.cid2code[cid] = code return self def decode(self, bytes): if self.debug: print >>stderr, 'decode: %r, %r' % (self, bytes) x = '' for c in bytes: if x: if x+c in self.code2cid: yield self.code2cid[x+c] x = '' elif c in self.code2cid: yield self.code2cid[c] else: x = c return def is_vertical(self): return self.attrs.get('WMode', '0') == '1' def tocid(self, code): return self.code2cid.get(code) def tocode(self, cid): return self.cid2code.get(cid) def getall_attrs(self): return self.attrs.iteritems() def getall_code2cid(self): return self.code2cid.iteritems() def getall_cid2code(self): return self.cid2code.iteritems() ## CDBCMap ## class CDBCMap(CMap): def __init__(self, cdbname, debug=0): CMap.__init__(self, debug=debug) self.cdbname = cdbname self.db = cdb.init(cdbname) return def __repr__(self): return '' % (self.db['/CMapName'], self.cdbname) def tocid(self, code): k = 'c'+code if not self.db.has_key(k): return None return unpack('>L', self.db[k]) def tocode(self, cid): k = 'i'+pack('>L', cid) if not self.db.has_key(k): return None return self.db[k] def is_vertical(self): return (self.db.has_key('/WMode') and self.db['/WMode'] == '1') def getall(self, c): while 1: x = self.db.each() if not x: break (k,v) = x if k.startswith(c): yield (k[1:], unpack('>L', v)[0]) return def getall_attrs(self): while 1: x = self.db.each() if not x: break (k,v) = x if k.startswith('/'): yield (k[1:], eval(v)[0]) return def getall_cid2code(self): return self.getall('i') def getall_code2cid(self): return self.getall('c') def decode(self, bytes): if self.debug: print >>stderr, 'decode: %r, %r' % (self, bytes) x = '' for c in bytes: if x: if x+c in self.code2cid: yield self.code2cid[x+c] elif self.db.has_key('c'+x+c): (dest,) = unpack('>L', self.db['c'+x+c]) self.code2cid[x+c] = dest yield dest x = '' elif c in self.code2cid: yield 
self.code2cid[c] elif self.db.has_key('c'+c): (dest,) = unpack('>L', self.db['c'+c]) self.code2cid[c] = dest yield dest else: x = c return ## CMapDB ## class CMapDB: CMAP_ALIAS = { } debug = 0 dirname = None cdbdirname = None cmapdb = {} @classmethod def initialize(klass, dirname, cdbdirname=None, debug=0): klass.dirname = dirname klass.cdbdirname = cdbdirname or dirname klass.debug = debug return @classmethod def get_cmap(klass, cmapname): import os.path cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) if cmapname in klass.cmapdb: cmap = klass.cmapdb[cmapname] else: fname = os.path.join(klass.dirname, cmapname) cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') if os.path.exists(cdbname): if 1 <= klass.debug: print >>stderr, 'Opening: CDBCMap %r...' % cdbname cmap = CDBCMap(cdbname) elif os.path.exists(fname): if 1 <= klass.debug: print >>stderr, 'Reading: CMap %r...' % fname cmap = CMap() fp = file(fname) CMapParser(cmap, fp).parse() fp.close() klass.cmapdb[cmapname] = cmap return cmap ## FontMetricsDB ## class FontMetricsDB: from fontmetrics import FONT_METRICS @classmethod def get_metrics(klass, fontname): return klass.FONT_METRICS[fontname] ## EncodingDB ## class EncodingDB: from glyphlist import charname2unicode from latin_enc import ENCODING std2unicode = {} mac2unicode = {} win2unicode = {} pdf2unicode = {} for (name,std,mac,win,pdf) in ENCODING: c = unichr(charname2unicode[name]) if std: std2unicode[std] = c if mac: mac2unicode[mac] = c if win: win2unicode[win] = c if pdf: pdf2unicode[pdf] = c encodings = { 'StandardEncoding': std2unicode, 'MacRomanEncoding': mac2unicode, 'WinAnsiEncoding': win2unicode, 'PDFDocEncoding': pdf2unicode, } @classmethod def get_encoding(klass, name, diff=None): cid2unicode = klass.encodings.get(name, klass.std2unicode) if diff: cid2unicode = cid2unicode.copy() cid = 0 for x in diff: if isinstance(x, int): cid = x elif isinstance(x, PSLiteral): try: cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name]) 
except KeyError: pass cid += 1 return cid2unicode ## Color Spaces ## LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased') LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN') CS_COMPONENTS = { PSLiteralTable.intern('CalRGB'): 3, PSLiteralTable.intern('CalGray'): 1, PSLiteralTable.intern('Lab'): 3, PSLiteralTable.intern('DeviceRGB'): 3, PSLiteralTable.intern('DeviceCMYK'): 4, PSLiteralTable.intern('DeviceGray'): 1, PSLiteralTable.intern('Separation'): 1, PSLiteralTable.intern('Indexed'): 1, PSLiteralTable.intern('Pattern'): 1, } def cs_params(cs): t = cs[0] if t == LITERAL_ICC_BASED: return stream_value(cs[1]).dic['N'] elif t == LITERAL_DEVICE_N: return len(list_value(cs[1])) else: return CS_COMPONENTS[t] ## PSBaseParser ## class PSBaseParser: '''PostScript parser that performs only basic tokenization.''' def __init__(self, fp, debug=0): self.fp = fp self.debug = debug self.bufsize = 4096 self.seek(0) return def __repr__(self): return '' % (self.fp,) def seek(self, pos): ''' seeks to the given pos. ''' if 2 <= self.debug: print >>stderr, 'seek:', pos self.fp.seek(pos) self.linepos = pos self.linebuf = None self.curpos = 0 self.line = '' return EOLCHAR = re.compile(r'[\r\n]') def nextline(self): ''' fetches the next line that ends either with \\r or \\n. ''' line = '' eol = None while 1: if not self.linebuf or len(self.linebuf) <= self.curpos: # fetch next chunk. self.linebuf = self.fp.read(self.bufsize) if not self.linebuf: # at EOF. 
break self.curpos = 0 if eol: c = self.linebuf[self.curpos] # handle '\r\n' if (eol == '\r' and c == '\n'): line += c self.curpos += 1 break m = self.EOLCHAR.search(self.linebuf, self.curpos) if m: i = m.end(0) line += self.linebuf[self.curpos:i] eol = self.linebuf[i-1] self.curpos = i else: # fetch further line += self.linebuf[self.curpos:] self.linebuf = None self.linepos += len(line) return line def revreadlines(self): ''' fetches lines backword. used to locate trailers. ''' self.fp.seek(0, 2) pos = self.fp.tell() buf = '' while 0 < pos: pos = max(0, pos-self.bufsize) self.fp.seek(pos) s = self.fp.read(self.bufsize) if not s: break while 1: n = max(s.rfind('\r'), s.rfind('\n')) if n == -1: buf = s + buf break yield buf+s[n:] s = s[:n] buf = '' return SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$') STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+') STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.') STRING_HEX = re.compile(r'[\s0-9a-fA-F]+') STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}') def parse(self): ''' Yields a list of basic tokens: keywords, literals, strings, numbers and parentheses. Comments are skipped. Nested objects (i.e. arrays and dictionaries) are not handled. ''' while 1: # do not strip line! 
we need to distinguish last '\n' or '\r' linepos0 = self.linepos self.line = self.nextline() if not self.line: break if 2 <= self.debug: print >>stderr, 'line: (%d) %r' % (self.linepos, self.line) # do this before removing comment if self.line.startswith('%%EOF'): break charpos = 0 # tokenize while 1: m = self.TOKEN.search(self.line, charpos) if not m: break t = m.group(0) pos = linepos0 + m.start(0) charpos = m.end(0) if t == '%': # skip comment if 2 <= self.debug: print >>stderr, 'comment: %r' % self.line[charpos:] break elif t == '/': # literal object mn = self.LITERAL.match(self.line, m.start(0)+1) lit = PSLiteralTable.intern(mn.group(0)) yield (pos, lit) charpos = mn.end(0) if 2 <= self.debug: print >>stderr, 'name: %r' % lit elif t == '(': # normal string object s = '' while 1: ms = self.STRING_NORM.match(self.line, charpos) if not ms: break s1 = ms.group(0) charpos = ms.end(0) if len(s1) == 1 and s1[-1] == '\\': s += s1[-1:] self.line = self.nextline() if not self.line: raise PSSyntaxError('end inside string: linepos=%d, line=%r' % (self.linepos, self.line)) charpos = 0 elif charpos == len(self.line): s += s1 self.line = self.nextline() if not self.line: raise PSSyntaxError('end inside string: linepos=%d, line=%r' % (self.linepos, self.line)) charpos = 0 else: s += s1 break if self.line[charpos] != ')': raise PSSyntaxError('no close paren: linepos=%d, line=%r' % (self.linepos, self.line)) charpos += 1 def convesc(m): x = m.group(0) if x[1:].isdigit(): return chr(int(x[1:], 8)) else: return x[1] s = self.STRING_NORM_SUB.sub(convesc, s) if 2 <= self.debug: print >>stderr, 'str: %r' % s yield (pos, s) elif t == '<': # hex string object ms = self.STRING_HEX.match(self.line, charpos) charpos = ms.end(0) if self.line[charpos] != '>': raise PSSyntaxError('no close paren: linepos=%d, line=%r' % (self.linepos, self.line)) charpos += 1 def convhex(m1): return chr(int(m1.group(0), 16)) s = self.STRING_HEX_SUB.sub(convhex, ms.group(0)) if 2 <= self.debug: print 
>>stderr, 'str: %r' % s yield (pos, s) elif self.NUMBER.match(t): # number if '.' in t: n = float(t) else: n = int(t) if 2 <= self.debug: print >>stderr, 'number: %r' % n yield (pos, n) elif t in ('true','false'): # boolean if 2 <= self.debug: print >>stderr, 'boolean: %r' % t yield (pos, (t == 'true')) else: # other token if 2 <= self.debug: print >>stderr, 'keyword: %r' % t yield (pos, PSKeywordTable.intern(t)) return ## PSStackParser ## class PSStackParser(PSBaseParser): ''' PostScript parser that recognizes compound objects such as arrays and dictionaries. ''' def __init__(self, fp, debug=0): PSBaseParser.__init__(self, fp, debug=debug) self.context = [] self.partobj = None return def do_token(self, pos, token): ''' Handles special tokens. Returns true if the token denotes the end of an object. ''' return False def push(self, obj): ''' Push an object to the stack. ''' self.partobj.append(obj) return def pop(self, n): ''' Pop N objects from the stack. ''' if len(self.partobj) < n: raise PSSyntaxError('stack too short < %d' % n) r = self.partobj[-n:] self.partobj = self.partobj[:-n] return r def popall(self): ''' Discards all the objects on the stack. ''' self.partobj = [] return def parse(self): ''' Yields a list of objects: keywords, literals, strings, numbers, arrays and dictionaries. Arrays and dictionaries are represented as Python sequence and dictionaries. 
''' def startobj(type): self.context.append((type, self.partobj)) self.partobj = [] return def endobj(type1): assert self.context obj = self.partobj (type0, self.partobj) = self.context.pop() if type0 != type1: raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % (type0, self.partobj, type1, obj)) return obj startobj('o') for (pos,t) in PSBaseParser.parse(self): if isinstance(t, int) or isinstance(t, float): self.push(t) elif isinstance(t, str): self.push(t) elif isinstance(t, PSLiteral): self.push(t) else: c = keyword_name(t) if c == '{' or c == '}': self.push(t) elif c == '[': # begin array if 2 <= self.debug: print >>stderr, 'start array' startobj('a') elif c == ']': # end array a = endobj('a') if 2 <= self.debug: print >>stderr, 'end array: %r' % a self.push(a) elif c == '<<': # begin dictionary if 2 <= self.debug: print >>stderr, 'start dict' startobj('d') elif c == '>>': # end dictionary objs = endobj('d') if len(objs) % 2 != 0: raise PSTypeError('invalid dictionary construct: %r' % objs) d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) if 2 <= self.debug: print >>stderr, 'end dict: %r' % d self.push(d) elif self.do_token(pos, t): break return endobj('o') ## CMapParser ## class CMapParser(PSStackParser): def __init__(self, cmap, fp, debug=0): PSStackParser.__init__(self, fp, debug=debug) self.cmap = cmap self.in_cmap = False return def do_token(self, pos, token): name = token.name if name == 'begincmap': self.in_cmap = True self.popall() return elif name == 'endcmap': self.in_cmap = False return if not self.in_cmap: return # if name == 'def': try: (k,v) = self.pop(2) self.cmap.attrs[literal_name(k)] = v except PSSyntaxError: pass return if name == 'usecmap': try: (cmapname,) = self.pop(1) self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) except PSSyntaxError: pass return if name == 'begincodespacerange': self.popall() return if name == 'endcodespacerange': if 1 <= self.debug: print >>stderr, 'codespace: %r' % self.partobj 
self.popall() return if name == 'begincidrange': self.popall() return if name == 'endcidrange': for (s,e,cid) in choplist(3, self.partobj): assert isinstance(s, str) assert isinstance(e, str) assert isinstance(cid, int) assert len(s) == len(e) sprefix = s[:-4] eprefix = e[:-4] assert sprefix == eprefix svar = s[-4:] evar = e[-4:] s1 = nunpack(svar) e1 = nunpack(evar) vlen = len(svar) assert s1 <= e1 for i in xrange(e1-s1+1): x = sprefix+pack('>L',s1+i)[-vlen:] self.cmap.register_code2cid(x, cid+i) self.popall() return if name == 'begincidchar': self.popall() return if name == 'endcidchar': for (cid,code) in choplist(2, self.partobj): assert isinstance(code, str) assert isinstance(cid, str) self.cmap.register_code2cid(code, nunpack(cid)) self.popall() return if name == 'beginbfrange': self.popall() return if name == 'endbfrange': for (s,e,code) in choplist(3, self.partobj): assert isinstance(s, str) assert isinstance(e, str) assert len(s) == len(e) s1 = nunpack(s) e1 = nunpack(e) assert s1 <= e1 if isinstance(code, list): for i in xrange(e1-s1+1): self.cmap.register_cid2code(s1+i, code[i]) else: var = code[-4:] base = nunpack(var) prefix = code[:-4] vlen = len(var) for i in xrange(e1-s1+1): x = prefix+pack('>L',base+i)[-vlen:] self.cmap.register_cid2code(s1+i, x) self.popall() return if name == 'beginbfchar': self.popall() return if name == 'endbfchar': for (cid,code) in choplist(2, self.partobj): assert isinstance(cid, str) assert isinstance(code, str) self.cmap.register_cid2code(nunpack(cid), code) self.popall() return if name == 'beginnotdefrange': self.popall() return if name == 'endnotdefrange': if 1 <= self.debug: print >>stderr, 'notdefrange: %r' % self.partobj self.popall() return return ## PDFStream type ## class PDFStream: def __init__(self, doc, dic, rawdata): self.doc = doc self.dic = dic self.rawdata = rawdata self.data = None return def __repr__(self): return '' % (self.dic) def decode(self): assert self.data == None and self.rawdata != None data = 
self.rawdata if self.doc.crypt: # func DECRYPT is not implemented yet... raise NotImplementedError data = DECRYPT(self.doc.crypt, data) if 'Filter' not in self.dic: self.data = data self.rawdata = None return filters = self.dic['Filter'] if not isinstance(filters, list): filters = [ filters ] for f in filters: if f == LITERAL_FLATE_DECODE: import zlib # will get errors if the document is encrypted. data = zlib.decompress(data) # apply predictors params = self.dic.get('DecodeParms', {}) if 'Predictor' in params: pred = int_value(params['Predictor']) if pred: if pred != 12: raise PDFValueError('Unsupported predictor: %r' % pred) if 'Columns' not in params: raise PDFValueError('Columns undefined for predictor=12') columns = int_value(params['Columns']) buf = '' ent0 = '\x00' * columns for i in xrange(0, len(data), columns+1): pred = data[i] ent1 = data[i+1:i+1+columns] if pred == '\x02': ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) buf += ent1 ent0 = ent1 data = buf else: raise PDFValueError('Invalid filter spec: %r' % f) self.data = data self.rawdata = None return def get_data(self): if self.data == None: self.decode() return self.data def parse_data(self, inline=False, debug=0): return PDFParser(self.doc, StringIO(self.get_data()), inline=inline, debug=debug).parse() ## PDFObjRef ## class PDFObjRef: def __init__(self, doc, objid, genno): if objid == 0: raise PDFValueError('objid cannot be 0.') self.doc = doc self.objid = objid #self.genno = genno # Never used. return def __repr__(self): return '' % (self.objid) def resolve(self): return self.doc.getobj(self.objid) # resolve def resolve1(x): ''' Resolve an object. If this is an array or dictionary, it may still contains some indirect objects inside. ''' while isinstance(x, PDFObjRef): x = x.resolve() return x def resolveall(x): ''' Recursively resolve X and all the internals. Make sure there is no indirect reference within the nested object. This procedure might be slow. 
Do not used it unless you really need it. ''' while isinstance(x, PDFObjRef): x = x.resolve() if isinstance(x, list): x = [ resolveall(v) for v in x ] elif isinstance(x, dict): for (k,v) in x.iteritems(): x[k] = resolveall(v) return x # Type cheking def literal_name(x): x = resolve1(x) if not isinstance(x, PSLiteral): raise PDFTypeError('literal required: %r' % x) return x.name def keyword_name(x): x = resolve1(x) if not isinstance(x, PSKeyword): raise PDFTypeError('keyword required: %r' % x) return x.name def str_value(x): x = resolve1(x) if not isinstance(x, str): raise PDFTypeError('string required: %r' % x) return x def int_value(x): x = resolve1(x) if not isinstance(x, int): raise PDFTypeError('integer required: %r' % x) return x def float_value(x): x = resolve1(x) if not isinstance(x, float): raise PDFTypeError('float required: %r' % x) return x def num_value(x): x = resolve1(x) if not (isinstance(x, int) or isinstance(x, float)): raise PDFTypeError('int or float required: %r' % x) return x def list_value(x): x = resolve1(x) if not isinstance(x, list): raise PDFTypeError('list required: %r' % x) return x def dict_value(x): x = resolve1(x) if not isinstance(x, dict): raise PDFTypeError('dict required: %r' % x) return x def stream_value(x): x = resolve1(x) if not isinstance(x, PDFStream): raise PDFTypeError('stream required: %r' % x) return x ## PDFPage ## class PDFPage: def __init__(self, doc, pageidx, attrs, parent_attrs): self.doc = doc self.pageid = pageidx self.attrs = dict_value(attrs) self.parent_attrs = parent_attrs self.resources = self.get_attr('Resources') self.mediabox = self.get_attr('MediaBox') contents = resolve1(self.attrs['Contents']) if not isinstance(contents, list): contents = [ contents ] self.contents = contents return def __repr__(self): return '' % (self.resources, self.mediabox) def get_attr(self, k): if k in self.attrs: return resolve1(self.attrs[k]) return self.parent_attrs.get(k) ## XRefs ## PDFXRef ## class PDFXRef: def 
__init__(self, parser): while 1: line = parser.nextline() if not line: raise PDFSyntaxError('premature eof: %r' % parser) line = line.strip() f = line.split(' ') if len(f) != 2: if line != 'trailer': raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) break (start, nobjs) = map(long, f) self.objid0 = start self.objid1 = start+nobjs self.offsets = [] for objid in xrange(start, start+nobjs): line = parser.nextline() f = line.strip().split(' ') if len(f) != 3: raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line)) (pos, genno, use) = f self.offsets.append((int(genno), long(pos), use)) # read trailer self.trailer = dict_value(parser.parse()[0]) return def getpos(self, objid): if objid < self.objid0 or self.objid1 <= objid: raise IndexError (genno, pos, use) = self.offsets[objid-self.objid0] if use != 'n': raise PDFValueError('unused objid=%r' % objid) return (None, pos) ## PDFXRefStream ## class PDFXRefStream: def __init__(self, parser): (objid, genno, _, stream) = list_value(parser.parse()) assert stream.dic['Type'] == LITERAL_XREF size = stream.dic['Size'] (start, nobjs) = stream.dic.get('Index', (0,size)) self.objid0 = start self.objid1 = start+nobjs (self.fl1, self.fl2, self.fl3) = stream.dic['W'] self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.dic return def getpos(self, objid): if objid < self.objid0 or self.objid1 <= objid: raise IndexError i = self.entlen * (objid-self.objid0) ent = self.data[i:i+self.entlen] f1 = nunpack(ent[:self.fl1], 1) if f1 == 1: pos = nunpack(ent[self.fl1:self.fl1+self.fl2]) genno = nunpack(ent[self.fl1+self.fl2:]) return (None, pos) elif f1 == 2: objid = nunpack(ent[self.fl1:self.fl1+self.fl2]) index = nunpack(ent[self.fl1+self.fl2:]) return (objid, index) ## PDFDocument ## class PDFDocument: def __init__(self, debug=0): self.debug = debug self.xrefs = [] self.objs = {} self.parsed_objs = {} self.crypt = None self.root = None self.catalog = None 
self.parser = None return def set_parser(self, parser): if self.parser: return self.parser = parser self.xrefs = list(parser.read_xref()) for xref in self.xrefs: trailer = xref.trailer if 'Encrypt' in trailer: self.crypt = dict_value(trailer['Encrypt']) if 'Root' in trailer: self.set_root(dict_value(trailer['Root'])) break else: raise PDFValueError('no /Root object!') return def getobj(self, objid): assert self.xrefs if objid in self.objs: obj = self.objs[objid] else: for xref in self.xrefs: try: (strmid, index) = xref.getpos(objid) break except IndexError: pass else: raise PDFValueError('Cannot locate objid=%r' % objid) if strmid: stream = stream_value(self.getobj(strmid)) if stream.dic['Type'] != LITERAL_OBJSTM: raise PDFSyntaxError('Not a stream object: %r' % stream) if 'N' not in stream.dic: raise PDFSyntaxError('N is not defined: %r' % stream) if strmid in self.parsed_objs: objs = self.parsed_objs[stream] else: objs = stream.parse_data(self.debug) self.parsed_objs[stream] = objs obj = objs[stream.dic['N']*2+index] else: pos0 = self.parser.linepos self.parser.seek(index) seq = list_value(self.parser.parse()) if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ): raise PDFSyntaxError('invalid stream spec: %r' % seq) obj = seq[3] self.parser.seek(pos0) if 2 <= self.debug: print >>stderr, 'register: objid=%r: %r' % (objid, obj) self.objs[objid] = obj return obj def get_pages(self, debug=0): assert self.xrefs def search(obj, parent): tree = dict_value(obj) if tree['Type'] == LITERAL_PAGES: if 1 <= debug: print >>stderr, 'Pages: Kids=%r' % tree['Kids'] for c in tree['Kids']: for x in search(c, tree): yield x elif tree['Type'] == LITERAL_PAGE: if 1 <= debug: print >>stderr, 'Page: %r' % tree yield (tree, parent) for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)): yield PDFPage(self, i, tree, parent) return def set_root(self, root): self.root = root self.catalog = dict_value(self.root) if self.catalog['Type'] != 
LITERAL_CATALOG: raise PDFValueError('Catalog not found!') self.outline = self.catalog.get('Outline') return ## PDFParser ## class PDFParser(PSStackParser): def __init__(self, doc, fp, inline=False, debug=0): PSStackParser.__init__(self, fp, debug=debug) self.inline = inline self.doc = doc self.doc.set_parser(self) return def __repr__(self): return '' % self.linepos EOIPAT = re.compile(r'\nEI\W') def do_token(self, pos, token): name = keyword_name(token) if name in ('xref', 'trailer', 'startxref', 'endobj'): return True if name == 'R': # reference to indirect object try: (objid, genno) = self.pop(2) (objid, genno) = (int(objid), int(genno)) obj = PDFObjRef(self.doc, objid, genno) self.push(obj) if 2 <= self.debug: print >>stderr, 'refer obj: %r' % obj except PSSyntaxError: pass elif name == 'stream': # stream object (dic,) = self.pop(1) dic = dict_value(dic) if 'Length' not in dic: raise PDFValueError('/Length is undefined: %r' % dic) objlen = int_value(dic['Length']) self.seek(pos) line = self.nextline() # 'stream' self.fp.seek(pos+len(line)) data = self.fp.read(objlen) self.seek(pos+len(line)+objlen) while 1: line = self.nextline() if not line: raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' % (self.linepos, line)) if line.strip(): if not line.startswith('endstream'): raise PDFSyntaxError('need endstream: linepos=%d, line=%r' % (self.linepos, line)) break if 1 <= self.debug: print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' 
% \ (pos, objlen, dic, data[:10]) obj = PDFStream(self.doc, dic, data) self.push(obj) elif self.inline and name == 'BI': # inline image within a content stream self.context.append(('BI', self.partobj)) self.partobj = [] elif self.inline and name == 'ID': objs = self.partobj (type0, self.partobj) = self.context.pop() if len(objs) % 2 != 0: raise PSTypeError('invalid dictionary construct: %r' % objs) dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) pos += len('ID ') self.fp.seek(pos) data = self.fp.read(8192) # XXX how do we know the real length other than scanning? m = self.EOIPAT.search(data) assert m objlen = m.start(0) obj = PDFStream(self.doc, dic, data[:objlen]) self.push(obj) self.seek(pos+objlen+len('\nEI')) self.push(KEYWORD_EI) else: self.push(token) return False def find_xref(self): # find the first xref table prev = None for line in self.revreadlines(): line = line.strip() if 2 <= self.debug: print >>stderr, 'line: %r' % line if line == 'startxref': break if line: prev = line else: raise PDFSyntaxError('startxref not found!') if 1 <= self.debug: print >>stderr, 'xref found: pos=%r' % prev self.seek(long(prev)) return # read xref tables and trailers def read_xref(self): self.find_xref() while 1: # read xref table pos0 = self.linepos line = self.nextline() if 2 <= self.debug: print >>stderr, 'line: %r' % line if line[0].isdigit(): # XRefStream: PDF-1.5 self.seek(pos0) xref = PDFXRefStream(self) elif line.strip() != 'xref': raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % (self.linepos, line)) else: xref = PDFXRef(self) yield xref trailer = xref.trailer if 1 <= self.debug: print >>stderr, 'trailer: %r' % trailer if 'XRefStm' in trailer: self.seek(int_value(trailer['XRefStm'])) if 'Prev' in trailer: # find previous xref pos0 = int_value(trailer['Prev']) self.seek(pos0) if 1 <= self.debug: print >>stderr, 'prev trailer: pos=%d' % pos0 else: break return ## Fonts ## # PDFFont class PDFFont: def __init__(self, fontid, descriptor, 
widths, default_width=None): self.fontid = fontid self.descriptor = descriptor self.widths = widths self.fontname = descriptor['FontName'] if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) self.ascent = descriptor['Ascent'] self.descent = descriptor['Descent'] self.default_width = default_width or descriptor.get('MissingWidth', 0) self.leading = descriptor.get('Leading', 0) self.bbox = descriptor['FontBBox'] return def __repr__(self): return '' % (self.fontid,) def is_vertical(self): return False def decode(self, bytes): return map(ord, bytes) def char_width(self, cid): return self.widths.get(cid, self.default_width) def char_disp(self, cid): return 0 def string_width(self, s): return sum( self.char_width(cid) for cid in self.decode(s) ) # PDFSimpleFont class PDFSimpleFont(PDFFont): def __init__(self, fontid, descriptor, widths, spec): # Font encoding is specified either by a name of # built-in encoding or a dictionary that describes # the differences. 
if 'Encoding' in spec: encoding = resolve1(spec['Encoding']) else: encoding = LITERAL_STANDARD_ENCODING if isinstance(encoding, dict): name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) diff = encoding.get('Differences', None) self.encoding = EncodingDB.get_encoding(name, diff) else: self.encoding = EncodingDB.get_encoding(literal_name(encoding)) self.ucs2_cmap = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.ucs2_cmap = CMap() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() PDFFont.__init__(self, fontid, descriptor, widths) return def to_unicode(self, cid): if not self.ucs2_cmap: try: return self.encoding[cid] except KeyError: raise PDFUnicodeNotDefined(None, cid) code = self.ucs2_cmap.tocode(cid) if not code: raise PDFUnicodeNotDefined(None, cid) chars = unpack('>%dH' % (len(code)/2), code) return ''.join( unichr(c) for c in chars ) # PDFType1Font class PDFType1Font(PDFSimpleFont): def __init__(self, fontid, spec): if 'BaseFont' not in spec: raise PDFFontError('BaseFont is missing') self.basefont = literal_name(spec['BaseFont']) try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: try: descriptor = dict_value(spec['FontDescriptor']) firstchar = int_value(spec['FirstChar']) lastchar = int_value(spec['LastChar']) widths = dict( (i+firstchar,w) for (i,w) in enumerate(list_value(spec['Widths'])) ) except KeyError, k: raise PDFFontError('%s is missing' % k) PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) return # PDFTrueTypeFont class PDFTrueTypeFont(PDFType1Font): pass # PDFType3Font class PDFType3Font(PDFSimpleFont): def __init__(self, fontid, spec): try: firstchar = int_value(spec['FirstChar']) lastchar = int_value(spec['LastChar']) widths = dict( (i+firstchar,w) for (i,w) in enumerate(list_value(spec['Widths'])) ) except KeyError, k: raise PDFFontError('%s is missing' % k) if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) 
## TrueTypeFont
##
class TrueTypeFont:
    '''
    Reads the table directory and the 'cmap' table of a TrueType
    font program given as a seekable file object.
    '''

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        '''
        Reads the table directory into self.tables.

        name: font name; used only to name the CMap built by create_cmap().
        fp: seekable file object positioned at the start of the font data.
        '''
        self.name = name
        self.fp = fp
        self.tables = {}
        fp.read(4)  # leading 4-byte font type field; its value is not used.
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            # The tag gets its own local so it does not shadow "name".
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        '''
        Builds a CMap that maps character codes to glyph ids and back.

        Only cmap subtable formats 0, 2 and 4 are read; subtables in
        any other format are skipped.
        Raises TrueTypeFont.CMapNotFound when the font has no 'cmap' table.
        '''
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for _ in range(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                # One glyph id per single-byte character code.
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                # Each key is (subheader index * 8); remember which
                # first byte selects which subheader.
                firstbytes = [0]*8192
                for (i, k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode, entcount, delta, offset) = unpack('>HHhH', fp.read(8))
                    # "offset" is counted from the position of the
                    # offset field itself (the last 2 bytes just read).
                    hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
                for (i, firstcode, entcount, delta, pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() always returns a tuple; the value
                        # itself is needed for the arithmetic below.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            # Keep the result within 16 bits, as the
                            # format-4 branch below does.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # 2 bytes between the end codes and the start codes.
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        # NOTE(review): the range offset is applied from
                        # the start of the offset array for every segment;
                        # confirm against the cmap format 4 specification.
                        fp.seek(pos+idr)
                        for c in range(sc, ec+1):
                            char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char)) for (char, gid) in char2gid.items() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)
## Resource Manager
##
class PDFResourceManager:

    '''
    Keeps shared resources so that large objects are built only once.
    Font objects are cached by font id and handed back on later requests.
    '''

    def __init__(self, debug=0):
        self.fonts = {}
        self.debug = debug
        return

    def get_procset(self, procs):
        '''Walks the given procedure sets; every entry is accepted.'''
        for proc in procs:
            if proc == LITERAL_PDF or proc == LITERAL_TEXT:
                continue
            # Any other procedure set is tolerated on purpose:
            # no PDFResourceError is raised for it.
        return

    def get_cmap(self, name):
        '''Looks up a CMap by name through CMapDB.'''
        return CMapDB.get_cmap(name)

    def get_font(self, fontid, spec):
        '''
        Returns the font object for fontid, building it from spec
        on the first request and caching it afterwards.
        Raises PDFFontError when Subtype is missing or not recognized.
        '''
        if fontid in self.fonts:
            return self.fonts[fontid]
        spec = dict_value(spec)
        assert spec['Type'] == LITERAL_FONT
        # Create a Font object.
        if 'Subtype' not in spec:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
        # Subtypes that map directly onto one font class.
        builders = {
            'Type1': PDFType1Font,
            'MMType1': PDFType1Font,
            'TrueType': PDFTrueTypeFont,
            'Type3': PDFType3Font,
            'CIDFontType0': PDFCIDFont,
            'CIDFontType2': PDFCIDFont,
        }
        if subtype in builders:
            font = builders[subtype](fontid, spec)
        elif subtype == 'Type0':
            # A Type0 font is built from its single descendant font,
            # with Encoding and ToUnicode taken from the parent spec.
            descendants = list_value(spec['DescendantFonts'])
            assert len(descendants) == 1
            subspec = dict_value(descendants[0]).copy()
            for key in ('Encoding', 'ToUnicode'):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(fontid, subspec)
        else:
            raise PDFFontError('Invalid Font: %r' % spec)
        self.fonts[fontid] = font
        return font
def pop(self, n):
    '''
    Removes the last n objects from the argument stack and
    returns them as a list, in their original order.

    When n is zero or negative nothing is removed and an empty list
    is returned. (Slicing with -0 would otherwise select the whole
    stack and then clear it.)
    '''
    if n <= 0:
        return []
    x = self.argstack[-n:]
    self.argstack = self.argstack[:-n]
    return x
# selectfont
def do_Tf(self, fontid, fontsize):
    '''
    Selects the font registered under fontid in the font map and
    records the font size in the text state.
    Raises PDFInterpreterError when the font id is not in the font map.
    '''
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError:
        raise PDFInterpreterError('Undefined font id: %r' % fontid)
    state = self.textstate
    state.font = font
    state.fontsize = fontsize
    return
# nextline
def do_T_a(self):
    '''
    T* operator: moves to the start of the next text line.

    The text state keeps the leading as set by TL (and by TD, which
    stores -ty). The next line lies below the current one, so the
    leading is subtracted from the vertical offset of the text matrix.
    The position within the line is reset.
    '''
    (a,b,c,d,e,f) = self.textstate.matrix
    self.textstate.matrix = (a,b,c,d,e,f-self.textstate.leading)
    self.textstate.linematrix = (0, 0)
    return
## PDFDevice
##
class PDFDevice:
    '''
    Base class for output devices driven by PDFPageInterpreter.

    The interpreter reports the current transformation matrix through
    set_ctm(), brackets each run of contents with begin_block() and
    end_block(), and passes text to render_string(). Subclasses must
    override render_string(); the block hooks do nothing by default.
    '''

    def __init__(self, rsrc):
        self.rsrc = rsrc
        self.ctm = None
        return

    def __repr__(self):
        # Name the concrete class so that subclasses are
        # distinguishable in debug output.
        return '<%s>' % self.__class__.__name__

    def set_ctm(self, ctm):
        '''Stores the current transformation matrix.'''
        self.ctm = ctm
        return

    def begin_block(self, name):
        '''Called before a run of contents is rendered.'''
        return

    def end_block(self):
        '''Called after a run of contents has been rendered.'''
        return

    def render_string(self, textstate, textmatrix, size, seq):
        '''Outputs one text sequence; must be provided by subclasses.'''
        raise NotImplementedError
render_string(self, textstate, textmatrix, size, seq): font = textstate.font spwidth = int(-font.char_width(32) * 0.6) # space width buf = '' for x in seq: if isinstance(x, int) or isinstance(x, float): if not font.is_vertical() and x <= spwidth: buf += ' ' else: chars = font.decode(x) for cid in chars: try: char = font.to_unicode(cid) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args char = u'[%s:%d]' % (cidcoding, cid) buf += char (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) skewed = (b != 0 or c != 0) if font.is_vertical(): size = -size tag = 'vtext' else: tag = 'htext' if skewed: tag += ' skewed' s = buf.encode(self.codec, 'xmlcharrefreplace') (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) def f(x): return '%.03f' % x self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s\n' % (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag)) return # main def main(argv): import getopt def usage(): print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0] return 100 try: (opts, args) = getopt.getopt(argv[1:], 'dvp:c:') except getopt.GetoptError: return usage() if not args: return usage() (debug, verbose) = (0, 0) cmapdir = 'CMap' cdbcmapdir = 'CDBCMap' codec = 'ascii' pages = set() for (k, v) in opts: if k == '-d': debug += 1 elif k == '-v': verbose += 1 elif k == '-p': pages.add(int(v)) elif k == '-c': codec = v # CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) rsrc = PDFResourceManager(debug=debug) device = TextConverter(rsrc, codec) for fname in args: doc = PDFDocument(debug=debug) fp = file(fname) parser = PDFParser(doc, fp, debug=debug) interpreter = PDFPageInterpreter(rsrc, device, debug=debug) for (i,page) in enumerate(doc.get_pages(debug=debug)): if pages and (i not in pages): continue interpreter.process_page(page) fp.close() return if __name__ == '__main__': sys.exit(main(sys.argv))