From 6d93b4a7f7f51e00b89af6136dce240a9b1f4e59 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Mon, 31 Dec 2007 03:41:45 +0000 Subject: [PATCH] split files. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@4 1aa58f4a-7d42-0410-adbc-911cccaed67c --- cmap.py | 383 +++++++++++ pdf2txt.py | 111 +++ pdfinterp.py | 827 +++++++++++++++++++++++ pdfparser.py | 1834 +++----------------------------------------------- psparser.py | 396 +++++++++++ utils.py | 29 + 6 files changed, 1825 insertions(+), 1755 deletions(-) create mode 100644 cmap.py create mode 100755 pdf2txt.py create mode 100644 pdfinterp.py create mode 100644 psparser.py create mode 100644 utils.py diff --git a/cmap.py b/cmap.py new file mode 100644 index 0000000..d08a299 --- /dev/null +++ b/cmap.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python +import sys +stderr = sys.stderr +from struct import pack, unpack +from utils import choplist, nunpack +from psparser import PSException, PSSyntaxError, PSTypeError, \ + PSLiteral, PSKeyword, literal_name, keyword_name, \ + PSStackParser +try: + import cdb +except ImportError: + import pycdb as cdb + + +## CMap +## +class CMap: + + def __init__(self, debug=0): + self.debug = debug + self.code2cid = {} + self.cid2code = {} + self.attrs = {} + return + + def __repr__(self): + return '' % self.attrs.get('CMapName') + + def update(self, code2cid=None, cid2code=None): + if code2cid: + self.code2cid.update(code2cid) + if cid2code: + self.cid2code.update(cid2code) + return self + + def copycmap(self, cmap): + self.code2cid.update(cmap.getall_code2cid()) + self.cid2code.update(cmap.getall_cid2code()) + return self + + def register_code2cid(self, code, cid): + assert isinstance(code, str) + assert isinstance(cid, int) + self.code2cid[code] = cid + return self + + def register_cid2code(self, cid, code): + from glyphlist import charname2unicode + assert isinstance(cid, int) + if isinstance(code, PSLiteral): + code = pack('>H', charname2unicode[code.name]) + self.cid2code[cid] = code + return self + + def decode(self, bytes): + if self.debug: + print >>stderr, 'decode: %r, %r' % (self, bytes) + x = '' + for c in bytes: + if x: + if x+c in self.code2cid: + yield self.code2cid[x+c] + x = '' + elif c in self.code2cid: + yield self.code2cid[c] + else: + x = c + return + + def is_vertical(self): + return self.attrs.get('WMode', '0') == '1' + + def tocid(self, code): + return self.code2cid.get(code) + def tocode(self, cid): + return self.cid2code.get(cid) + + def getall_attrs(self): + return self.attrs.iteritems() + def getall_code2cid(self): + return self.code2cid.iteritems() + def getall_cid2code(self): + return self.cid2code.iteritems() + + +## CDBCMap +## +class CDBCMap(CMap): + + def __init__(self, cdbname, debug=0): + CMap.__init__(self, debug=debug) + self.cdbname = cdbname + self.db = cdb.init(cdbname) + return + + def __repr__(self): + return '' % (self.db['/CMapName'], self.cdbname) + + def tocid(self, code): + k = 'c'+code + if not self.db.has_key(k): + return None + return unpack('>L', self.db[k]) + def tocode(self, cid): + k = 'i'+pack('>L', cid) + if not self.db.has_key(k): + return None + return self.db[k] + + def is_vertical(self): + return (self.db.has_key('/WMode') and + self.db['/WMode'] == '1') + + def getall(self, c): + while 1: + x = self.db.each() + if not x: break + (k,v) = x + if k.startswith(c): + yield (k[1:], unpack('>L', v)[0]) + return + + def getall_attrs(self): + while 1: + x = self.db.each() + if not x: break + (k,v) = x + if k.startswith('/'): + yield (k[1:], eval(v)[0]) + return + + def getall_cid2code(self): + return self.getall('i') + def getall_code2cid(self): + return self.getall('c') + + def decode(self, bytes): + if self.debug: + print >>stderr, 'decode: %r, %r' % (self, bytes) + x = '' + for c in bytes: + if x: + if x+c in self.code2cid: + yield self.code2cid[x+c] + elif self.db.has_key('c'+x+c): + (dest,) = unpack('>L', self.db['c'+x+c]) + self.code2cid[x+c] = dest + yield dest + x = '' + elif c in self.code2cid: + yield self.code2cid[c] + elif self.db.has_key('c'+c): + (dest,) = unpack('>L', self.db['c'+c]) + self.code2cid[c] = dest + yield dest + else: + x = c + return + + +## CMapDB +## +class CMapDB: + + CMAP_ALIAS = { + } + + debug = 0 + dirname = None + cdbdirname = None + cmapdb = {} + + @classmethod + def initialize(klass, dirname, cdbdirname=None, debug=0): + klass.dirname = dirname + klass.cdbdirname = cdbdirname or dirname + klass.debug = debug + return + + @classmethod + def get_cmap(klass, cmapname): + import os.path + cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) + if cmapname in klass.cmapdb: + cmap = klass.cmapdb[cmapname] + else: + fname = os.path.join(klass.dirname, cmapname) + cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') + if os.path.exists(cdbname): + if 1 <= klass.debug: + print >>stderr, 'Opening: CDBCMap %r...' % cdbname + cmap = CDBCMap(cdbname) + elif os.path.exists(fname): + if 1 <= klass.debug: + print >>stderr, 'Reading: CMap %r...' % fname + cmap = CMap() + fp = file(fname) + CMapParser(cmap, fp).parse() + fp.close() + klass.cmapdb[cmapname] = cmap + return cmap + + +## CMapParser +## +class CMapParser(PSStackParser): + + def __init__(self, cmap, fp, debug=0): + PSStackParser.__init__(self, fp, debug=debug) + self.cmap = cmap + self.in_cmap = False + return + + def do_token(self, _, token): + name = token.name + if name == 'begincmap': + self.in_cmap = True + self.popall() + return + elif name == 'endcmap': + self.in_cmap = False + return + if not self.in_cmap: return + # + if name == 'def': + try: + (k,v) = self.pop(2) + self.cmap.attrs[literal_name(k)] = v + except PSSyntaxError: + pass + return + + if name == 'usecmap': + try: + (cmapname,) = self.pop(1) + self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) + except PSSyntaxError: + pass + return + + if name == 'begincodespacerange': + self.popall() + return + if name == 'endcodespacerange': + if 1 <= self.debug: + print >>stderr, 'codespace: %r' % self.partobj + self.popall() + return + + if name == 'begincidrange': + self.popall() + return + if name == 'endcidrange': + for (s,e,cid) in choplist(3, self.partobj): + assert isinstance(s, str) + assert isinstance(e, str) + assert isinstance(cid, int) + assert len(s) == len(e) + sprefix = s[:-4] + eprefix = e[:-4] + assert sprefix == eprefix + svar = s[-4:] + evar = e[-4:] + s1 = nunpack(svar) + e1 = nunpack(evar) + vlen = len(svar) + assert s1 <= e1 + for i in xrange(e1-s1+1): + x = sprefix+pack('>L',s1+i)[-vlen:] + self.cmap.register_code2cid(x, cid+i) + self.popall() + return + + if name == 'begincidchar': + self.popall() + return + if name == 'endcidchar': + for (cid,code) in choplist(2, self.partobj): + assert isinstance(code, str) + assert isinstance(cid, str) + self.cmap.register_code2cid(code, nunpack(cid)) + self.popall() + return + + if name == 'beginbfrange': + self.popall() + return + if name == 'endbfrange': + for (s,e,code) in choplist(3, self.partobj): + assert isinstance(s, str) + assert isinstance(e, str) + assert len(s) == len(e) + s1 = nunpack(s) + e1 = nunpack(e) + assert s1 <= e1 + if isinstance(code, list): + for i in xrange(e1-s1+1): + self.cmap.register_cid2code(s1+i, code[i]) + else: + var = code[-4:] + base = nunpack(var) + prefix = code[:-4] + vlen = len(var) + for i in xrange(e1-s1+1): + x = prefix+pack('>L',base+i)[-vlen:] + self.cmap.register_cid2code(s1+i, x) + self.popall() + return + + if name == 'beginbfchar': + self.popall() + return + if name == 'endbfchar': + for (cid,code) in choplist(2, self.partobj): + assert isinstance(cid, str) + assert isinstance(code, str) + self.cmap.register_cid2code(nunpack(cid), code) + self.popall() + return + + if name == 'beginnotdefrange': + self.popall() + return + if name == 'endnotdefrange': + if 1 <= self.debug: + print >>stderr, 'notdefrange: %r' % self.partobj + self.popall() + return + + return + + +## FontMetricsDB +## +class FontMetricsDB: + from fontmetrics import FONT_METRICS + + @classmethod + def get_metrics(klass, fontname): + return klass.FONT_METRICS[fontname] + + +## EncodingDB +## +class EncodingDB: + + from glyphlist import charname2unicode + from latin_enc import ENCODING + + std2unicode = {} + mac2unicode = {} + win2unicode = {} + pdf2unicode = {} + for (name,std,mac,win,pdf) in ENCODING: + c = unichr(charname2unicode[name]) + if std: std2unicode[std] = c + if mac: mac2unicode[mac] = c + if win: win2unicode[win] = c + if pdf: pdf2unicode[pdf] = c + + encodings = { + 'StandardEncoding': std2unicode, + 'MacRomanEncoding': mac2unicode, + 'WinAnsiEncoding': win2unicode, + 'PDFDocEncoding': pdf2unicode, + } + + @classmethod + def get_encoding(klass, name, diff=None): + cid2unicode = klass.encodings.get(name, klass.std2unicode) + if diff: + cid2unicode = cid2unicode.copy() + cid = 0 + for x in diff: + if isinstance(x, int): + cid = x + elif isinstance(x, PSLiteral): + try: + cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name]) + except KeyError: + pass + cid += 1 + return cid2unicode diff --git a/pdf2txt.py b/pdf2txt.py new file mode 100755 index 0000000..c52daad --- /dev/null +++ b/pdf2txt.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +import sys +stdout = sys.stdout +stderr = sys.stderr +from pdfparser import PDFDocument, PDFParser +from pdfinterp import PDFDevice, PDFResourceManager, \ + PDFPageInterpreter, PDFUnicodeNotDefined, \ + mult_matrix, apply_matrix +from cmap import CMapDB + + +## TextConverter +## +class TextConverter(PDFDevice): + + def __init__(self, outfp, rsrc, codec): + PDFDevice.__init__(self, rsrc) + self.outfp = outfp + self.codec = codec + return + + def close(self): + self.outfp.write('\n') + return + + def begin_block(self, name): + self.outfp.write('\n' % name) + return + def end_block(self): + self.outfp.write('\n') + return + + def render_string(self, textstate, textmatrix, size, seq): + font = textstate.font + spwidth = int(-font.char_width(32) * 0.6) # space width + buf = '' + for x in seq: + if isinstance(x, int) or isinstance(x, float): + if not font.is_vertical() and x <= spwidth: + buf += ' ' + else: + chars = font.decode(x) + for cid in chars: + try: + char = font.to_unicode(cid) + except PDFUnicodeNotDefined, e: + (cidcoding, cid) = e.args + char = u'[%s:%d]' % (cidcoding, cid) + buf += char + (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) + skewed = (b != 0 or c != 0) + if font.is_vertical(): + size = -size + tag = 'vtext' + else: + tag = 'htext' + if skewed: + tag += ' skewed' + s = buf.encode(self.codec, 'xmlcharrefreplace') + (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) + def f(x): return '%.03f' % x + self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s\n' % + (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag)) + return + + +# pdf2txt +def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): + device = TextConverter(outfp, rsrc, codec) + doc = PDFDocument(debug=debug) + fp = file(fname) + parser = PDFParser(doc, fp, debug=debug) + interpreter = PDFPageInterpreter(rsrc, device, debug=debug) + for (i,page) in enumerate(doc.get_pages(debug=debug)): + if pages and (i not in pages): continue + interpreter.process_page(page) + fp.close() + device.close() + return + + +# main +def main(argv): + import getopt + def usage(): + print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0] + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'dvp:c:') + except getopt.GetoptError: + return usage() + if not args: return usage() + debug = 0 + cmapdir = 'CMap' + cdbcmapdir = 'CDBCMap' + codec = 'ascii' + pages = set() + outfp = stdout + for (k, v) in opts: + if k == '-d': debug += 1 + elif k == '-p': pages.add(int(v)) + elif k == '-o': outfp = file(v, 'wb') + elif k == '-c': codec = v + # + CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) + rsrc = PDFResourceManager(debug=debug) + for fname in args: + pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug) + return + +if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfinterp.py b/pdfinterp.py new file mode 100644 index 0000000..d24f848 --- /dev/null +++ b/pdfinterp.py @@ -0,0 +1,827 @@ +#!/usr/bin/env python +import sys +stderr = sys.stderr +from struct import pack, unpack +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from psparser import PSException, PSSyntaxError, PSTypeError, \ + PSStackParser, PSLiteral, PSKeyword, \ + PSLiteralTable, PSKeywordTable, literal_name, keyword_name +from pdfparser import resolve1, int_value, float_value, num_value, \ + str_value, list_value, dict_value, stream_value, PDFException +from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB + + +## Exceptions +## +class PDFResourceError(PDFException): pass +class PDFInterpreterError(PDFException): pass +class PDFFontError(PDFException): pass +class PDFUnicodeNotDefined(PDFFontError): pass + + +## Constants +## +LITERAL_PDF = PSLiteralTable.intern('PDF') +LITERAL_TEXT = PSLiteralTable.intern('Text') +LITERAL_FONT = PSLiteralTable.intern('Font') +LITERAL_FORM = PSLiteralTable.intern('Form') +LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') +LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') +LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') +LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') +LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased') +LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN') +MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) +CS_COMPONENTS = { + PSLiteralTable.intern('CalRGB'): 3, + PSLiteralTable.intern('CalGray'): 1, + PSLiteralTable.intern('Lab'): 3, + PSLiteralTable.intern('DeviceRGB'): 3, + PSLiteralTable.intern('DeviceCMYK'): 4, + PSLiteralTable.intern('DeviceGray'): 1, + PSLiteralTable.intern('Separation'): 1, + PSLiteralTable.intern('Indexed'): 1, + PSLiteralTable.intern('Pattern'): 1, + } + + +## Matrix operations +## +def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): + '''Multiplies two matrices.''' + return (a0*a1+c0*b1, b0*a1+d0*b1, + a0*c1+c0*d1, b0*c1+d0*d1, + a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) + +def apply_matrix((a,b,c,d,e,f), (x,y)): + '''Applies a matrix to a coordination.''' + return (a*x+c*y+e, b*x+d*y+f) + +def cs_params(cs): + t = cs[0] + if t == LITERAL_ICC_BASED: + return stream_value(cs[1]).dic['N'] + elif t == LITERAL_DEVICE_N: + return len(list_value(cs[1])) + else: + return CS_COMPONENTS[t] + + +## Fonts +## + +# PDFFont +class PDFFont: + + def __init__(self, fontid, descriptor, widths, default_width=None): + self.fontid = fontid + self.descriptor = descriptor + self.widths = widths + self.fontname = descriptor['FontName'] + if isinstance(self.fontname, PSLiteral): + self.fontname = literal_name(self.fontname) + self.ascent = descriptor['Ascent'] + self.descent = descriptor['Descent'] + self.default_width = default_width or descriptor.get('MissingWidth', 0) + self.leading = descriptor.get('Leading', 0) + self.bbox = descriptor['FontBBox'] + return + + def __repr__(self): + return '' % (self.fontid,) + + def is_vertical(self): + return False + + def decode(self, bytes): + return map(ord, bytes) + + def char_width(self, cid): + return self.widths.get(cid, self.default_width) + + def char_disp(self, cid): + return 0 + + def string_width(self, s): + return sum( self.char_width(cid) for cid in self.decode(s) ) + + +# PDFSimpleFont +class PDFSimpleFont(PDFFont): + + def __init__(self, fontid, descriptor, widths, spec): + # Font encoding is specified either by a name of + # built-in encoding or a dictionary that describes + # the differences. + if 'Encoding' in spec: + encoding = resolve1(spec['Encoding']) + else: + encoding = LITERAL_STANDARD_ENCODING + if isinstance(encoding, dict): + name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) + diff = encoding.get('Differences', None) + self.encoding = EncodingDB.get_encoding(name, diff) + else: + self.encoding = EncodingDB.get_encoding(literal_name(encoding)) + self.ucs2_cmap = None + if 'ToUnicode' in spec: + strm = stream_value(spec['ToUnicode']) + self.ucs2_cmap = CMap() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() + PDFFont.__init__(self, fontid, descriptor, widths) + return + + def to_unicode(self, cid): + if not self.ucs2_cmap: + try: + return self.encoding[cid] + except KeyError: + raise PDFUnicodeNotDefined(None, cid) + code = self.ucs2_cmap.tocode(cid) + if not code: + raise PDFUnicodeNotDefined(None, cid) + chars = unpack('>%dH' % (len(code)/2), code) + return ''.join( unichr(c) for c in chars ) + + +# PDFType1Font +class PDFType1Font(PDFSimpleFont): + + def __init__(self, fontid, spec): + if 'BaseFont' not in spec: + raise PDFFontError('BaseFont is missing') + self.basefont = literal_name(spec['BaseFont']) + try: + (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) + except KeyError: + try: + descriptor = dict_value(spec['FontDescriptor']) + firstchar = int_value(spec['FirstChar']) + lastchar = int_value(spec['LastChar']) + widths = dict( (i+firstchar,w) for (i,w) + in enumerate(list_value(spec['Widths'])) ) + except KeyError, k: + raise PDFFontError('%s is missing' % k) + PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) + return + +# PDFTrueTypeFont +class PDFTrueTypeFont(PDFType1Font): + pass + +# PDFType3Font +class PDFType3Font(PDFSimpleFont): + def __init__(self, fontid, spec): + try: + firstchar = int_value(spec['FirstChar']) + lastchar = int_value(spec['LastChar']) + widths = dict( (i+firstchar,w) for (i,w) + in enumerate(list_value(spec['Widths'])) ) + except KeyError, k: + raise PDFFontError('%s is missing' % k) + if 'FontDescriptor' in spec: + descriptor = dict_value(spec['FontDescriptor']) + else: + descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0, + 'FontBBox':spec['FontBBox']} + PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) + return + +# PDFCIDFont + +## TrueTypeFont +## +class TrueTypeFont: + + class CMapNotFound(Exception): pass + + def __init__(self, name, fp): + self.name = name + self.fp = fp + self.tables = {} + fonttype = fp.read(4) + (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + for i in xrange(ntables): + (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) + self.tables[name] = (offset, length) + return + + def create_cmap(self): + if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound + (base_offset, length) = self.tables['cmap'] + fp = self.fp + fp.seek(base_offset) + (version, nsubtables) = unpack('>HH', fp.read(4)) + subtables = [] + for i in xrange(nsubtables): + subtables.append(unpack('>HHL', fp.read(8))) + char2gid = {} + # Only supports subtable type 0, 2 and 4. + for (_1, _2, st_offset) in subtables: + fp.seek(base_offset+st_offset) + (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) + if fmttype == 0: + char2gid.update(enumerate(unpack('>256B', fp.read(256)))) + elif fmttype == 2: + subheaderkeys = unpack('>256H', fp.read(512)) + firstbytes = [0]*8192 + for (i,k) in enumerate(subheaderkeys): + firstbytes[k/8] = i + nhdrs = max(subheaderkeys)/8 + 1 + hdrs = [] + for i in xrange(nhdrs): + (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) + hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) + for (i,firstcode,entcount,delta,pos) in hdrs: + if not entcount: continue + first = firstcode + (firstbytes[i] << 8) + fp.seek(pos) + for c in xrange(entcount): + gid = unpack('>H', fp.read(2)) + if gid: + gid += delta + char2gid[first+c] = gid + elif fmttype == 4: + (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + segcount /= 2 + ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) + fp.read(2) + scs = unpack('>%dH' % segcount, fp.read(2*segcount)) + idds = unpack('>%dh' % segcount, fp.read(2*segcount)) + pos = fp.tell() + idrs = unpack('>%dH' % segcount, fp.read(2*segcount)) + for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): + if idr: + fp.seek(pos+idr) + for c in xrange(sc, ec+1): + char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff + else: + for c in xrange(sc, ec+1): + char2gid[c] = (c + idd) & 0xffff + gid2char = dict( (gid, pack('>H', char)) + for (char,gid) in char2gid.iteritems() ) + cmapname = 'Adobe-Identity-UCS-%s' % self.name + return CMap(cmapname).update(char2gid, gid2char) + +class PDFCIDFont(PDFFont): + + def __init__(self, fontid, spec): + if 'BaseFont' not in spec: + raise PDFFontError('BaseFont is missing') + try: + self.cidsysteminfo = dict_value(spec['CIDSystemInfo']) + self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'], + self.cidsysteminfo['Ordering']) + except KeyError: + raise PDFFontError('CIDSystemInfo not properly defined.') + self.basefont = literal_name(spec['BaseFont']) + self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) + descriptor = dict_value(spec['FontDescriptor']) + ttf = None + if 'FontFile2' in descriptor: + self.fontfile = stream_value(descriptor.get('FontFile2')) + ttf = TrueTypeFont(self.basefont, + StringIO(self.fontfile.get_data())) + self.ucs2_cmap = None + if 'ToUnicode' in spec: + strm = stream_value(spec['ToUnicode']) + self.ucs2_cmap = CMap() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() + elif self.cidcoding == 'Adobe-Identity': + if ttf: + try: + self.ucs2_cmap = ttf.create_cmap() + except TrueTypeFont.CMapNotFound: + pass + else: + self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding) + + def get_width(seq): + dic = {} + char1 = char2 = None + for v in seq: + if char1 == None: + char1 = v + elif char2 == None and isinstance(v, int): + char2 = v + else: + if char2 == None: + for (i,w) in enumerate(v): + dic[char1+i] = w + else: + for i in xrange(char1, char2+1): + dic[i] = v + char1 = char2 = None + return dic + self.vertical = self.cmap.is_vertical() + if self.vertical: + # writing mode: vertical + dic = get_width(list_value(spec.get('W2', []))) + widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() ) + self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() ) + (d,w) = spec.get('DW2', [880, -1000]) + default_width = w + self.default_disp = d + else: + # writing mode: horizontal + widths = get_width(list_value(spec.get('W', []))) + self.disps = {} + default_width = spec.get('DW', 1000) + self.default_disp = 0 + PDFFont.__init__(self, fontid, descriptor, widths, default_width) + return + + def is_vertical(self): + return self.vertical + + def decode(self, bytes): + return self.cmap.decode(bytes) + + def char_disp(self, cid): + return self.disps.get(cid, self.default_disp) + + def to_unicode(self, cid): + if not self.ucs2_cmap: + raise PDFUnicodeNotDefined(self.cidcoding, cid) + code = self.ucs2_cmap.tocode(cid) + if not code: + raise PDFUnicodeNotDefined(self.cidcoding, cid) + chars = unpack('>%dH' % (len(code)/2), code) + return ''.join( unichr(c) for c in chars ) + + +## Resource Manager +## +class PDFResourceManager: + + ''' + ResourceManager facilitates reuse of shared resources + such as fonts, images and cmaps so that large objects are not + allocated multiple times. + ''' + + def __init__(self, debug=0): + self.debug = debug + self.fonts = {} + return + + def get_procset(self, procs): + for proc in procs: + if proc == LITERAL_PDF: + pass + elif proc == LITERAL_TEXT: + pass + else: + #raise PDFResourceError('ProcSet %r is not supported.' % proc) + pass + return + + def get_cmap(self, name): + return CMapDB.get_cmap(name) + + def get_font(self, fontid, spec): + if fontid in self.fonts: + font = self.fonts[fontid] + else: + spec = dict_value(spec) + assert spec['Type'] == LITERAL_FONT + # Create a Font object. + if 'Subtype' not in spec: + raise PDFFontError('Font Subtype is not specified.') + subtype = literal_name(spec['Subtype']) + if subtype in ('Type1', 'MMType1'): + # Type1 Font + font = PDFType1Font(fontid, spec) + elif subtype == 'TrueType': + # TrueType Font + font = PDFTrueTypeFont(fontid, spec) + elif subtype == 'Type3': + # Type3 Font + font = PDFType3Font(fontid, spec) + elif subtype in ('CIDFontType0', 'CIDFontType2'): + # CID Font + font = PDFCIDFont(fontid, spec) + elif subtype == 'Type0': + # Type0 Font + dfonts = list_value(spec['DescendantFonts']) + assert len(dfonts) == 1 + subspec = dict_value(dfonts[0]).copy() + for k in ('Encoding', 'ToUnicode'): + if k in spec: + subspec[k] = resolve1(spec[k]) + font = self.get_font(fontid, subspec) + else: + raise PDFFontError('Invalid Font: %r' % spec) + self.fonts[fontid] = font + return font + + +## PDFDevice +## +class PDFDevice: + + def __init__(self, rsrc): + self.rsrc = rsrc + self.ctm = None + return + + def __repr__(self): + return '' + + def close(self): + return + + def set_ctm(self, ctm): + self.ctm = ctm + return + + def begin_block(self, name): + return + def end_block(self): + return + + def render_string(self, textstate, textmatrix, size, seq): + raise NotImplementedError + + +## Interpreter +## +class PDFPageInterpreter: + + class TextState: + def __init__(self): + self.font = None + self.fontsize = 0 + self.charspace = 0 + self.wordspace = 0 + self.scaling = 100 + self.leading = 0 + self.render = 0 + self.rise = 0 + self.reset() + return + def __repr__(self): + return ('' % + (self.font, self.fontsize, self.matrix, + self.charspace, self.wordspace, self.scaling, self.leading, + self.render, self.rise)) + def reset(self): + self.matrix = MATRIX_IDENTITY + self.linematrix = (0, 0) + return + + def __init__(self, rsrc, device, debug=0): + self.rsrc = rsrc + self.device = device + self.debug = debug + return + + def initpage(self, ctm): + self.fontmap = {} + self.xobjmap = {} + self.csmap = {} + # gstack: stack for graphical states. + self.gstack = [] + self.ctm = ctm + self.device.set_ctm(self.ctm) + self.textstate = PDFPageInterpreter.TextState() + # argstack: stack for command arguments. + self.argstack = [] + # set some global states. + self.scs = None + self.ncs = None + return + + def push(self, obj): + self.argstack.append(obj) + return + + def pop(self, n): + x = self.argstack[-n:] + self.argstack = self.argstack[:-n] + return x + + def get_current_state(self): + return (self.ctm, self.textstate) + + def set_current_state(self, state): + (self.ctm, self.textstate) = state + self.device.set_ctm(self.ctm) + return + + # gsave + def do_q(self): + self.gstack.append(self.get_current_state()) + return + # grestore + def do_Q(self): + if self.gstack: + self.set_current_state(self.gstack.pop()) + return + + # concat-matrix + def do_cm(self, a1, b1, c1, d1, e1, f1): + self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm) + self.device.set_ctm(self.ctm) + return + + # setlinewidth + def do_w(self, width): return + # setlinecap + def do_J(self, cap): return + # setlinejoin + def do_j(self, join): return + # setmiterlimit + def do_M(self, limit): return + # setdash + def do_d(self, dash, phase): return + # setintent + def do_ri(self, intent): return + # setflatness + def do_i(self, flatness): return + # savedict + def do_gs(self, name): return + + # moveto + def do_m(self, x, y): return + # lineto + def do_l(self, x, y): return + # curveto + def do_c(self, x1, y1, x2, y2, x3, y3): return + # urveto + def do_v(self, x2, y2, x3, y3): return + # rveto + def do_y(self, x1, y1, x3, y3): return + # closepath + def do_h(self): return + # rectangle + def do_re(self, x, y, w, h): return + + # stroke + def do_S(self): return + # close-and-stroke + def do_s(self): return + # fill + def do_f(self): return + # fill (obsolete) + do_F = do_f + # fill-even-odd + def do_f_a(self): return + # fill-and-stroke + def do_B(self): return + # fill-and-stroke-even-odd + def do_B_a(self): return + # close-fill-and-stroke + def do_b(self): return + # close-fill-and-stroke-even-odd + def do_b_a(self): return + # close-only + def do_n(self): return + # clip + def do_W(self): return + # clip-even-odd + def do_W_a(self): return + + # setcolorspace-stroking + def do_CS(self, name): + self.scs = self.csmap.get(literal_name(name), None) + return + # setcolorspace-non-strokine + def do_cs(self, name): + self.ncs = self.csmap.get(literal_name(name), None) + return + # setgray-stroking + def do_G(self, gray): + self.do_CS(LITERAL_DEVICE_GRAY) + return + # setgray-non-stroking + def do_g(self, gray): + self.do_cs(LITERAL_DEVICE_GRAY) + return + # setrgb-stroking + def do_RG(self, r, g, b): + self.do_CS(LITERAL_DEVICE_RGB) + return + # setrgb-non-stroking + def do_rg(self, r, g, b): + self.do_cs(LITERAL_DEVICE_RGB) + return + # setcmyk-stroking + def do_K(self, c, m, y, k): + self.do_CS(LITERAL_DEVICE_CMYK) + return + # setcmyk-non-stroking + def do_k(self, c, m, y, k): + self.do_cs(LITERAL_DEVICE_CMYK) + return + + # setcolor + def do_SCN(self): + n = cs_params(self.scs) + self.pop(n) + return + def do_scn(self): + n = cs_params(self.ncs) + self.pop(n) + return + def do_SC(self): + self.do_SCN() + return + def do_sc(self): + self.do_scn() + return + + # sharing-name + def do_sh(self, name): return + + # begin-text + def do_BT(self): + self.textstate.reset() + return + # end-text + def do_ET(self): + return + + # begin-compat + def do_BX(self): return + # end-compat + def do_EX(self): return + + # marked content operators + def do_MP(self, tag): return + def do_DP(self, tag, props): return + def do_BMC(self, tag): return + def do_BDC(self, tag, props): return + def do_EMC(self): return + + # setcharspace + def do_Tc(self, space): + self.textstate.charspace = space + return + # setwordspace + def do_Tw(self, space): + self.textstate.wordspace = space + return + # textscale + def do_Tz(self, scale): + self.textstate.scaling = scale + return + # setleading + def do_TL(self, leading): + self.textstate.leading = leading + return + # selectfont + def do_Tf(self, fontid, fontsize): + try: + self.textstate.font = self.fontmap[literal_name(fontid)] + except KeyError: + raise PDFInterpreterError('Undefined font id: %r' % fontid) + self.textstate.fontsize = fontsize + return + # setrendering + def do_Tr(self, render): + self.textstate.render = render + return + # settextrise + def do_Ts(self, rise): + self.textstate.rise = rise + return + + # text-move + def do_Td(self, tx, ty): + (a,b,c,d,e,f) = self.textstate.matrix + self.textstate.matrix = (a,b,c,d,e+tx,f+ty) + self.textstate.linematrix = (0, 0) + return + # text-move + def do_TD(self, tx, ty): + (a,b,c,d,e,f) = self.textstate.matrix + self.textstate.matrix = (a,b,c,d,e+tx,f+ty) + self.textstate.leading = -ty + self.textstate.linematrix = (0, 0) + return + # textmatrix + def do_Tm(self, a,b,c,d,e,f): + self.textstate.matrix = (a,b,c,d,e,f) + self.textstate.linematrix = (0, 0) + return + # nextline + def do_T_a(self): + (a,b,c,d,e,f) = self.textstate.matrix + self.textstate.matrix = (a,b,c,d,e,f+self.textstate.leading) + self.textstate.linematrix = (0, 0) + return + + # show-pos + def do_TJ(self, seq): + textstate = self.textstate + font = textstate.font + (a,b,c,d,e,f) = textstate.matrix + (lx,ly) = textstate.linematrix + s = ''.join( x for x in seq if isinstance(x, str) ) + n = sum( x for x in seq if not isinstance(x, str) ) + w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize + + len(s) * textstate.charspace + + s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0 + self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq) + if font.is_vertical(): + ly += w + else: + lx += w + textstate.linematrix = (lx,ly) + return + # show + def do_Tj(self, s): + self.do_TJ([s]) + return + # quote + def do__q(self, s): + self.do_T_a() + self.do_TJ([s]) + return + # doublequote + def do__w(self, aw, ac, s): + self.do_Tw(aw) + self.do_Tc(ac) + self.do_TJ([s]) + return + + # inline image + def do_BI(self): # never called + return + def do_ID(self): # never called + return + def do_EI(self, obj): + return + + # invoke an XObject + def do_Do(self, xobjid): + xobjid = literal_name(xobjid) + try: + xobj = stream_value(self.xobjmap[xobjid]) + except KeyError: + raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) + if xobj.dic['Subtype'] == LITERAL_FORM: + if 1 <= self.debug: + print >>stderr, 'Processing xobj: %r' % xobj + interpreter = PDFPageInterpreter(self.rsrc, self.device) + interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj], + xobj.dic.get('Matrix', MATRIX_IDENTITY)) + return + + def process_page(self, page): + if 1 <= self.debug: + print >>stderr, 'Processing page: %r' % page + self.render_contents('page-%d' % page.pageid, page.resources, page.contents) + return + + def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY): + self.initpage(ctm) + self.device.begin_block(contid) + # Handle resource declarations. + for (k,v) in dict_value(resources).iteritems(): + if 1 <= self.debug: + print >>stderr, 'Resource: %r: %r' % (k,v) + if k == 'Font': + for (fontid,fontrsrc) in dict_value(v).iteritems(): + self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) + elif k == 'ColorSpace': + for (csid,csspec) in dict_value(v).iteritems(): + self.csmap[csid] = list_value(csspec) + elif k == 'ProcSet': + self.rsrc.get_procset(list_value(v)) + elif k == 'XObject': + for (xobjid,xobjstrm) in dict_value(v).iteritems(): + self.xobjmap[xobjid] = xobjstrm + for stream in list_value(contents): + self.execute(stream_value(stream)) + self.device.end_block() + return + + def execute(self, stream): + for obj in stream.parse_data(inline=True, debug=self.debug): + if isinstance(obj, PSKeyword): + name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q') + if hasattr(self, name): + func = getattr(self, name) + nargs = func.func_code.co_argcount-1 + if nargs: + args = self.pop(nargs) + if 1 <= self.debug: + print >>stderr, 'exec: %s %r' % (obj.name, args) + if len(args) == nargs: + func(*args) + else: + if 1 <= self.debug: + print >>stderr, 'exec: %s' % (obj.name) + func() + else: + raise PDFInterpreterError('unknown operator: %r' % obj.name) + else: + self.push(obj) + return diff --git a/pdfparser.py b/pdfparser.py index ed5d289..a12c7a1 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -15,858 +15,121 @@ # - Encryption? import sys, re -from struct import pack, unpack -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO -try: - import cdb -except ImportError: - import pycdb as cdb stderr = sys.stderr +from utils import choplist, nunpack +from psparser import PSException, PSSyntaxError, PSTypeError, \ + PSLiteral, PSKeyword, \ + PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ + PSStackParser -## Utilities +## PDF Exceptions ## -def choplist(n, seq): - '''Groups every n elements of the list.''' - r = [] - for x in seq: - r.append(x) - if len(r) == n: - yield tuple(r) - r = [] - return - -def nunpack(s, default=0): - '''Unpacks up to 4 bytes.''' - l = len(s) - if not l: - return default - elif l == 1: - return ord(s) - elif l == 2: - return unpack('>H', s)[0] - elif l == 3: - return unpack('>L', '\x00'+s)[0] - elif l == 4: - return unpack('>L', s)[0] - else: - return TypeError('invalid length: %d' % l) - -def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): - '''Multiplies two matrices.''' - return (a0*a1+c0*b1, b0*a1+d0*b1, - a0*c1+c0*d1, b0*c1+d0*d1, - a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) - -def apply_matrix((a,b,c,d,e,f), (x,y)): - '''Applies a matrix to a coordination.''' - return (a*x+c*y+e, b*x+d*y+f) - - -## Exceptions -## -class PSException(Exception): pass -class PSSyntaxError(PSException): pass -class PSTypeError(PSException): pass -class PSValueError(PSException): pass class PDFException(PSException): pass class PDFSyntaxError(PDFException): pass class PDFEncrypted(PDFException): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass -class PDFResourceError(PDFException): pass -class PDFInterpreterError(PDFException): pass -class PDFFontError(PDFException): pass -class PDFUnicodeNotDefined(PDFFontError): pass -## PostScript Types -## -class PSLiteral: - ''' - PS literals (e.g. "/Name"). - Caution: Never create these objects directly. - Use PSLiteralTable.intern() instead. - ''' - def __init__(self, name): - self.name = name - return - def __repr__(self): - return '/%s' % self.name - -class PSKeyword: - ''' - PS keywords (e.g. "showpage"). - Caution: Never create these objects directly. - Use PSKeywordTable.intern() instead. - ''' - def __init__(self, name): - self.name = name - return - def __repr__(self): - return self.name - -class PSSymbolTable: - ''' - Symbol table that stores PSLiteral or PSKeyword. - ''' - def __init__(self, classe): - self.dic = {} - self.classe = classe - return - - def intern(self, name): - if name in self.dic: - lit = self.dic[name] - else: - lit = self.classe(name) - self.dic[name] = lit - return lit - -PSLiteralTable = PSSymbolTable(PSLiteral) -PSKeywordTable = PSSymbolTable(PSKeyword) - # some predefined literals and keywords. LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') -LITERAL_PDF = PSLiteralTable.intern('PDF') -LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_XREF = PSLiteralTable.intern('XRef') -LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_PAGE = PSLiteralTable.intern('Page') -LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') -LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_EI = PSKeywordTable.intern('EI') -## CMap +## PDFObjRef ## -class CMap: +class PDFObjRef: - def __init__(self, debug=0): - self.debug = 0 - self.code2cid = {} - self.cid2code = {} - self.attrs = {} + def __init__(self, doc, objid, genno): + if objid == 0: + raise PDFValueError('objid cannot be 0.') + self.doc = doc + self.objid = objid + #self.genno = genno # Never used. return def __repr__(self): - return '' % self.attrs.get('CMapName') + return '' % (self.objid) - def update(self, code2cid=None, cid2code=None): - if code2cid: - self.code2cid.update(code2cid) - if cid2code: - self.cid2code.update(cid2code) - return self - - def copycmap(self, cmap): - self.code2cid.update(cmap.getall_code2cid()) - self.cid2code.update(cmap.getall_cid2code()) - return self - - def register_code2cid(self, code, cid): - assert isinstance(code, str) - assert isinstance(cid, int) - self.code2cid[code] = cid - return self - - def register_cid2code(self, cid, code): - from glyphlist import charname2unicode - assert isinstance(cid, int) - if isinstance(code, PSLiteral): - code = pack('>H', charname2unicode[code.name]) - self.cid2code[cid] = code - return self - - def decode(self, bytes): - if self.debug: - print >>stderr, 'decode: %r, %r' % (self, bytes) - x = '' - for c in bytes: - if x: - if x+c in self.code2cid: - yield self.code2cid[x+c] - x = '' - elif c in self.code2cid: - yield self.code2cid[c] - else: - x = c - return - - def is_vertical(self): - return self.attrs.get('WMode', '0') == '1' - - def tocid(self, code): - return self.code2cid.get(code) - def tocode(self, cid): - return self.cid2code.get(cid) - - def getall_attrs(self): - return self.attrs.iteritems() - def getall_code2cid(self): - return self.code2cid.iteritems() - def getall_cid2code(self): - return self.cid2code.iteritems() - - -## CDBCMap -## -class CDBCMap(CMap): - - def __init__(self, cdbname, debug=0): - CMap.__init__(self, debug=debug) - self.cdbname = cdbname - self.db = cdb.init(cdbname) - return - - def __repr__(self): - return '' % (self.db['/CMapName'], self.cdbname) - - def tocid(self, code): - k = 'c'+code - if not self.db.has_key(k): - return None - return unpack('>L', self.db[k]) - def tocode(self, cid): - k = 'i'+pack('>L', cid) - if not self.db.has_key(k): - return None - return self.db[k] - - def is_vertical(self): - return (self.db.has_key('/WMode') and - self.db['/WMode'] == '1') - - def getall(self, c): - while 1: - x = self.db.each() - if not x: break - (k,v) = x - if k.startswith(c): - yield (k[1:], unpack('>L', v)[0]) - return - - def getall_attrs(self): - while 1: - x = self.db.each() - if not x: break - (k,v) = x - if k.startswith('/'): - yield (k[1:], eval(v)[0]) - return - - def getall_cid2code(self): - return self.getall('i') - def getall_code2cid(self): - return self.getall('c') - - def decode(self, bytes): - if self.debug: - print >>stderr, 'decode: %r, %r' % (self, bytes) - x = '' - for c in bytes: - if x: - if x+c in self.code2cid: - yield self.code2cid[x+c] - elif self.db.has_key('c'+x+c): - (dest,) = unpack('>L', self.db['c'+x+c]) - self.code2cid[x+c] = dest - yield dest - x = '' - elif c in self.code2cid: - yield self.code2cid[c] - elif self.db.has_key('c'+c): - (dest,) = unpack('>L', self.db['c'+c]) - self.code2cid[c] = dest - yield dest - else: - x = c - return + def resolve(self): + return self.doc.getobj(self.objid) -## CMapDB -## -class CMapDB: - - CMAP_ALIAS = { - } - - debug = 0 - dirname = None - cdbdirname = None - cmapdb = {} - - @classmethod - def initialize(klass, dirname, cdbdirname=None, debug=0): - klass.dirname = dirname - klass.cdbdirname = cdbdirname or dirname - klass.debug = debug - return - - @classmethod - def get_cmap(klass, cmapname): - import os.path - cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) - if cmapname in klass.cmapdb: - cmap = klass.cmapdb[cmapname] - else: - fname = os.path.join(klass.dirname, cmapname) - cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') - if os.path.exists(cdbname): - if 1 <= klass.debug: - print >>stderr, 'Opening: CDBCMap %r...' % cdbname - cmap = CDBCMap(cdbname) - elif os.path.exists(fname): - if 1 <= klass.debug: - print >>stderr, 'Reading: CMap %r...' % fname - cmap = CMap() - fp = file(fname) - CMapParser(cmap, fp).parse() - fp.close() - klass.cmapdb[cmapname] = cmap - return cmap - - -## FontMetricsDB -## -class FontMetricsDB: - from fontmetrics import FONT_METRICS - - @classmethod - def get_metrics(klass, fontname): - return klass.FONT_METRICS[fontname] - - -## EncodingDB -## -class EncodingDB: - - from glyphlist import charname2unicode - from latin_enc import ENCODING - std2unicode = {} - mac2unicode = {} - win2unicode = {} - pdf2unicode = {} - for (name,std,mac,win,pdf) in ENCODING: - c = unichr(charname2unicode[name]) - if std: std2unicode[std] = c - if mac: mac2unicode[mac] = c - if win: win2unicode[win] = c - if pdf: pdf2unicode[pdf] = c - encodings = { - 'StandardEncoding': std2unicode, - 'MacRomanEncoding': mac2unicode, - 'WinAnsiEncoding': win2unicode, - 'PDFDocEncoding': pdf2unicode, - } - - @classmethod - def get_encoding(klass, name, diff=None): - cid2unicode = klass.encodings.get(name, klass.std2unicode) - if diff: - cid2unicode = cid2unicode.copy() - cid = 0 - for x in diff: - if isinstance(x, int): - cid = x - elif isinstance(x, PSLiteral): - try: - cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name]) - except KeyError: - pass - cid += 1 - return cid2unicode - - -## Color Spaces -## -LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') -LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') -LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') -LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased') -LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN') -CS_COMPONENTS = { - PSLiteralTable.intern('CalRGB'): 3, - PSLiteralTable.intern('CalGray'): 1, - PSLiteralTable.intern('Lab'): 3, - PSLiteralTable.intern('DeviceRGB'): 3, - PSLiteralTable.intern('DeviceCMYK'): 4, - PSLiteralTable.intern('DeviceGray'): 1, - PSLiteralTable.intern('Separation'): 1, - PSLiteralTable.intern('Indexed'): 1, - PSLiteralTable.intern('Pattern'): 1, - } - -def cs_params(cs): - t = cs[0] - if t == LITERAL_ICC_BASED: - return stream_value(cs[1]).dic['N'] - elif t == LITERAL_DEVICE_N: - return len(list_value(cs[1])) - else: - return CS_COMPONENTS[t] - - -## PSBaseParser -## -class PSBaseParser: - - '''PostScript parser that performs only basic tokenization.''' - - def __init__(self, fp, debug=0): - self.fp = fp - self.debug = debug - self.bufsize = 4096 - self.seek(0) - return - - def __repr__(self): - return '' % (self.fp,) - - def seek(self, pos): - ''' - seeks to the given pos. - ''' - if 2 <= self.debug: - print >>stderr, 'seek:', pos - self.fp.seek(pos) - self.linepos = pos - self.linebuf = None - self.curpos = 0 - self.line = '' - return - - EOLCHAR = re.compile(r'[\r\n]') - def nextline(self): - ''' - fetches the next line that ends either with \\r or \\n. - ''' - line = '' - eol = None - while 1: - if not self.linebuf or len(self.linebuf) <= self.curpos: - # fetch next chunk. - self.linebuf = self.fp.read(self.bufsize) - if not self.linebuf: - # at EOF. - break - self.curpos = 0 - if eol: - c = self.linebuf[self.curpos] - # handle '\r\n' - if (eol == '\r' and c == '\n'): - line += c - self.curpos += 1 - break - m = self.EOLCHAR.search(self.linebuf, self.curpos) - if m: - i = m.end(0) - line += self.linebuf[self.curpos:i] - eol = self.linebuf[i-1] - self.curpos = i - else: - # fetch further - line += self.linebuf[self.curpos:] - self.linebuf = None - self.linepos += len(line) - return line - - def revreadlines(self): - ''' - fetches lines backword. used to locate trailers. - ''' - self.fp.seek(0, 2) - pos = self.fp.tell() - buf = '' - while 0 < pos: - pos = max(0, pos-self.bufsize) - self.fp.seek(pos) - s = self.fp.read(self.bufsize) - if not s: break - while 1: - n = max(s.rfind('\r'), s.rfind('\n')) - if n == -1: - buf = s + buf - break - yield buf+s[n:] - s = s[:n] - buf = '' - return - - SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' - TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') - LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') - NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$') - STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+') - STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.') - STRING_HEX = re.compile(r'[\s0-9a-fA-F]+') - STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}') - - def parse(self): - ''' - Yields a list of basic tokens: keywords, literals, strings, - numbers and parentheses. Comments are skipped. - Nested objects (i.e. arrays and dictionaries) are not handled. - ''' - while 1: - # do not strip line! we need to distinguish last '\n' or '\r' - linepos0 = self.linepos - self.line = self.nextline() - if not self.line: break - if 2 <= self.debug: - print >>stderr, 'line: (%d) %r' % (self.linepos, self.line) - # do this before removing comment - if self.line.startswith('%%EOF'): break - charpos = 0 - - # tokenize - while 1: - m = self.TOKEN.search(self.line, charpos) - if not m: break - t = m.group(0) - pos = linepos0 + m.start(0) - charpos = m.end(0) - - if t == '%': - # skip comment - if 2 <= self.debug: - print >>stderr, 'comment: %r' % self.line[charpos:] - break - - elif t == '/': - # literal object - mn = self.LITERAL.match(self.line, m.start(0)+1) - lit = PSLiteralTable.intern(mn.group(0)) - yield (pos, lit) - charpos = mn.end(0) - if 2 <= self.debug: - print >>stderr, 'name: %r' % lit - - elif t == '(': - # normal string object - s = '' - while 1: - ms = self.STRING_NORM.match(self.line, charpos) - if not ms: break - s1 = ms.group(0) - charpos = ms.end(0) - if len(s1) == 1 and s1[-1] == '\\': - s += s1[-1:] - self.line = self.nextline() - if not self.line: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (self.linepos, self.line)) - charpos = 0 - elif charpos == len(self.line): - s += s1 - self.line = self.nextline() - if not self.line: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (self.linepos, self.line)) - charpos = 0 - else: - s += s1 - break - if self.line[charpos] != ')': - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (self.linepos, self.line)) - charpos += 1 - def convesc(m): - x = m.group(0) - if x[1:].isdigit(): - return chr(int(x[1:], 8)) - else: - return x[1] - s = self.STRING_NORM_SUB.sub(convesc, s) - if 2 <= self.debug: - print >>stderr, 'str: %r' % s - yield (pos, s) - - elif t == '<': - # hex string object - ms = self.STRING_HEX.match(self.line, charpos) - charpos = ms.end(0) - if self.line[charpos] != '>': - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (self.linepos, self.line)) - charpos += 1 - def convhex(m1): - return chr(int(m1.group(0), 16)) - s = self.STRING_HEX_SUB.sub(convhex, ms.group(0)) - if 2 <= self.debug: - print >>stderr, 'str: %r' % s - yield (pos, s) - - elif self.NUMBER.match(t): - # number - if '.' in t: - n = float(t) - else: - n = int(t) - if 2 <= self.debug: - print >>stderr, 'number: %r' % n - yield (pos, n) - - elif t in ('true','false'): - # boolean - if 2 <= self.debug: - print >>stderr, 'boolean: %r' % t - yield (pos, (t == 'true')) - - else: - # other token - if 2 <= self.debug: - print >>stderr, 'keyword: %r' % t - yield (pos, PSKeywordTable.intern(t)) - - return - - -## PSStackParser -## -class PSStackParser(PSBaseParser): - +# resolve +def resolve1(x): ''' - PostScript parser that recognizes compound objects - such as arrays and dictionaries. + Resolve an object. If this is an array or dictionary, + it may still contains some indirect objects inside. ''' - - def __init__(self, fp, debug=0): - PSBaseParser.__init__(self, fp, debug=debug) - self.context = [] - self.partobj = None - return + while isinstance(x, PDFObjRef): + x = x.resolve() + return x - def do_token(self, pos, token): - ''' - Handles special tokens. - Returns true if the token denotes the end of an object. - ''' - return False +def resolveall(x): + ''' + Recursively resolve X and all the internals. + Make sure there is no indirect reference within the nested object. + This procedure might be slow. Do not used it unless + you really need it. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + if isinstance(x, list): + x = [ resolveall(v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = resolveall(v) + return x - def push(self, obj): - ''' - Push an object to the stack. - ''' - self.partobj.append(obj) - return +# Type cheking +def int_value(x): + x = resolve1(x) + if not isinstance(x, int): + raise PDFTypeError('integer required: %r' % x) + return x - def pop(self, n): - ''' - Pop N objects from the stack. - ''' - if len(self.partobj) < n: - raise PSSyntaxError('stack too short < %d' % n) - r = self.partobj[-n:] - self.partobj = self.partobj[:-n] - return r - - def popall(self): - ''' - Discards all the objects on the stack. - ''' - self.partobj = [] - return +def float_value(x): + x = resolve1(x) + if not isinstance(x, float): + raise PDFTypeError('float required: %r' % x) + return x - def parse(self): - ''' - Yields a list of objects: keywords, literals, strings, - numbers, arrays and dictionaries. Arrays and dictionaries - are represented as Python sequence and dictionaries. - ''' - - def startobj(type): - self.context.append((type, self.partobj)) - self.partobj = [] - return +def num_value(x): + x = resolve1(x) + if not (isinstance(x, int) or isinstance(x, float)): + raise PDFTypeError('int or float required: %r' % x) + return x - def endobj(type1): - assert self.context - obj = self.partobj - (type0, self.partobj) = self.context.pop() - if type0 != type1: - raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % - (type0, self.partobj, type1, obj)) - return obj +def str_value(x): + x = resolve1(x) + if not isinstance(x, str): + raise PDFTypeError('string required: %r' % x) + return x - startobj('o') +def list_value(x): + x = resolve1(x) + if not isinstance(x, list): + raise PDFTypeError('list required: %r' % x) + return x - for (pos,t) in PSBaseParser.parse(self): - if isinstance(t, int) or isinstance(t, float): - self.push(t) - elif isinstance(t, str): - self.push(t) - elif isinstance(t, PSLiteral): - self.push(t) - else: - c = keyword_name(t) - if c == '{' or c == '}': - self.push(t) - elif c == '[': - # begin array - if 2 <= self.debug: - print >>stderr, 'start array' - startobj('a') - elif c == ']': - # end array - a = endobj('a') - if 2 <= self.debug: - print >>stderr, 'end array: %r' % a - self.push(a) - elif c == '<<': - # begin dictionary - if 2 <= self.debug: - print >>stderr, 'start dict' - startobj('d') - elif c == '>>': - # end dictionary - objs = endobj('d') - if len(objs) % 2 != 0: - raise PSTypeError('invalid dictionary construct: %r' % objs) - d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) - if 2 <= self.debug: - print >>stderr, 'end dict: %r' % d - self.push(d) - elif self.do_token(pos, t): - break +def dict_value(x): + x = resolve1(x) + if not isinstance(x, dict): + raise PDFTypeError('dict required: %r' % x) + return x - return endobj('o') - - -## CMapParser -## -class CMapParser(PSStackParser): - - def __init__(self, cmap, fp, debug=0): - PSStackParser.__init__(self, fp, debug=debug) - self.cmap = cmap - self.in_cmap = False - return - - def do_token(self, pos, token): - name = token.name - if name == 'begincmap': - self.in_cmap = True - self.popall() - return - elif name == 'endcmap': - self.in_cmap = False - return - if not self.in_cmap: return - # - if name == 'def': - try: - (k,v) = self.pop(2) - self.cmap.attrs[literal_name(k)] = v - except PSSyntaxError: - pass - return - - if name == 'usecmap': - try: - (cmapname,) = self.pop(1) - self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) - except PSSyntaxError: - pass - return - - if name == 'begincodespacerange': - self.popall() - return - if name == 'endcodespacerange': - if 1 <= self.debug: - print >>stderr, 'codespace: %r' % self.partobj - self.popall() - return - - if name == 'begincidrange': - self.popall() - return - if name == 'endcidrange': - for (s,e,cid) in choplist(3, self.partobj): - assert isinstance(s, str) - assert isinstance(e, str) - assert isinstance(cid, int) - assert len(s) == len(e) - sprefix = s[:-4] - eprefix = e[:-4] - assert sprefix == eprefix - svar = s[-4:] - evar = e[-4:] - s1 = nunpack(svar) - e1 = nunpack(evar) - vlen = len(svar) - assert s1 <= e1 - for i in xrange(e1-s1+1): - x = sprefix+pack('>L',s1+i)[-vlen:] - self.cmap.register_code2cid(x, cid+i) - self.popall() - return - - if name == 'begincidchar': - self.popall() - return - if name == 'endcidchar': - for (cid,code) in choplist(2, self.partobj): - assert isinstance(code, str) - assert isinstance(cid, str) - self.cmap.register_code2cid(code, nunpack(cid)) - self.popall() - return - - if name == 'beginbfrange': - self.popall() - return - if name == 'endbfrange': - for (s,e,code) in choplist(3, self.partobj): - assert isinstance(s, str) - assert isinstance(e, str) - assert len(s) == len(e) - s1 = nunpack(s) - e1 = nunpack(e) - assert s1 <= e1 - if isinstance(code, list): - for i in xrange(e1-s1+1): - self.cmap.register_cid2code(s1+i, code[i]) - else: - var = code[-4:] - base = nunpack(var) - prefix = code[:-4] - vlen = len(var) - for i in xrange(e1-s1+1): - x = prefix+pack('>L',base+i)[-vlen:] - self.cmap.register_cid2code(s1+i, x) - self.popall() - return - - if name == 'beginbfchar': - self.popall() - return - if name == 'endbfchar': - for (cid,code) in choplist(2, self.partobj): - assert isinstance(cid, str) - assert isinstance(code, str) - self.cmap.register_cid2code(nunpack(cid), code) - self.popall() - return - - if name == 'beginnotdefrange': - self.popall() - return - if name == 'endnotdefrange': - if 1 <= self.debug: - print >>stderr, 'notdefrange: %r' % self.partobj - self.popall() - return - - return +def stream_value(x): + x = resolve1(x) + if not isinstance(x, PDFStream): + raise PDFTypeError('stream required: %r' % x) + return x ## PDFStream type @@ -934,111 +197,14 @@ class PDFStream: return self.data def parse_data(self, inline=False, debug=0): + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO return PDFParser(self.doc, StringIO(self.get_data()), inline=inline, debug=debug).parse() -## PDFObjRef -## -class PDFObjRef: - - def __init__(self, doc, objid, genno): - if objid == 0: - raise PDFValueError('objid cannot be 0.') - self.doc = doc - self.objid = objid - #self.genno = genno # Never used. - return - - def __repr__(self): - return '' % (self.objid) - - def resolve(self): - return self.doc.getobj(self.objid) - - -# resolve -def resolve1(x): - ''' - Resolve an object. If this is an array or dictionary, - it may still contains some indirect objects inside. - ''' - while isinstance(x, PDFObjRef): - x = x.resolve() - return x - -def resolveall(x): - ''' - Recursively resolve X and all the internals. - Make sure there is no indirect reference within the nested object. - This procedure might be slow. Do not used it unless - you really need it. - ''' - while isinstance(x, PDFObjRef): - x = x.resolve() - if isinstance(x, list): - x = [ resolveall(v) for v in x ] - elif isinstance(x, dict): - for (k,v) in x.iteritems(): - x[k] = resolveall(v) - return x - -# Type cheking -def literal_name(x): - x = resolve1(x) - if not isinstance(x, PSLiteral): - raise PDFTypeError('literal required: %r' % x) - return x.name - -def keyword_name(x): - x = resolve1(x) - if not isinstance(x, PSKeyword): - raise PDFTypeError('keyword required: %r' % x) - return x.name - -def str_value(x): - x = resolve1(x) - if not isinstance(x, str): - raise PDFTypeError('string required: %r' % x) - return x - -def int_value(x): - x = resolve1(x) - if not isinstance(x, int): - raise PDFTypeError('integer required: %r' % x) - return x - -def float_value(x): - x = resolve1(x) - if not isinstance(x, float): - raise PDFTypeError('float required: %r' % x) - return x - -def num_value(x): - x = resolve1(x) - if not (isinstance(x, int) or isinstance(x, float)): - raise PDFTypeError('int or float required: %r' % x) - return x - -def list_value(x): - x = resolve1(x) - if not isinstance(x, list): - raise PDFTypeError('list required: %r' % x) - return x - -def dict_value(x): - x = resolve1(x) - if not isinstance(x, dict): - raise PDFTypeError('dict required: %r' % x) - return x - -def stream_value(x): - x = resolve1(x) - if not isinstance(x, PDFStream): - raise PDFTypeError('stream required: %r' % x) - return x - - ## PDFPage ## class PDFPage: @@ -1372,845 +538,3 @@ class PDFParser(PSStackParser): else: break return - - -## Fonts -## - -# PDFFont -class PDFFont: - - def __init__(self, fontid, descriptor, widths, default_width=None): - self.fontid = fontid - self.descriptor = descriptor - self.widths = widths - self.fontname = descriptor['FontName'] - if isinstance(self.fontname, PSLiteral): - self.fontname = literal_name(self.fontname) - self.ascent = descriptor['Ascent'] - self.descent = descriptor['Descent'] - self.default_width = default_width or descriptor.get('MissingWidth', 0) - self.leading = descriptor.get('Leading', 0) - self.bbox = descriptor['FontBBox'] - return - - def __repr__(self): - return '' % (self.fontid,) - - def is_vertical(self): - return False - - def decode(self, bytes): - return map(ord, bytes) - - def char_width(self, cid): - return self.widths.get(cid, self.default_width) - - def char_disp(self, cid): - return 0 - - def string_width(self, s): - return sum( self.char_width(cid) for cid in self.decode(s) ) - - -# PDFSimpleFont -class PDFSimpleFont(PDFFont): - - def __init__(self, fontid, descriptor, widths, spec): - # Font encoding is specified either by a name of - # built-in encoding or a dictionary that describes - # the differences. - if 'Encoding' in spec: - encoding = resolve1(spec['Encoding']) - else: - encoding = LITERAL_STANDARD_ENCODING - if isinstance(encoding, dict): - name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) - diff = encoding.get('Differences', None) - self.encoding = EncodingDB.get_encoding(name, diff) - else: - self.encoding = EncodingDB.get_encoding(literal_name(encoding)) - self.ucs2_cmap = None - if 'ToUnicode' in spec: - strm = stream_value(spec['ToUnicode']) - self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() - PDFFont.__init__(self, fontid, descriptor, widths) - return - - def to_unicode(self, cid): - if not self.ucs2_cmap: - try: - return self.encoding[cid] - except KeyError: - raise PDFUnicodeNotDefined(None, cid) - code = self.ucs2_cmap.tocode(cid) - if not code: - raise PDFUnicodeNotDefined(None, cid) - chars = unpack('>%dH' % (len(code)/2), code) - return ''.join( unichr(c) for c in chars ) - - -# PDFType1Font -class PDFType1Font(PDFSimpleFont): - - def __init__(self, fontid, spec): - if 'BaseFont' not in spec: - raise PDFFontError('BaseFont is missing') - self.basefont = literal_name(spec['BaseFont']) - try: - (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) - except KeyError: - try: - descriptor = dict_value(spec['FontDescriptor']) - firstchar = int_value(spec['FirstChar']) - lastchar = int_value(spec['LastChar']) - widths = dict( (i+firstchar,w) for (i,w) - in enumerate(list_value(spec['Widths'])) ) - except KeyError, k: - raise PDFFontError('%s is missing' % k) - PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) - return - -# PDFTrueTypeFont -class PDFTrueTypeFont(PDFType1Font): - pass - -# PDFType3Font -class PDFType3Font(PDFSimpleFont): - def __init__(self, fontid, spec): - try: - firstchar = int_value(spec['FirstChar']) - lastchar = int_value(spec['LastChar']) - widths = dict( (i+firstchar,w) for (i,w) - in enumerate(list_value(spec['Widths'])) ) - except KeyError, k: - raise PDFFontError('%s is missing' % k) - if 'FontDescriptor' in spec: - descriptor = dict_value(spec['FontDescriptor']) - else: - descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0, - 'FontBBox':spec['FontBBox']} - PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec) - return - -# PDFCIDFont - -## TrueTypeFont -## -class TrueTypeFont: - - class CMapNotFound(Exception): pass - - def __init__(self, name, fp): - self.name = name - self.fp = fp - self.tables = {} - fonttype = fp.read(4) - (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - for i in xrange(ntables): - (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) - self.tables[name] = (offset, length) - return - - def create_cmap(self): - if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound - (base_offset, length) = self.tables['cmap'] - fp = self.fp - fp.seek(base_offset) - (version, nsubtables) = unpack('>HH', fp.read(4)) - subtables = [] - for i in xrange(nsubtables): - subtables.append(unpack('>HHL', fp.read(8))) - char2gid = {} - # Only supports subtable type 0, 2 and 4. - for (_1, _2, st_offset) in subtables: - fp.seek(base_offset+st_offset) - (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) - if fmttype == 0: - char2gid.update(enumerate(unpack('>256B', fp.read(256)))) - elif fmttype == 2: - subheaderkeys = unpack('>256H', fp.read(512)) - firstbytes = [0]*8192 - for (i,k) in enumerate(subheaderkeys): - firstbytes[k/8] = i - nhdrs = max(subheaderkeys)/8 + 1 - hdrs = [] - for i in xrange(nhdrs): - (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) - hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) - for (i,firstcode,entcount,delta,pos) in hdrs: - if not entcount: continue - first = firstcode + (firstbytes[i] << 8) - fp.seek(pos) - for c in xrange(entcount): - gid = unpack('>H', fp.read(2)) - if gid: - gid += delta - char2gid[first+c] = gid - elif fmttype == 4: - (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - segcount /= 2 - ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) - fp.read(2) - scs = unpack('>%dH' % segcount, fp.read(2*segcount)) - idds = unpack('>%dh' % segcount, fp.read(2*segcount)) - pos = fp.tell() - idrs = unpack('>%dH' % segcount, fp.read(2*segcount)) - for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): - if idr: - fp.seek(pos+idr) - for c in xrange(sc, ec+1): - char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff - else: - for c in xrange(sc, ec+1): - char2gid[c] = (c + idd) & 0xffff - gid2char = dict( (gid, pack('>H', char)) - for (char,gid) in char2gid.iteritems() ) - cmapname = 'Adobe-Identity-UCS-%s' % self.name - return CMap(cmapname).update(char2gid, gid2char) - -class PDFCIDFont(PDFFont): - - def __init__(self, fontid, spec): - if 'BaseFont' not in spec: - raise PDFFontError('BaseFont is missing') - try: - self.cidsysteminfo = dict_value(spec['CIDSystemInfo']) - self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'], - self.cidsysteminfo['Ordering']) - except KeyError: - raise PDFFontError('CIDSystemInfo not properly defined.') - self.basefont = literal_name(spec['BaseFont']) - self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) - descriptor = dict_value(spec['FontDescriptor']) - ttf = None - if 'FontFile2' in descriptor: - self.fontfile = stream_value(descriptor.get('FontFile2')) - ttf = TrueTypeFont(self.basefont, - StringIO(self.fontfile.get_data())) - self.ucs2_cmap = None - if 'ToUnicode' in spec: - strm = stream_value(spec['ToUnicode']) - self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() - elif self.cidcoding == 'Adobe-Identity': - if ttf: - try: - self.ucs2_cmap = ttf.create_cmap() - except TrueTypeFont.CMapNotFound: - pass - else: - self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding) - def get_width(seq): - dic = {} - char1 = char2 = None - for v in seq: - if char1 == None: - char1 = v - elif char2 == None and isinstance(v, int): - char2 = v - else: - if char2 == None: - for (i,w) in enumerate(v): - dic[char1+i] = w - else: - for i in xrange(char1, char2+1): - dic[i] = v - char1 = char2 = None - return dic - self.vertical = self.cmap.is_vertical() - if self.vertical: - # writing mode: vertical - dic = get_width(list_value(spec.get('W2', []))) - widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() ) - self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() ) - (d,w) = spec.get('DW2', [880, -1000]) - default_width = w - self.default_disp = d - else: - # writing mode: horizontal - widths = get_width(list_value(spec.get('W', []))) - self.disps = {} - default_width = spec.get('DW', 1000) - self.default_disp = 0 - PDFFont.__init__(self, fontid, descriptor, widths, default_width) - return - - def is_vertical(self): - return self.vertical - - def decode(self, bytes): - return self.cmap.decode(bytes) - - def char_disp(self, cid): - return self.disps.get(cid, self.default_disp) - - def to_unicode(self, cid): - if not self.ucs2_cmap: - raise PDFUnicodeNotDefined(self.cidcoding, cid) - code = self.ucs2_cmap.tocode(cid) - if not code: - raise PDFUnicodeNotDefined(self.cidcoding, cid) - chars = unpack('>%dH' % (len(code)/2), code) - return ''.join( unichr(c) for c in chars ) - - -## Resource Manager -## -class PDFResourceManager: - - ''' - ResourceManager facilitates reuse of shared resources - such as fonts, images and cmaps so that large objects are not - allocated multiple times. - ''' - - def __init__(self, debug=0): - self.debug = debug - self.fonts = {} - return - - def get_procset(self, procs): - for proc in procs: - if proc == LITERAL_PDF: - pass - elif proc == LITERAL_TEXT: - pass - else: - #raise PDFResourceError('ProcSet %r is not supported.' % proc) - pass - return - - def get_cmap(self, name): - return CMapDB.get_cmap(name) - - def get_font(self, fontid, spec): - if fontid in self.fonts: - font = self.fonts[fontid] - else: - spec = dict_value(spec) - assert spec['Type'] == LITERAL_FONT - # Create a Font object. - if 'Subtype' not in spec: - raise PDFFontError('Font Subtype is not specified.') - subtype = literal_name(spec['Subtype']) - if subtype in ('Type1', 'MMType1'): - # Type1 Font - font = PDFType1Font(fontid, spec) - elif subtype == 'TrueType': - # TrueType Font - font = PDFTrueTypeFont(fontid, spec) - elif subtype == 'Type3': - # Type3 Font - font = PDFType3Font(fontid, spec) - elif subtype in ('CIDFontType0', 'CIDFontType2'): - # CID Font - font = PDFCIDFont(fontid, spec) - elif subtype == 'Type0': - # Type0 Font - dfonts = list_value(spec['DescendantFonts']) - assert len(dfonts) == 1 - subspec = dict_value(dfonts[0]).copy() - for k in ('Encoding', 'ToUnicode'): - if k in spec: - subspec[k] = resolve1(spec[k]) - font = self.get_font(fontid, subspec) - else: - raise PDFFontError('Invalid Font: %r' % spec) - self.fonts[fontid] = font - return font - - -## Interpreter -## -class PDFPageInterpreter: - - class TextState: - def __init__(self): - self.font = None - self.fontsize = 0 - self.charspace = 0 - self.wordspace = 0 - self.scaling = 100 - self.leading = 0 - self.render = 0 - self.rise = 0 - self.reset() - return - def __repr__(self): - return ('' % - (self.font, self.fontsize, self.matrix, - self.charspace, self.wordspace, self.scaling, self.leading, - self.render, self.rise)) - def reset(self): - self.matrix = (1, 0, 0, 1, 0, 0) - self.linematrix = (0, 0) - return - - def __init__(self, rsrc, device, debug=0): - self.rsrc = rsrc - self.device = device - self.debug = debug - return - - def initpage(self, ctm): - self.fontmap = {} - self.xobjmap = {} - self.csmap = {} - # gstack: stack for graphical states. - self.gstack = [] - self.ctm = ctm - self.device.set_ctm(self.ctm) - self.textstate = PDFPageInterpreter.TextState() - # argstack: stack for command arguments. - self.argstack = [] - # set some global states. - self.scs = None - self.ncs = None - return - - def push(self, obj): - self.argstack.append(obj) - return - - def pop(self, n): - x = self.argstack[-n:] - self.argstack = self.argstack[:-n] - return x - - def get_current_state(self): - return (self.ctm, self.textstate) - - def set_current_state(self, state): - (self.ctm, self.textstate) = state - self.device.set_ctm(self.ctm) - return - - # gsave - def do_q(self): - self.gstack.append(self.get_current_state()) - return - # grestore - def do_Q(self): - if self.gstack: - self.set_current_state(self.gstack.pop()) - return - - # concat-matrix - def do_cm(self, a1, b1, c1, d1, e1, f1): - self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm) - self.device.set_ctm(self.ctm) - return - - # setlinewidth - def do_w(self, width): return - # setlinecap - def do_J(self, cap): return - # setlinejoin - def do_j(self, join): return - # setmiterlimit - def do_M(self, limit): return - # setdash - def do_d(self, dash, phase): return - # setintent - def do_ri(self, intent): return - # setflatness - def do_i(self, flatness): return - # savedict - def do_gs(self, name): return - - # moveto - def do_m(self, x, y): return - # lineto - def do_l(self, x, y): return - # curveto - def do_c(self, x1, y1, x2, y2, x3, y3): return - # urveto - def do_v(self, x2, y2, x3, y3): return - # rveto - def do_y(self, x1, y1, x3, y3): return - # closepath - def do_h(self): return - # rectangle - def do_re(self, x, y, w, h): return - - # stroke - def do_S(self): return - # close-and-stroke - def do_s(self): return - # fill - def do_f(self): return - # fill (obsolete) - do_F = do_f - # fill-even-odd - def do_f_a(self): return - # fill-and-stroke - def do_B(self): return - # fill-and-stroke-even-odd - def do_B_a(self): return - # close-fill-and-stroke - def do_b(self): return - # close-fill-and-stroke-even-odd - def do_b_a(self): return - # close-only - def do_n(self): return - # clip - def do_W(self): return - # clip-even-odd - def do_W_a(self): return - - # setcolorspace-stroking - def do_CS(self, name): - self.scs = self.csmap.get(literal_name(name), None) - return - # setcolorspace-non-strokine - def do_cs(self, name): - self.ncs = self.csmap.get(literal_name(name), None) - return - # setgray-stroking - def do_G(self, gray): - self.do_CS(LITERAL_DEVICE_GRAY) - return - # setgray-non-stroking - def do_g(self, gray): - self.do_cs(LITERAL_DEVICE_GRAY) - return - # setrgb-stroking - def do_RG(self, r, g, b): - self.do_CS(LITERAL_DEVICE_RGB) - return - # setrgb-non-stroking - def do_rg(self, r, g, b): - self.do_cs(LITERAL_DEVICE_RGB) - return - # setcmyk-stroking - def do_K(self, c, m, y, k): - self.do_CS(LITERAL_DEVICE_CMYK) - return - # setcmyk-non-stroking - def do_k(self, c, m, y, k): - self.do_cs(LITERAL_DEVICE_CMYK) - return - - # setcolor - def do_SCN(self): - n = cs_params(self.scs) - self.pop(n) - return - def do_scn(self): - n = cs_params(self.ncs) - self.pop(n) - return - def do_SC(self): - self.do_SCN() - return - def do_sc(self): - self.do_scn() - return - - # sharing-name - def do_sh(self, name): return - - # begin-text - def do_BT(self): - self.textstate.reset() - return - # end-text - def do_ET(self): - return - - # begin-compat - def do_BX(self): return - # end-compat - def do_EX(self): return - - # marked content operators - def do_MP(self, tag): return - def do_DP(self, tag, props): return - def do_BMC(self, tag): return - def do_BDC(self, tag, props): return - def do_EMC(self): return - - # setcharspace - def do_Tc(self, space): - self.textstate.charspace = space - return - # setwordspace - def do_Tw(self, space): - self.textstate.wordspace = space - return - # textscale - def do_Tz(self, scale): - self.textstate.scaling = scale - return - # setleading - def do_TL(self, leading): - self.textstate.leading = leading - return - # selectfont - def do_Tf(self, fontid, fontsize): - try: - self.textstate.font = self.fontmap[literal_name(fontid)] - except KeyError: - raise PDFInterpreterError('Undefined font id: %r' % fontid) - self.textstate.fontsize = fontsize - return - # setrendering - def do_Tr(self, render): - self.textstate.render = render - return - # settextrise - def do_Ts(self, rise): - self.textstate.rise = rise - return - - # text-move - def do_Td(self, tx, ty): - (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e+tx,f+ty) - self.textstate.linematrix = (0, 0) - return - # text-move - def do_TD(self, tx, ty): - (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e+tx,f+ty) - self.textstate.leading = -ty - self.textstate.linematrix = (0, 0) - return - # textmatrix - def do_Tm(self, a,b,c,d,e,f): - self.textstate.matrix = (a,b,c,d,e,f) - self.textstate.linematrix = (0, 0) - return - # nextline - def do_T_a(self): - (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e,f+self.textstate.leading) - self.textstate.linematrix = (0, 0) - return - - # show-pos - def do_TJ(self, seq): - textstate = self.textstate - font = textstate.font - (a,b,c,d,e,f) = textstate.matrix - (lx,ly) = textstate.linematrix - s = ''.join( x for x in seq if isinstance(x, str) ) - n = sum( x for x in seq if not isinstance(x, str) ) - w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize + - len(s) * textstate.charspace + - s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0 - self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq) - if font.is_vertical(): - ly += w - else: - lx += w - textstate.linematrix = (lx,ly) - return - # show - def do_Tj(self, s): - self.do_TJ([s]) - return - # quote - def do__q(self, s): - self.do_T_a() - self.do_TJ([s]) - return - # doublequote - def do__w(self, aw, ac, s): - self.do_Tw(aw) - self.do_Tc(ac) - self.do_TJ([s]) - return - - # inline image - def do_BI(self): # never called - return - def do_ID(self): # never called - return - def do_EI(self, obj): - return - - # invoke an XObject - def do_Do(self, xobjid): - xobjid = literal_name(xobjid) - try: - xobj = stream_value(self.xobjmap[xobjid]) - except KeyError: - raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) - if xobj.dic['Subtype'] == LITERAL_FORM: - if 1 <= self.debug: - print >>stderr, 'Processing xobj: %r' % xobj - interpreter = PDFPageInterpreter(self.rsrc, self.device) - interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj], xobj.dic['Matrix']) - return - - def process_page(self, page): - if 1 <= self.debug: - print >>stderr, 'Processing page: %r' % page - self.render_contents('page-%d' % page.pageid, page.resources, page.contents) - return - - def render_contents(self, contid, resources, contents, ctm=(1, 0, 0, 1, 0, 0)): - self.initpage(ctm) - self.device.begin_block(contid) - # Handle resource declarations. - for (k,v) in resources.iteritems(): - if 1 <= self.debug: - print >>stderr, 'Resource: %r: %r' % (k,v) - if k == 'Font': - for (fontid,fontrsrc) in dict_value(v).iteritems(): - self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) - elif k == 'ColorSpace': - for (csid,csspec) in dict_value(v).iteritems(): - self.csmap[csid] = list_value(csspec) - elif k == 'ProcSet': - self.rsrc.get_procset(list_value(v)) - elif k == 'XObject': - for (xobjid,xobjstrm) in dict_value(v).iteritems(): - self.xobjmap[xobjid] = xobjstrm - for stream in contents: - self.execute(stream_value(stream)) - self.device.end_block() - return - - def execute(self, stream): - for obj in stream.parse_data(inline=True, debug=self.debug): - if isinstance(obj, PSKeyword): - name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q') - if hasattr(self, name): - func = getattr(self, name) - nargs = func.func_code.co_argcount-1 - if nargs: - args = self.pop(nargs) - if 1 <= self.debug: - print >>stderr, 'exec: %s %r' % (obj.name, args) - if len(args) == nargs: - func(*args) - else: - if 1 <= self.debug: - print >>stderr, 'exec: %s' % (obj.name) - func() - else: - raise PDFInterpreterError('unknown operator: %r' % obj.name) - else: - self.push(obj) - return - - -## PDFDevice -## -class PDFDevice: - - def __init__(self, rsrc): - self.rsrc = rsrc - self.ctm = None - return - - def __repr__(self): - return '' - - def set_ctm(self, ctm): - self.ctm = ctm - return - - def begin_block(self, name): - return - def end_block(self): - return - - def render_string(self, textstate, textmatrix, size, seq): - raise NotImplementedError - - -## TextConverter -## -class TextConverter(PDFDevice): - - def __init__(self, rsrc, codec, outfp=sys.stdout): - PDFDevice.__init__(self, rsrc) - self.outfp = outfp - self.codec = codec - return - - def begin_block(self, name): - self.outfp.write('\n' % name) - return - def end_block(self): - self.outfp.write('\n') - return - - def render_string(self, textstate, textmatrix, size, seq): - font = textstate.font - spwidth = int(-font.char_width(32) * 0.6) # space width - buf = '' - for x in seq: - if isinstance(x, int) or isinstance(x, float): - if not font.is_vertical() and x <= spwidth: - buf += ' ' - else: - chars = font.decode(x) - for cid in chars: - try: - char = font.to_unicode(cid) - except PDFUnicodeNotDefined, e: - (cidcoding, cid) = e.args - char = u'[%s:%d]' % (cidcoding, cid) - buf += char - (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) - skewed = (b != 0 or c != 0) - if font.is_vertical(): - size = -size - tag = 'vtext' - else: - tag = 'htext' - if skewed: - tag += ' skewed' - s = buf.encode(self.codec, 'xmlcharrefreplace') - (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) - def f(x): return '%.03f' % x - self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s\n' % - (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag)) - return - - -# main -def main(argv): - import getopt - def usage(): - print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0] - return 100 - try: - (opts, args) = getopt.getopt(argv[1:], 'dvp:c:') - except getopt.GetoptError: - return usage() - if not args: return usage() - (debug, verbose) = (0, 0) - cmapdir = 'CMap' - cdbcmapdir = 'CDBCMap' - codec = 'ascii' - pages = set() - for (k, v) in opts: - if k == '-d': debug += 1 - elif k == '-v': verbose += 1 - elif k == '-p': pages.add(int(v)) - elif k == '-c': codec = v - # - CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) - rsrc = PDFResourceManager(debug=debug) - device = TextConverter(rsrc, codec) - for fname in args: - doc = PDFDocument(debug=debug) - fp = file(fname) - parser = PDFParser(doc, fp, debug=debug) - interpreter = PDFPageInterpreter(rsrc, device, debug=debug) - for (i,page) in enumerate(doc.get_pages(debug=debug)): - if pages and (i not in pages): continue - interpreter.process_page(page) - fp.close() - return - -if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/psparser.py b/psparser.py new file mode 100644 index 0000000..5a72d46 --- /dev/null +++ b/psparser.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python +import sys, re +stderr = sys.stderr +from utils import choplist + + +## PS Exceptions +## +class PSException(Exception): pass +class PSSyntaxError(PSException): pass +class PSTypeError(PSException): pass +class PSValueError(PSException): pass + + +## PostScript Types +## +class PSLiteral: + ''' + PS literals (e.g. "/Name"). + Caution: Never create these objects directly. + Use PSLiteralTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + def __repr__(self): + return '/%s' % self.name + +class PSKeyword: + ''' + PS keywords (e.g. "showpage"). + Caution: Never create these objects directly. + Use PSKeywordTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + def __repr__(self): + return self.name + +class PSSymbolTable: + ''' + Symbol table that stores PSLiteral or PSKeyword. + ''' + def __init__(self, classe): + self.dic = {} + self.classe = classe + return + + def intern(self, name): + if name in self.dic: + lit = self.dic[name] + else: + lit = self.classe(name) + self.dic[name] = lit + return lit + +PSLiteralTable = PSSymbolTable(PSLiteral) +PSKeywordTable = PSSymbolTable(PSKeyword) + + +def literal_name(x): + if not isinstance(x, PSLiteral): + raise PSTypeError('literal required: %r' % x) + return x.name + +def keyword_name(x): + if not isinstance(x, PSKeyword): + raise PSTypeError('keyword required: %r' % x) + return x.name + + +## PSBaseParser +## +class PSBaseParser: + + '''PostScript parser that performs only basic tokenization.''' + + def __init__(self, fp, debug=0): + self.fp = fp + self.debug = debug + self.bufsize = 4096 + self.seek(0) + return + + def __repr__(self): + return '' % (self.fp,) + + def seek(self, pos): + ''' + seeks to the given pos. + ''' + if 2 <= self.debug: + print >>stderr, 'seek:', pos + self.fp.seek(pos) + self.linepos = pos + self.linebuf = None + self.curpos = 0 + self.line = '' + return + + EOLCHAR = re.compile(r'[\r\n]') + def nextline(self): + ''' + fetches the next line that ends either with \\r or \\n. + ''' + line = '' + eol = None + while 1: + if not self.linebuf or len(self.linebuf) <= self.curpos: + # fetch next chunk. + self.linebuf = self.fp.read(self.bufsize) + if not self.linebuf: + # at EOF. + break + self.curpos = 0 + if eol: + c = self.linebuf[self.curpos] + # handle '\r\n' + if (eol == '\r' and c == '\n'): + line += c + self.curpos += 1 + break + m = self.EOLCHAR.search(self.linebuf, self.curpos) + if m: + i = m.end(0) + line += self.linebuf[self.curpos:i] + eol = self.linebuf[i-1] + self.curpos = i + else: + # fetch further + line += self.linebuf[self.curpos:] + self.linebuf = None + self.linepos += len(line) + return line + + def revreadlines(self): + ''' + fetches lines backword. used to locate trailers. + ''' + self.fp.seek(0, 2) + pos = self.fp.tell() + buf = '' + while 0 < pos: + pos = max(0, pos-self.bufsize) + self.fp.seek(pos) + s = self.fp.read(self.bufsize) + if not s: break + while 1: + n = max(s.rfind('\r'), s.rfind('\n')) + if n == -1: + buf = s + buf + break + yield buf+s[n:] + s = s[:n] + buf = '' + return + + SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' + TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') + LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') + NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$') + STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+') + STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.') + STRING_HEX = re.compile(r'[\s0-9a-fA-F]+') + STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}') + + def parse(self): + ''' + Yields a list of basic tokens: keywords, literals, strings, + numbers and parentheses. Comments are skipped. + Nested objects (i.e. arrays and dictionaries) are not handled. + ''' + while 1: + # do not strip line! we need to distinguish last '\n' or '\r' + linepos0 = self.linepos + self.line = self.nextline() + if not self.line: break + if 2 <= self.debug: + print >>stderr, 'line: (%d) %r' % (self.linepos, self.line) + # do this before removing comment + if self.line.startswith('%%EOF'): break + charpos = 0 + + # tokenize + while 1: + m = self.TOKEN.search(self.line, charpos) + if not m: break + t = m.group(0) + pos = linepos0 + m.start(0) + charpos = m.end(0) + + if t == '%': + # skip comment + if 2 <= self.debug: + print >>stderr, 'comment: %r' % self.line[charpos:] + break + + elif t == '/': + # literal object + mn = self.LITERAL.match(self.line, m.start(0)+1) + lit = PSLiteralTable.intern(mn.group(0)) + yield (pos, lit) + charpos = mn.end(0) + if 2 <= self.debug: + print >>stderr, 'name: %r' % lit + + elif t == '(': + # normal string object + s = '' + while 1: + ms = self.STRING_NORM.match(self.line, charpos) + if not ms: break + s1 = ms.group(0) + charpos = ms.end(0) + if len(s1) == 1 and s1[-1] == '\\': + s += s1[-1:] + self.line = self.nextline() + if not self.line: + raise PSSyntaxError('end inside string: linepos=%d, line=%r' % + (self.linepos, self.line)) + charpos = 0 + elif charpos == len(self.line): + s += s1 + self.line = self.nextline() + if not self.line: + raise PSSyntaxError('end inside string: linepos=%d, line=%r' % + (self.linepos, self.line)) + charpos = 0 + else: + s += s1 + break + if self.line[charpos] != ')': + raise PSSyntaxError('no close paren: linepos=%d, line=%r' % + (self.linepos, self.line)) + charpos += 1 + def convesc(m): + x = m.group(0) + if x[1:].isdigit(): + return chr(int(x[1:], 8)) + else: + return x[1] + s = self.STRING_NORM_SUB.sub(convesc, s) + if 2 <= self.debug: + print >>stderr, 'str: %r' % s + yield (pos, s) + + elif t == '<': + # hex string object + ms = self.STRING_HEX.match(self.line, charpos) + charpos = ms.end(0) + if self.line[charpos] != '>': + raise PSSyntaxError('no close paren: linepos=%d, line=%r' % + (self.linepos, self.line)) + charpos += 1 + def convhex(m1): + return chr(int(m1.group(0), 16)) + s = self.STRING_HEX_SUB.sub(convhex, ms.group(0)) + if 2 <= self.debug: + print >>stderr, 'str: %r' % s + yield (pos, s) + + elif self.NUMBER.match(t): + # number + if '.' in t: + n = float(t) + else: + n = int(t) + if 2 <= self.debug: + print >>stderr, 'number: %r' % n + yield (pos, n) + + elif t in ('true','false'): + # boolean + if 2 <= self.debug: + print >>stderr, 'boolean: %r' % t + yield (pos, (t == 'true')) + + else: + # other token + if 2 <= self.debug: + print >>stderr, 'keyword: %r' % t + yield (pos, PSKeywordTable.intern(t)) + + return + + +## PSStackParser +## +class PSStackParser(PSBaseParser): + + ''' + PostScript parser that recognizes compound objects + such as arrays and dictionaries. + ''' + + def __init__(self, fp, debug=0): + PSBaseParser.__init__(self, fp, debug=debug) + self.context = [] + self.partobj = None + return + + def do_token(self, pos, token): + ''' + Handles special tokens. + Returns true if the token denotes the end of an object. + ''' + return False + + def push(self, obj): + ''' + Push an object to the stack. + ''' + self.partobj.append(obj) + return + + def pop(self, n): + ''' + Pop N objects from the stack. + ''' + if len(self.partobj) < n: + raise PSSyntaxError('stack too short < %d' % n) + r = self.partobj[-n:] + self.partobj = self.partobj[:-n] + return r + + def popall(self): + ''' + Discards all the objects on the stack. + ''' + self.partobj = [] + return + + def parse(self): + ''' + Yields a list of objects: keywords, literals, strings, + numbers, arrays and dictionaries. Arrays and dictionaries + are represented as Python sequence and dictionaries. + ''' + + def startobj(type): + self.context.append((type, self.partobj)) + self.partobj = [] + return + + def endobj(type1): + assert self.context + obj = self.partobj + (type0, self.partobj) = self.context.pop() + if type0 != type1: + raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % + (type0, self.partobj, type1, obj)) + return obj + + startobj('o') + + for (pos,t) in PSBaseParser.parse(self): + if isinstance(t, int) or isinstance(t, float): + self.push(t) + elif isinstance(t, str): + self.push(t) + elif isinstance(t, PSLiteral): + self.push(t) + else: + c = keyword_name(t) + if c == '{' or c == '}': + self.push(t) + elif c == '[': + # begin array + if 2 <= self.debug: + print >>stderr, 'start array' + startobj('a') + elif c == ']': + # end array + a = endobj('a') + if 2 <= self.debug: + print >>stderr, 'end array: %r' % a + self.push(a) + elif c == '<<': + # begin dictionary + if 2 <= self.debug: + print >>stderr, 'start dict' + startobj('d') + elif c == '>>': + # end dictionary + objs = endobj('d') + if len(objs) % 2 != 0: + raise PSTypeError('invalid dictionary construct: %r' % objs) + d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) + if 2 <= self.debug: + print >>stderr, 'end dict: %r' % d + self.push(d) + elif self.do_token(pos, t): + break + + return endobj('o') diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..5ceb333 --- /dev/null +++ b/utils.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +## Utilities +## +def choplist(n, seq): + '''Groups every n elements of the list.''' + r = [] + for x in seq: + r.append(x) + if len(r) == n: + yield tuple(r) + r = [] + return + +def nunpack(s, default=0): + '''Unpacks up to 4 bytes.''' + l = len(s) + if not l: + return default + elif l == 1: + return ord(s) + elif l == 2: + return unpack('>H', s)[0] + elif l == 3: + return unpack('>L', '\x00'+s)[0] + elif l == 4: + return unpack('>L', s)[0] + else: + return TypeError('invalid length: %d' % l)