diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index 0be7c5a..ee0e5f8 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -4,9 +4,36 @@ import os.path import gzip import cPickle as pickle -def process_cid2code(fp, check_codecs=[]): - def get_canonicals(name): +## CMapConverter +## +class CMapConverter(object): + + def __init__(self, check_codecs=[]): + self.check_codecs = check_codecs + self.code2cid = {} # {'cmapname': ...} + self.is_vertical = {} + self.cid2unichr_h = {} # {cid: unichr} + self.cid2unichr_v = {} # {cid: unichr} + return + + def load(self, fp): + names = [] + for line in fp: + (line,_,_) = line.strip().partition('#') + if not line: continue + values = line.split('\t') + if not names: + names = values + continue + d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' ) + cid = int(d['CID']) + for (key,value) in d.iteritems(): + if key == 'CID': continue + self._register(cid, key, value) + return + + def get_canonicals(self, name): if name.endswith('-H'): return (name, None) elif name == 'H': @@ -14,14 +41,14 @@ def process_cid2code(fp, check_codecs=[]): else: return (name+'-H', name+'-V') - def get_unichr(codes): + def get_unichr(self, codes): # determine the "most popular" candidate. d = {} for code in codes: char = unicode(code, 'utf-8') if char not in d: d[char] = 0 - for codec in check_codecs: + for codec in self.check_codecs: try: char.encode(codec, 'strict') d[char] += 1 @@ -30,89 +57,74 @@ def process_cid2code(fp, check_codecs=[]): chars = sorted(d.keys(), key=lambda char:d[char], reverse=True) return chars[0] - def put(dmap, code, cid, force=False): - for b in code[:-1]: - b = ord(b) - if b in dmap: - dmap = dmap[b] + def _register(self, cid, key, value): + def put(dmap, code, cid, force=False): + for b in code[:-1]: + b = ord(b) + if b in dmap: + dmap = dmap[b] + else: + d = {} + dmap[b] = d + dmap = d + b = ord(code[-1]) + if force or ((b not in dmap) or dmap[b] == cid): + dmap[b] = cid + return + + (hmapname, vmapname) = self.get_canonicals(key) + if hmapname in self.code2cid: + hmap = self.code2cid[hmapname] + else: + hmap = {} + self.code2cid[hmapname] = hmap + vmap = None + if vmapname: + self.is_vertical[vmapname] = True + if vmapname in self.code2cid: + vmap = self.code2cid[vmapname] else: - d = {} - dmap[b] = d - dmap = d - b = ord(code[-1]) - if force or ((b not in dmap) or dmap[b] == cid): - dmap[b] = cid + vmap = {} + self.code2cid[vmapname] = vmap + + hcodes = [] + vcodes = [] + for code in value.split(','): + vertical = code.endswith('v') + if vertical: + code = code[:-1] + try: + code = code.decode('hex') + except: + code = chr(int(code, 16)) + if vertical: + vcodes.append(code) + else: + hcodes.append(code) + + if vcodes: + assert vmap is not None + for code in vcodes: + put(vmap, code, cid, True) + for code in hcodes: + put(hmap, code, cid, True) + if key.endswith('-UTF8'): + if hcodes: + self.cid2unichr_h[cid] = self.get_unichr(hcodes) + if vcodes: + self.cid2unichr_v[cid] = self.get_unichr(vcodes) + else: + for code in hcodes: + put(hmap, code, cid) + put(vmap, code, cid) + if key.endswith('-UTF8') and hcodes: + code = self.get_unichr(hcodes) + if cid not in self.cid2unichr_h: + self.cid2unichr_h[cid] = code + if cid not in self.cid2unichr_v: + self.cid2unichr_v[cid] = code return - names = [] - code2cid = {} # {'cmapname': ...} - is_vertical = {} - cid2unichr_h = {} # {cid: unichr} - cid2unichr_v = {} # {cid: unichr} - - for line in fp: - line = line.strip() - if line.startswith('#'): continue - if line.startswith('CID'): - names = line.split('\t')[1:] - continue - f = line.split('\t') - if not f: continue - cid = int(f[0]) - for (x,name) in zip(f[1:], names): - if x == '*': continue - (hmapname, vmapname) = get_canonicals(name) - if hmapname in code2cid: - hmap = code2cid[hmapname] - else: - hmap = {} - code2cid[hmapname] = hmap - vmap = None - if vmapname: - is_vertical[vmapname] = True - if vmapname in code2cid: - vmap = code2cid[vmapname] - else: - vmap = {} - code2cid[vmapname] = vmap - hcodes = [] - vcodes = [] - for code in x.split(','): - vertical = code.endswith('v') - if vertical: - code = code[:-1] - try: - code = code.decode('hex') - except: - code = chr(int(code, 16)) - if vertical: - vcodes.append(code) - else: - hcodes.append(code) - if vcodes: - assert vmap is not None - for code in vcodes: - put(vmap, code, cid, True) - for code in hcodes: - put(hmap, code, cid, True) - if name.endswith('-UTF8'): - if hcodes: - cid2unichr_h[cid] = get_unichr(hcodes) - if vcodes: - cid2unichr_v[cid] = get_unichr(vcodes) - else: - for code in hcodes: - put(hmap, code, cid) - put(vmap, code, cid) - if name.endswith('-UTF8') and hcodes: - code = get_unichr(hcodes) - if cid not in cid2unichr_h: - cid2unichr_h[cid] = code - if cid not in cid2unichr_v: - cid2unichr_v[cid] = code - - return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v) - # main def main(argv): @@ -126,16 +138,17 @@ def main(argv): check_codecs = args[3:] print >>sys.stderr, 'reading %r...' % src + converter = CMapConverter(check_codecs) fp = file(src) - (code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs) + converter.load(fp) fp.close() - for (name, cmap) in code2cid.iteritems(): + for (name, cmap) in converter.code2cid.iteritems(): fname = '%s.pickle.gz' % name print >>sys.stderr, 'writing %r...' % fname fp = gzip.open(os.path.join(outdir, fname), 'wb') data = dict( - IS_VERTICAL=is_vertical.get(name, False), + IS_VERTICAL=converter.is_vertical.get(name, False), CODE2CID=cmap, ) fp.write(pickle.dumps(data)) @@ -145,12 +158,11 @@ def main(argv): print >>sys.stderr, 'writing %r...' % fname fp = gzip.open(os.path.join(outdir, fname), 'wb') data = dict( - CID2UNICHR_H=cid2unichr_h, - CID2UNICHR_V=cid2unichr_v, + CID2UNICHR_H=converter.cid2unichr_h, + CID2UNICHR_V=converter.cid2unichr_v, ) fp.write(pickle.dumps(data)) fp.close() - - return 0 + return if __name__ == '__main__': sys.exit(main(sys.argv))