Code cleanup: conv_cmap.py

pull/1/head
Yusuke Shinyama 2013-10-12 13:20:40 +09:00
parent 1455f134c6
commit 774827b4ce
1 changed file with 103 additions and 91 deletions

View File

@@ -4,9 +4,36 @@ import os.path
import gzip import gzip
import cPickle as pickle import cPickle as pickle
def process_cid2code(fp, check_codecs=[]):
def get_canonicals(name): ## CMapConverter
##
class CMapConverter(object):
def __init__(self, check_codecs=[]):
self.check_codecs = check_codecs
self.code2cid = {} # {'cmapname': ...}
self.is_vertical = {}
self.cid2unichr_h = {} # {cid: unichr}
self.cid2unichr_v = {} # {cid: unichr}
return
def load(self, fp):
    # Read a tab-separated table from fp (any iterable of text lines) and
    # hand every populated cell to self._register(cid, column_name, value).
    #
    # - Text from the first '#' to the end of a line is ignored.
    # - Lines that are empty after that are skipped.
    # - The first remaining line supplies the column names; one of them
    #   must be 'CID', whose cell is parsed with int().
    # - Cells equal to '*' are treated as absent and are not registered.
    #
    # Raises KeyError if a data row has no usable 'CID' cell, and
    # ValueError if that cell is not an integer.
    names = []
    for line in fp:
        # Remove the comment first and strip afterwards, so whitespace
        # that separated the data from an inline comment does not end up
        # inside the last field.
        line = line.partition('#')[0].strip()
        if not line:
            continue
        values = line.split('\t')
        if not names:
            names = values
            continue
        d = dict((k, v) for (k, v) in zip(names, values) if v != '*')
        cid = int(d['CID'])
        # items() exists on both Python 2 and Python 3 dicts.
        for (key, value) in d.items():
            if key == 'CID':
                continue
            self._register(cid, key, value)
    return
def get_canonicals(self, name):
if name.endswith('-H'): if name.endswith('-H'):
return (name, None) return (name, None)
elif name == 'H': elif name == 'H':
@@ -14,14 +41,14 @@ def process_cid2code(fp, check_codecs=[]):
else: else:
return (name+'-H', name+'-V') return (name+'-H', name+'-V')
def get_unichr(codes): def get_unichr(self, codes):
# determine the "most popular" candidate. # determine the "most popular" candidate.
d = {} d = {}
for code in codes: for code in codes:
char = unicode(code, 'utf-8') char = unicode(code, 'utf-8')
if char not in d: if char not in d:
d[char] = 0 d[char] = 0
for codec in check_codecs: for codec in self.check_codecs:
try: try:
char.encode(codec, 'strict') char.encode(codec, 'strict')
d[char] += 1 d[char] += 1
@@ -30,6 +57,7 @@ def process_cid2code(fp, check_codecs=[]):
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True) chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
return chars[0] return chars[0]
def _register(self, cid, key, value):
def put(dmap, code, cid, force=False): def put(dmap, code, cid, force=False):
for b in code[:-1]: for b in code[:-1]:
b = ord(b) b = ord(b)
@@ -44,40 +72,24 @@ def process_cid2code(fp, check_codecs=[]):
dmap[b] = cid dmap[b] = cid
return return
names = [] (hmapname, vmapname) = self.get_canonicals(key)
code2cid = {} # {'cmapname': ...} if hmapname in self.code2cid:
is_vertical = {} hmap = self.code2cid[hmapname]
cid2unichr_h = {} # {cid: unichr}
cid2unichr_v = {} # {cid: unichr}
for line in fp:
line = line.strip()
if line.startswith('#'): continue
if line.startswith('CID'):
names = line.split('\t')[1:]
continue
f = line.split('\t')
if not f: continue
cid = int(f[0])
for (x,name) in zip(f[1:], names):
if x == '*': continue
(hmapname, vmapname) = get_canonicals(name)
if hmapname in code2cid:
hmap = code2cid[hmapname]
else: else:
hmap = {} hmap = {}
code2cid[hmapname] = hmap self.code2cid[hmapname] = hmap
vmap = None vmap = None
if vmapname: if vmapname:
is_vertical[vmapname] = True self.is_vertical[vmapname] = True
if vmapname in code2cid: if vmapname in self.code2cid:
vmap = code2cid[vmapname] vmap = self.code2cid[vmapname]
else: else:
vmap = {} vmap = {}
code2cid[vmapname] = vmap self.code2cid[vmapname] = vmap
hcodes = [] hcodes = []
vcodes = [] vcodes = []
for code in x.split(','): for code in value.split(','):
vertical = code.endswith('v') vertical = code.endswith('v')
if vertical: if vertical:
code = code[:-1] code = code[:-1]
@@ -89,29 +101,29 @@ def process_cid2code(fp, check_codecs=[]):
vcodes.append(code) vcodes.append(code)
else: else:
hcodes.append(code) hcodes.append(code)
if vcodes: if vcodes:
assert vmap is not None assert vmap is not None
for code in vcodes: for code in vcodes:
put(vmap, code, cid, True) put(vmap, code, cid, True)
for code in hcodes: for code in hcodes:
put(hmap, code, cid, True) put(hmap, code, cid, True)
if name.endswith('-UTF8'): if key.endswith('-UTF8'):
if hcodes: if hcodes:
cid2unichr_h[cid] = get_unichr(hcodes) self.cid2unichr_h[cid] = self.get_unichr(hcodes)
if vcodes: if vcodes:
cid2unichr_v[cid] = get_unichr(vcodes) self.cid2unichr_v[cid] = self.get_unichr(vcodes)
else: else:
for code in hcodes: for code in hcodes:
put(hmap, code, cid) put(hmap, code, cid)
put(vmap, code, cid) put(vmap, code, cid)
if name.endswith('-UTF8') and hcodes: if key.endswith('-UTF8') and hcodes:
code = get_unichr(hcodes) code = self.get_unichr(hcodes)
if cid not in cid2unichr_h: if cid not in self.cid2unichr_h:
cid2unichr_h[cid] = code self.cid2unichr_h[cid] = code
if cid not in cid2unichr_v: if cid not in self.cid2unichr_v:
cid2unichr_v[cid] = code self.cid2unichr_v[cid] = code
return
return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)
# main # main
def main(argv): def main(argv):
@@ -126,16 +138,17 @@ def main(argv):
check_codecs = args[3:] check_codecs = args[3:]
print >>sys.stderr, 'reading %r...' % src print >>sys.stderr, 'reading %r...' % src
converter = CMapConverter(check_codecs)
fp = file(src) fp = file(src)
(code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs) converter.load(fp)
fp.close() fp.close()
for (name, cmap) in code2cid.iteritems(): for (name, cmap) in converter.code2cid.iteritems():
fname = '%s.pickle.gz' % name fname = '%s.pickle.gz' % name
print >>sys.stderr, 'writing %r...' % fname print >>sys.stderr, 'writing %r...' % fname
fp = gzip.open(os.path.join(outdir, fname), 'wb') fp = gzip.open(os.path.join(outdir, fname), 'wb')
data = dict( data = dict(
IS_VERTICAL=is_vertical.get(name, False), IS_VERTICAL=converter.is_vertical.get(name, False),
CODE2CID=cmap, CODE2CID=cmap,
) )
fp.write(pickle.dumps(data)) fp.write(pickle.dumps(data))
@@ -145,12 +158,11 @@ def main(argv):
print >>sys.stderr, 'writing %r...' % fname print >>sys.stderr, 'writing %r...' % fname
fp = gzip.open(os.path.join(outdir, fname), 'wb') fp = gzip.open(os.path.join(outdir, fname), 'wb')
data = dict( data = dict(
CID2UNICHR_H=cid2unichr_h, CID2UNICHR_H=converter.cid2unichr_h,
CID2UNICHR_V=cid2unichr_v, CID2UNICHR_V=converter.cid2unichr_v,
) )
fp.write(pickle.dumps(data)) fp.write(pickle.dumps(data))
fp.close() fp.close()
return
return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))