Code cleanup: conv_cmap.py
parent
1455f134c6
commit
774827b4ce
|
@ -4,9 +4,36 @@ import os.path
|
||||||
import gzip
|
import gzip
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
|
|
||||||
def process_cid2code(fp, check_codecs=[]):
|
|
||||||
|
|
||||||
def get_canonicals(name):
|
## CMapConverter
|
||||||
|
##
|
||||||
|
class CMapConverter(object):
|
||||||
|
|
||||||
|
def __init__(self, check_codecs=None):
    """Create an empty converter.

    check_codecs: optional sequence of codec names.  get_unichr() tries
        to encode each candidate character with every codec listed here
        when ranking candidates.  Defaults to no codecs.
    """
    # A fresh list per instance: a literal ``[]`` default would be one
    # shared object, so appending to one converter's check_codecs would
    # silently change every other converter created with the default.
    if check_codecs is None:
        check_codecs = []
    self.check_codecs = check_codecs
    self.code2cid = {}  # {'cmapname': ...}
    self.is_vertical = {}  # {'cmapname': True}, set for vertical cmaps only
    self.cid2unichr_h = {}  # {cid: unichr}
    self.cid2unichr_v = {}  # {cid: unichr}
    return
|
||||||
|
|
||||||
|
def load(self, fp):
    """Read a tab-separated CID table from fp and register its entries.

    fp: an iterable of text lines.  Anything from '#' to the end of a
        line is ignored, and lines that are blank after that are
        skipped.  The first remaining line supplies the column names;
        every later line is a row of values in the same column order.
        A value of '*' means "no entry" for that column.

    Each row must have a 'CID' column holding an integer.  Every other
    non-'*' column of the row is passed on as
    self._register(cid, column_name, value).

    Raises KeyError if a row has no usable 'CID' value, and ValueError
    if that value is not an integer.
    """
    names = []
    for line in fp:
        # Cut the comment off first and strip afterwards, so whitespace
        # sitting between the last value and an inline '#' comment does
        # not end up as part of that value.
        line = line.partition('#')[0].strip()
        if not line:
            continue
        values = line.split('\t')
        if not names:
            # The first meaningful line is the header row.
            names = values
            continue
        d = dict((k, v) for (k, v) in zip(names, values) if v != '*')
        cid = int(d['CID'])
        # items() instead of iteritems(): same pairs, and it runs on
        # both Python 2 and Python 3.
        for (key, value) in d.items():
            if key == 'CID':
                continue
            self._register(cid, key, value)
    return
|
||||||
|
|
||||||
|
def get_canonicals(self, name):
|
||||||
if name.endswith('-H'):
|
if name.endswith('-H'):
|
||||||
return (name, None)
|
return (name, None)
|
||||||
elif name == 'H':
|
elif name == 'H':
|
||||||
|
@ -14,14 +41,14 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
else:
|
else:
|
||||||
return (name+'-H', name+'-V')
|
return (name+'-H', name+'-V')
|
||||||
|
|
||||||
def get_unichr(codes):
|
def get_unichr(self, codes):
|
||||||
# determine the "most popular" candidate.
|
# determine the "most popular" candidate.
|
||||||
d = {}
|
d = {}
|
||||||
for code in codes:
|
for code in codes:
|
||||||
char = unicode(code, 'utf-8')
|
char = unicode(code, 'utf-8')
|
||||||
if char not in d:
|
if char not in d:
|
||||||
d[char] = 0
|
d[char] = 0
|
||||||
for codec in check_codecs:
|
for codec in self.check_codecs:
|
||||||
try:
|
try:
|
||||||
char.encode(codec, 'strict')
|
char.encode(codec, 'strict')
|
||||||
d[char] += 1
|
d[char] += 1
|
||||||
|
@ -30,6 +57,7 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
||||||
return chars[0]
|
return chars[0]
|
||||||
|
|
||||||
|
def _register(self, cid, key, value):
|
||||||
def put(dmap, code, cid, force=False):
|
def put(dmap, code, cid, force=False):
|
||||||
for b in code[:-1]:
|
for b in code[:-1]:
|
||||||
b = ord(b)
|
b = ord(b)
|
||||||
|
@ -44,40 +72,24 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
dmap[b] = cid
|
dmap[b] = cid
|
||||||
return
|
return
|
||||||
|
|
||||||
names = []
|
(hmapname, vmapname) = self.get_canonicals(key)
|
||||||
code2cid = {} # {'cmapname': ...}
|
if hmapname in self.code2cid:
|
||||||
is_vertical = {}
|
hmap = self.code2cid[hmapname]
|
||||||
cid2unichr_h = {} # {cid: unichr}
|
|
||||||
cid2unichr_v = {} # {cid: unichr}
|
|
||||||
|
|
||||||
for line in fp:
|
|
||||||
line = line.strip()
|
|
||||||
if line.startswith('#'): continue
|
|
||||||
if line.startswith('CID'):
|
|
||||||
names = line.split('\t')[1:]
|
|
||||||
continue
|
|
||||||
f = line.split('\t')
|
|
||||||
if not f: continue
|
|
||||||
cid = int(f[0])
|
|
||||||
for (x,name) in zip(f[1:], names):
|
|
||||||
if x == '*': continue
|
|
||||||
(hmapname, vmapname) = get_canonicals(name)
|
|
||||||
if hmapname in code2cid:
|
|
||||||
hmap = code2cid[hmapname]
|
|
||||||
else:
|
else:
|
||||||
hmap = {}
|
hmap = {}
|
||||||
code2cid[hmapname] = hmap
|
self.code2cid[hmapname] = hmap
|
||||||
vmap = None
|
vmap = None
|
||||||
if vmapname:
|
if vmapname:
|
||||||
is_vertical[vmapname] = True
|
self.is_vertical[vmapname] = True
|
||||||
if vmapname in code2cid:
|
if vmapname in self.code2cid:
|
||||||
vmap = code2cid[vmapname]
|
vmap = self.code2cid[vmapname]
|
||||||
else:
|
else:
|
||||||
vmap = {}
|
vmap = {}
|
||||||
code2cid[vmapname] = vmap
|
self.code2cid[vmapname] = vmap
|
||||||
|
|
||||||
hcodes = []
|
hcodes = []
|
||||||
vcodes = []
|
vcodes = []
|
||||||
for code in x.split(','):
|
for code in value.split(','):
|
||||||
vertical = code.endswith('v')
|
vertical = code.endswith('v')
|
||||||
if vertical:
|
if vertical:
|
||||||
code = code[:-1]
|
code = code[:-1]
|
||||||
|
@ -89,29 +101,29 @@ def process_cid2code(fp, check_codecs=[]):
|
||||||
vcodes.append(code)
|
vcodes.append(code)
|
||||||
else:
|
else:
|
||||||
hcodes.append(code)
|
hcodes.append(code)
|
||||||
|
|
||||||
if vcodes:
|
if vcodes:
|
||||||
assert vmap is not None
|
assert vmap is not None
|
||||||
for code in vcodes:
|
for code in vcodes:
|
||||||
put(vmap, code, cid, True)
|
put(vmap, code, cid, True)
|
||||||
for code in hcodes:
|
for code in hcodes:
|
||||||
put(hmap, code, cid, True)
|
put(hmap, code, cid, True)
|
||||||
if name.endswith('-UTF8'):
|
if key.endswith('-UTF8'):
|
||||||
if hcodes:
|
if hcodes:
|
||||||
cid2unichr_h[cid] = get_unichr(hcodes)
|
self.cid2unichr_h[cid] = self.get_unichr(hcodes)
|
||||||
if vcodes:
|
if vcodes:
|
||||||
cid2unichr_v[cid] = get_unichr(vcodes)
|
self.cid2unichr_v[cid] = self.get_unichr(vcodes)
|
||||||
else:
|
else:
|
||||||
for code in hcodes:
|
for code in hcodes:
|
||||||
put(hmap, code, cid)
|
put(hmap, code, cid)
|
||||||
put(vmap, code, cid)
|
put(vmap, code, cid)
|
||||||
if name.endswith('-UTF8') and hcodes:
|
if key.endswith('-UTF8') and hcodes:
|
||||||
code = get_unichr(hcodes)
|
code = self.get_unichr(hcodes)
|
||||||
if cid not in cid2unichr_h:
|
if cid not in self.cid2unichr_h:
|
||||||
cid2unichr_h[cid] = code
|
self.cid2unichr_h[cid] = code
|
||||||
if cid not in cid2unichr_v:
|
if cid not in self.cid2unichr_v:
|
||||||
cid2unichr_v[cid] = code
|
self.cid2unichr_v[cid] = code
|
||||||
|
return
|
||||||
return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
@ -126,16 +138,17 @@ def main(argv):
|
||||||
check_codecs = args[3:]
|
check_codecs = args[3:]
|
||||||
|
|
||||||
print >>sys.stderr, 'reading %r...' % src
|
print >>sys.stderr, 'reading %r...' % src
|
||||||
|
converter = CMapConverter(check_codecs)
|
||||||
fp = file(src)
|
fp = file(src)
|
||||||
(code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs)
|
converter.load(fp)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
for (name, cmap) in code2cid.iteritems():
|
for (name, cmap) in converter.code2cid.iteritems():
|
||||||
fname = '%s.pickle.gz' % name
|
fname = '%s.pickle.gz' % name
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||||
data = dict(
|
data = dict(
|
||||||
IS_VERTICAL=is_vertical.get(name, False),
|
IS_VERTICAL=converter.is_vertical.get(name, False),
|
||||||
CODE2CID=cmap,
|
CODE2CID=cmap,
|
||||||
)
|
)
|
||||||
fp.write(pickle.dumps(data))
|
fp.write(pickle.dumps(data))
|
||||||
|
@ -145,12 +158,11 @@ def main(argv):
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||||
data = dict(
|
data = dict(
|
||||||
CID2UNICHR_H=cid2unichr_h,
|
CID2UNICHR_H=converter.cid2unichr_h,
|
||||||
CID2UNICHR_V=cid2unichr_v,
|
CID2UNICHR_V=converter.cid2unichr_v,
|
||||||
)
|
)
|
||||||
fp.write(pickle.dumps(data))
|
fp.write(pickle.dumps(data))
|
||||||
fp.close()
|
fp.close()
|
||||||
|
return
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue