2020-04-28 08:58:42 +00:00
|
|
|
#!/usr/bin/env python3
|
2016-11-08 19:01:11 +00:00
|
|
|
|
2009-12-19 14:17:00 +00:00
|
|
|
import sys
|
2020-01-04 15:47:07 +00:00
|
|
|
import pickle as pickle
|
2014-09-11 21:34:09 +00:00
|
|
|
import codecs
|
2009-12-19 14:17:00 +00:00
|
|
|
|
|
|
|
|
2020-01-04 15:47:07 +00:00
|
|
|
class CMapConverter:
    """Convert tab-separated CID tables into CMap and Unicode mappings.

    The input read by :meth:`load` is a text table whose first
    non-comment line is a header starting with ``CID`` followed by one
    column name per encoding.  Every following row holds an integer CID
    and, for each encoding column, either ``*`` (no mapping) or a
    comma-separated list of hexadecimal codes.  A code carrying a
    trailing ``v`` belongs to the vertical writing mode.

    Attributes:
        enc2codec: maps an encoding column name to a Python codec name;
            used to decode codes into Unicode characters.
        code2cid: maps a CMap name (e.g. ``'<enc>-H'``) to a nested dict
            keyed by the successive byte values of a code, whose leaves
            are CIDs.
        is_vertical: CMap names that were registered as vertical maps.
        cid2unichr_h: CID -> chosen Unicode character (horizontal).
        cid2unichr_v: CID -> chosen Unicode character (vertical).
    """

    def __init__(self, enc2codec=None):
        # A fresh dict per instance; a literal ``{}`` default would be
        # shared by every converter created without an argument.
        self.enc2codec = {} if enc2codec is None else enc2codec
        self.code2cid = {}  # {'cmapname': ...}
        self.is_vertical = {}
        self.cid2unichr_h = {}  # {cid: unichr}
        self.cid2unichr_v = {}  # {cid: unichr}

    def get_encs(self):
        """Return a view of the names of all CMaps created so far."""
        return self.code2cid.keys()

    def get_maps(self, enc):
        """Return ``(hmap, vmap)`` for the encoding column ``enc``.

        The maps are created and registered in ``code2cid`` on first
        use.  A column whose name ends in ``-H`` is horizontal-only and
        yields ``vmap = None``; the column ``H`` is paired with ``V``;
        any other name is expanded to ``<enc>-H`` / ``<enc>-V``.
        """
        if enc.endswith('-H'):
            hmapenc, vmapenc = enc, None
        elif enc == 'H':
            hmapenc, vmapenc = 'H', 'V'
        else:
            hmapenc, vmapenc = enc + '-H', enc + '-V'
        hmap = self.code2cid.setdefault(hmapenc, {})
        vmap = None
        if vmapenc:
            self.is_vertical[vmapenc] = True
            vmap = self.code2cid.setdefault(vmapenc, {})
        return (hmap, vmap)

    @staticmethod
    def _parse_code(code):
        """Turn a hexadecimal code string into ``bytes``.

        An odd-length string cannot be handled by the hex codec, so it
        is interpreted as the value of one single byte instead.

        Raises:
            ValueError: if ``code`` is not hexadecimal, or is odd-length
                and its value does not fit into one byte.
        """
        try:
            return codecs.decode(code, 'hex_codec')
        except ValueError:
            # binascii.Error (odd length / bad digit) is a ValueError.
            # Must produce bytes, not str, so that trie keys stay ints
            # and the result can still be decoded with a codec.
            return bytes([int(code, 16)])

    @staticmethod
    def _put(dmap, code, cid, force=False):
        """Store ``cid`` under the byte sequence ``code`` in ``dmap``.

        Intermediate levels are created as needed.  Unless ``force`` is
        true, an entry that already exists for ``code`` is kept.
        """
        for b in code[:-1]:
            dmap = dmap.setdefault(b, {})
        b = code[-1]
        # Re-storing an identical cid is a no-op, so "absent or forced"
        # is the only case that has to write.
        if force or b not in dmap:
            dmap[b] = cid

    def _add(self, unimap, enc, code):
        """Count the character that ``code`` decodes to for ``enc``.

        Nothing is counted when ``enc`` has no codec configured, when
        the bytes cannot be decoded, or when they decode to anything
        other than exactly one character.
        """
        try:
            codec = self.enc2codec[enc]
        except KeyError:
            return
        try:
            c = code.decode(codec, 'strict')
        except UnicodeError:
            return
        if len(c) == 1:
            unimap[c] = unimap.get(c, 0) + 1

    @staticmethod
    def _pick(unimap):
        """Return the most frequent character of ``unimap``.

        Ties are resolved in favour of the lowest code point.
        """
        return max(unimap, key=lambda c: (unimap[c], -ord(c)))

    def load(self, fp):
        """Read one CID table from ``fp`` and merge it into the maps.

        Args:
            fp: an iterable of text lines.  Anything after ``#`` on a
                line is ignored, as are lines that are empty once the
                comment is removed.

        Raises:
            AssertionError: if the header does not start with ``CID``,
                or a horizontal-only (``-H``) column contains a
                vertical code.
            ValueError: if a CID or a code cannot be parsed.
        """
        encs = None
        for line in fp:
            line = line.strip().partition('#')[0]
            if not line:
                continue
            values = line.split('\t')
            if encs is None:
                assert values[0] == 'CID', str(values)
                encs = values
                continue

            cid = int(values[0])
            unimap_h = {}
            unimap_v = {}
            for (enc, value) in zip(encs, values):
                if enc == 'CID' or value == '*':
                    continue

                # hcodes, vcodes: encoded bytes for each writing mode.
                hcodes = []
                vcodes = []
                for code in value.split(','):
                    # Blanks left in front of a trailing comment must
                    # not end up inside the hex digits.
                    code = code.strip()
                    vertical = code.endswith('v')
                    if vertical:
                        code = code[:-1]
                    if not code:
                        # An empty field carries no code to register.
                        continue
                    code = self._parse_code(code)
                    if vertical:
                        vcodes.append(code)
                        self._add(unimap_v, enc, code)
                    else:
                        hcodes.append(code)
                        self._add(unimap_h, enc, code)

                # add cid to each map.
                (hmap, vmap) = self.get_maps(enc)
                if vcodes:
                    assert vmap is not None, enc
                    # Explicit per-mode codes override earlier entries.
                    for code in vcodes:
                        self._put(vmap, code, cid, True)
                    for code in hcodes:
                        self._put(hmap, code, cid, True)
                else:
                    # No vertical variant given: the horizontal codes
                    # also fill the vertical map, if this column has one.
                    for code in hcodes:
                        self._put(hmap, code, cid)
                        if vmap is not None:
                            self._put(vmap, code, cid)

            # Determine the "most popular" candidate.
            if unimap_h:
                self.cid2unichr_h[cid] = self._pick(unimap_h)
            if unimap_v or unimap_h:
                self.cid2unichr_v[cid] = self._pick(unimap_v or unimap_h)

    def dump_cmap(self, fp, enc):
        """Write the CMap named ``enc`` to the binary file ``fp``.

        The payload is a pickle (protocol 2) of a dict with the keys
        ``IS_VERTICAL`` and ``CODE2CID``; ``CODE2CID`` is ``None`` when
        no map of that name exists.
        """
        data = dict(
            IS_VERTICAL=self.is_vertical.get(enc, False),
            CODE2CID=self.code2cid.get(enc),
        )
        fp.write(pickle.dumps(data, 2))

    def dump_unicodemap(self, fp):
        """Write both CID -> Unicode tables to the binary file ``fp``.

        The payload is a pickle (protocol 2) of a dict with the keys
        ``CID2UNICHR_H`` and ``CID2UNICHR_V``.
        """
        data = dict(
            CID2UNICHR_H=self.cid2unichr_h,
            CID2UNICHR_V=self.cid2unichr_v,
        )
        fp.write(pickle.dumps(data, 2))
|
2009-12-19 14:17:00 +00:00
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
2009-12-19 14:17:00 +00:00
|
|
|
def main(argv):
    """Command-line entry point: build pickled CMaps from CID tables.

    Expected arguments (after the program name in ``argv[0]``)::

        [-c enc=codec] output_dir regname [cid2code.txt ...]

    Each ``-c enc=codec`` option associates an encoding column name
    with a Python codec name.  Every input file is loaded into a single
    converter; afterwards one ``<enc>.pickle.gz`` file per CMap and one
    ``to-unicode-<regname>.pickle.gz`` file are written into
    ``output_dir``.

    Returns:
        100 after printing the usage line when the arguments are
        invalid or incomplete, otherwise ``None``.
    """
    import getopt
    import gzip
    import os.path

    def usage():
        print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]'
              % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    enc2codec = {}
    for (k, v) in opts:
        if k == '-c':
            (enc, _, codec) = v.partition('=')
            enc2codec[enc] = codec
    # Both the output directory and the registry name are mandatory.
    if len(args) < 2:
        return usage()
    outdir = args[0]
    regname = args[1]
    inputs = args[2:]

    converter = CMapConverter(enc2codec)
    for path in inputs:
        print('reading: %r...' % path)
        # Context managers make sure the handle is released even when
        # parsing fails part-way through a file.
        with open(path) as fp:
            converter.load(fp)

    for enc in converter.get_encs():
        fname = '%s.pickle.gz' % enc
        path = os.path.join(outdir, fname)
        print('writing: %r...' % path)
        with gzip.open(path, 'wb') as fp:
            converter.dump_cmap(fp, enc)

    fname = 'to-unicode-%s.pickle.gz' % regname
    path = os.path.join(outdir, fname)
    print('writing: %r...' % path)
    with gzip.open(path, 'wb') as fp:
        converter.dump_unicodemap(fp)
    return
|
2009-12-19 14:17:00 +00:00
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
|
|
|
# Script entry point: run main() and hand its result to the interpreter
# as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
|