Beating the codepage dragon.
parent
8d42eec94d
commit
6ad82e355c
12
Makefile
12
Makefile
|
@ -41,13 +41,17 @@ cmap_clean:
|
||||||
$(CMAPDST):
|
$(CMAPDST):
|
||||||
$(MKDIR) $(CMAPDST)
|
$(MKDIR) $(CMAPDST)
|
||||||
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
|
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
|
||||||
|
$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
|
||||||
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
|
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
|
||||||
|
$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
|
||||||
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
|
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
|
||||||
|
$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
|
||||||
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
|
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
|
||||||
|
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
|
||||||
|
|
||||||
test: cmap
|
test: cmap
|
||||||
cd samples && $(MAKE) test
|
cd samples && $(MAKE) test
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
import sys
|
import sys
|
||||||
import os.path
|
|
||||||
import gzip
|
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,159 +7,189 @@ import cPickle as pickle
|
||||||
##
|
##
|
||||||
class CMapConverter(object):
|
class CMapConverter(object):
|
||||||
|
|
||||||
def __init__(self, check_codecs=[]):
|
def __init__(self, enc2codec={}):
|
||||||
self.check_codecs = check_codecs
|
self.enc2codec = enc2codec
|
||||||
self.code2cid = {} # {'cmapname': ...}
|
self.code2cid = {} # {'cmapname': ...}
|
||||||
self.is_vertical = {}
|
self.is_vertical = {}
|
||||||
self.cid2unichr_h = {} # {cid: unichr}
|
self.cid2unichr_h = {} # {cid: unichr}
|
||||||
self.cid2unichr_v = {} # {cid: unichr}
|
self.cid2unichr_v = {} # {cid: unichr}
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def get_encs(self):
|
||||||
|
return self.code2cid.keys()
|
||||||
|
|
||||||
|
def get_maps(self, enc):
|
||||||
|
if enc.endswith('-H'):
|
||||||
|
(hmapenc, vmapenc) = (enc, None)
|
||||||
|
elif enc == 'H':
|
||||||
|
(hmapenc, vmapenc) = ('H', 'V')
|
||||||
|
else:
|
||||||
|
(hmapenc, vmapenc) = (enc+'-H', enc+'-V')
|
||||||
|
if hmapenc in self.code2cid:
|
||||||
|
hmap = self.code2cid[hmapenc]
|
||||||
|
else:
|
||||||
|
hmap = {}
|
||||||
|
self.code2cid[hmapenc] = hmap
|
||||||
|
vmap = None
|
||||||
|
if vmapenc:
|
||||||
|
self.is_vertical[vmapenc] = True
|
||||||
|
if vmapenc in self.code2cid:
|
||||||
|
vmap = self.code2cid[vmapenc]
|
||||||
|
else:
|
||||||
|
vmap = {}
|
||||||
|
self.code2cid[vmapenc] = vmap
|
||||||
|
return (hmap, vmap)
|
||||||
|
|
||||||
def load(self, fp):
|
def load(self, fp):
|
||||||
names = []
|
encs = None
|
||||||
for line in fp:
|
for line in fp:
|
||||||
(line,_,_) = line.strip().partition('#')
|
(line,_,_) = line.strip().partition('#')
|
||||||
if not line: continue
|
if not line: continue
|
||||||
values = line.split('\t')
|
values = line.split('\t')
|
||||||
if not names:
|
if encs is None:
|
||||||
names = values
|
encs = values
|
||||||
continue
|
continue
|
||||||
d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' )
|
|
||||||
cid = int(d['CID'])
|
def put(dmap, code, cid, force=False):
|
||||||
for (key,value) in d.iteritems():
|
for b in code[:-1]:
|
||||||
if key == 'CID': continue
|
b = ord(b)
|
||||||
self._register(cid, key, value)
|
if b in dmap:
|
||||||
return
|
dmap = dmap[b]
|
||||||
|
else:
|
||||||
def get_canonicals(self, name):
|
d = {}
|
||||||
if name.endswith('-H'):
|
dmap[b] = d
|
||||||
return (name, None)
|
dmap = d
|
||||||
elif name == 'H':
|
b = ord(code[-1])
|
||||||
return ('H', 'V')
|
if force or ((b not in dmap) or dmap[b] == cid):
|
||||||
else:
|
dmap[b] = cid
|
||||||
return (name+'-H', name+'-V')
|
return
|
||||||
|
|
||||||
def get_unichr(self, codes):
|
def add(unimap, enc, code):
|
||||||
# determine the "most popular" candidate.
|
|
||||||
d = {}
|
|
||||||
for code in codes:
|
|
||||||
char = unicode(code, 'utf-8')
|
|
||||||
if char not in d:
|
|
||||||
d[char] = 0
|
|
||||||
for codec in self.check_codecs:
|
|
||||||
try:
|
try:
|
||||||
char.encode(codec, 'strict')
|
codec = self.enc2codec[enc]
|
||||||
d[char] += 1
|
c = code.decode(codec, 'strict')
|
||||||
|
if len(c) == 1:
|
||||||
|
if c not in unimap:
|
||||||
|
unimap[c] = 0
|
||||||
|
unimap[c] += 1
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
pass
|
pass
|
||||||
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
return
|
||||||
return chars[0]
|
|
||||||
|
def pick(unimap):
|
||||||
|
chars = unimap.items()
|
||||||
|
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
|
||||||
|
(c,_) = chars[0]
|
||||||
|
return c
|
||||||
|
|
||||||
|
cid = None
|
||||||
|
unimap_h = {}
|
||||||
|
unimap_v = {}
|
||||||
|
for (enc,value) in zip(encs, values):
|
||||||
|
if enc == 'CID':
|
||||||
|
cid = int(value)
|
||||||
|
continue
|
||||||
|
assert cid is not None
|
||||||
|
if value == '*':
|
||||||
|
continue
|
||||||
|
|
||||||
def _register(self, cid, key, value):
|
# hcodes, vcodes: encoded bytes for each writing mode.
|
||||||
def put(dmap, code, cid, force=False):
|
hcodes = []
|
||||||
for b in code[:-1]:
|
vcodes = []
|
||||||
b = ord(b)
|
for code in value.split(','):
|
||||||
if b in dmap:
|
vertical = code.endswith('v')
|
||||||
dmap = dmap[b]
|
if vertical:
|
||||||
else:
|
code = code[:-1]
|
||||||
d = {}
|
try:
|
||||||
dmap[b] = d
|
code = code.decode('hex')
|
||||||
dmap = d
|
except:
|
||||||
b = ord(code[-1])
|
code = chr(int(code, 16))
|
||||||
if force or ((b not in dmap) or dmap[b] == cid):
|
if vertical:
|
||||||
dmap[b] = cid
|
vcodes.append(code)
|
||||||
return
|
add(unimap_v, enc, code)
|
||||||
|
else:
|
||||||
(hmapname, vmapname) = self.get_canonicals(key)
|
hcodes.append(code)
|
||||||
if hmapname in self.code2cid:
|
add(unimap_h, enc, code)
|
||||||
hmap = self.code2cid[hmapname]
|
# add cid to each map.
|
||||||
else:
|
(hmap, vmap) = self.get_maps(enc)
|
||||||
hmap = {}
|
|
||||||
self.code2cid[hmapname] = hmap
|
|
||||||
vmap = None
|
|
||||||
if vmapname:
|
|
||||||
self.is_vertical[vmapname] = True
|
|
||||||
if vmapname in self.code2cid:
|
|
||||||
vmap = self.code2cid[vmapname]
|
|
||||||
else:
|
|
||||||
vmap = {}
|
|
||||||
self.code2cid[vmapname] = vmap
|
|
||||||
|
|
||||||
hcodes = []
|
|
||||||
vcodes = []
|
|
||||||
for code in value.split(','):
|
|
||||||
vertical = code.endswith('v')
|
|
||||||
if vertical:
|
|
||||||
code = code[:-1]
|
|
||||||
try:
|
|
||||||
code = code.decode('hex')
|
|
||||||
except:
|
|
||||||
code = chr(int(code, 16))
|
|
||||||
if vertical:
|
|
||||||
vcodes.append(code)
|
|
||||||
else:
|
|
||||||
hcodes.append(code)
|
|
||||||
|
|
||||||
if vcodes:
|
|
||||||
assert vmap is not None
|
|
||||||
for code in vcodes:
|
|
||||||
put(vmap, code, cid, True)
|
|
||||||
for code in hcodes:
|
|
||||||
put(hmap, code, cid, True)
|
|
||||||
if key.endswith('-UTF8'):
|
|
||||||
if hcodes:
|
|
||||||
self.cid2unichr_h[cid] = self.get_unichr(hcodes)
|
|
||||||
if vcodes:
|
if vcodes:
|
||||||
self.cid2unichr_v[cid] = self.get_unichr(vcodes)
|
assert vmap is not None
|
||||||
else:
|
for code in vcodes:
|
||||||
for code in hcodes:
|
put(vmap, code, cid, True)
|
||||||
put(hmap, code, cid)
|
for code in hcodes:
|
||||||
put(vmap, code, cid)
|
put(hmap, code, cid, True)
|
||||||
if key.endswith('-UTF8') and hcodes:
|
else:
|
||||||
code = self.get_unichr(hcodes)
|
for code in hcodes:
|
||||||
if cid not in self.cid2unichr_h:
|
put(hmap, code, cid)
|
||||||
self.cid2unichr_h[cid] = code
|
put(vmap, code, cid)
|
||||||
if cid not in self.cid2unichr_v:
|
|
||||||
self.cid2unichr_v[cid] = code
|
# Determine the "most popular" candidate.
|
||||||
|
if unimap_h:
|
||||||
|
self.cid2unichr_h[cid] = pick(unimap_h)
|
||||||
|
self.cid2unichr_v[cid] = pick(unimap_v or unimap_h)
|
||||||
|
return
|
||||||
|
|
||||||
|
def dump_cmap(self, fp, enc):
|
||||||
|
data = dict(
|
||||||
|
IS_VERTICAL=self.is_vertical.get(enc, False),
|
||||||
|
CODE2CID=self.code2cid.get(enc),
|
||||||
|
)
|
||||||
|
fp.write(pickle.dumps(data))
|
||||||
|
return
|
||||||
|
|
||||||
|
def dump_unicodemap(self, fp):
|
||||||
|
data = dict(
|
||||||
|
CID2UNICHR_H=self.cid2unichr_h,
|
||||||
|
CID2UNICHR_V=self.cid2unichr_v,
|
||||||
|
)
|
||||||
|
fp.write(pickle.dumps(data))
|
||||||
return
|
return
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
import getopt
|
||||||
def usage():
|
import gzip
|
||||||
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
|
import os.path
|
||||||
return 100
|
|
||||||
|
|
||||||
args = argv[1:]
|
def usage():
|
||||||
if len(args) < 3: return usage()
|
print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
|
||||||
(outdir, regname, src) = args[:3]
|
return 100
|
||||||
check_codecs = args[3:]
|
try:
|
||||||
|
(opts, args) = getopt.getopt(argv[1:], 'c:')
|
||||||
|
except getopt.GetoptError:
|
||||||
|
return usage()
|
||||||
|
enc2codec = {}
|
||||||
|
for (k, v) in opts:
|
||||||
|
if k == '-c':
|
||||||
|
(enc,_,codec) = v.partition('=')
|
||||||
|
enc2codec[enc] = codec
|
||||||
|
if not args: return usage()
|
||||||
|
outdir = args.pop(0)
|
||||||
|
if not args: return usage()
|
||||||
|
regname = args.pop(0)
|
||||||
|
|
||||||
print >>sys.stderr, 'reading %r...' % src
|
converter = CMapConverter(enc2codec)
|
||||||
converter = CMapConverter(check_codecs)
|
for path in args:
|
||||||
fp = file(src)
|
print >>sys.stderr, 'reading: %r...' % path
|
||||||
converter.load(fp)
|
fp = file(path)
|
||||||
fp.close()
|
converter.load(fp)
|
||||||
|
fp.close()
|
||||||
|
|
||||||
for (name, cmap) in converter.code2cid.iteritems():
|
for enc in converter.get_encs():
|
||||||
fname = '%s.pickle.gz' % name
|
fname = '%s.pickle.gz' % enc
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
path = os.path.join(outdir, fname)
|
||||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
print >>sys.stderr, 'writing: %r...' % path
|
||||||
data = dict(
|
fp = gzip.open(path, 'wb')
|
||||||
IS_VERTICAL=converter.is_vertical.get(name, False),
|
converter.dump_cmap(fp, enc)
|
||||||
CODE2CID=cmap,
|
|
||||||
)
|
|
||||||
fp.write(pickle.dumps(data))
|
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
fname = 'to-unicode-%s.pickle.gz' % regname
|
fname = 'to-unicode-%s.pickle.gz' % regname
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
path = os.path.join(outdir, fname)
|
||||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
print >>sys.stderr, 'writing: %r...' % path
|
||||||
data = dict(
|
fp = gzip.open(path, 'wb')
|
||||||
CID2UNICHR_H=converter.cid2unichr_h,
|
converter.dump_unicodemap(fp)
|
||||||
CID2UNICHR_V=converter.cid2unichr_v,
|
|
||||||
)
|
|
||||||
fp.write(pickle.dumps(data))
|
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue