Beating the codepage dragon.

pull/1/head
Yusuke Shinyama 2013-10-17 22:57:48 +09:00
parent 8d42eec94d
commit 6ad82e355c
2 changed files with 164 additions and 132 deletions

View File

@ -41,13 +41,17 @@ cmap_clean:
$(CMAPDST): $(CMAPDST):
$(MKDIR) $(CMAPDST) $(MKDIR) $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST) $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5 $(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST) $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312 $(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST) $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp $(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST) $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr $(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
test: cmap test: cmap
cd samples && $(MAKE) test cd samples && $(MAKE) test

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import sys import sys
import os.path
import gzip
import cPickle as pickle import cPickle as pickle
@ -9,159 +7,189 @@ import cPickle as pickle
## ##
class CMapConverter(object): class CMapConverter(object):
def __init__(self, check_codecs=[]): def __init__(self, enc2codec={}):
self.check_codecs = check_codecs self.enc2codec = enc2codec
self.code2cid = {} # {'cmapname': ...} self.code2cid = {} # {'cmapname': ...}
self.is_vertical = {} self.is_vertical = {}
self.cid2unichr_h = {} # {cid: unichr} self.cid2unichr_h = {} # {cid: unichr}
self.cid2unichr_v = {} # {cid: unichr} self.cid2unichr_v = {} # {cid: unichr}
return return
def get_encs(self):
    """Return the names of all encodings registered in ``self.code2cid``.

    The result is a fresh list, so it stays valid even if
    ``self.code2cid`` gains or loses entries afterwards.
    """
    return list(self.code2cid)
def get_maps(self, enc):
    """Return the ``(hmap, vmap)`` code-to-CID tables for encoding *enc*.

    The table names are derived from *enc* as follows:
      * ``'xxx-H'`` -> only the table ``enc`` itself; ``vmap`` is None.
      * ``'H'``     -> the tables ``'H'`` and ``'V'``.
      * otherwise   -> the tables ``enc + '-H'`` and ``enc + '-V'``.

    A table that does not exist yet is created empty and stored in
    ``self.code2cid``.  When a second table name is derived, it is also
    flagged in ``self.is_vertical``.
    """
    if enc.endswith('-H'):
        (hmapenc, vmapenc) = (enc, None)
    elif enc == 'H':
        (hmapenc, vmapenc) = ('H', 'V')
    else:
        (hmapenc, vmapenc) = (enc+'-H', enc+'-V')
    # setdefault() returns the existing table or registers a new empty one.
    hmap = self.code2cid.setdefault(hmapenc, {})
    vmap = None
    if vmapenc:
        self.is_vertical[vmapenc] = True
        vmap = self.code2cid.setdefault(vmapenc, {})
    return (hmap, vmap)
def load(self, fp): def load(self, fp):
names = [] encs = None
for line in fp: for line in fp:
(line,_,_) = line.strip().partition('#') (line,_,_) = line.strip().partition('#')
if not line: continue if not line: continue
values = line.split('\t') values = line.split('\t')
if not names: if encs is None:
names = values encs = values
continue continue
d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' )
cid = int(d['CID'])
for (key,value) in d.iteritems():
if key == 'CID': continue
self._register(cid, key, value)
return
def get_canonicals(self, name): def put(dmap, code, cid, force=False):
if name.endswith('-H'): for b in code[:-1]:
return (name, None) b = ord(b)
elif name == 'H': if b in dmap:
return ('H', 'V') dmap = dmap[b]
else: else:
return (name+'-H', name+'-V') d = {}
dmap[b] = d
dmap = d
b = ord(code[-1])
if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid
return
def get_unichr(self, codes): def add(unimap, enc, code):
# determine the "most popular" candidate.
d = {}
for code in codes:
char = unicode(code, 'utf-8')
if char not in d:
d[char] = 0
for codec in self.check_codecs:
try: try:
char.encode(codec, 'strict') codec = self.enc2codec[enc]
d[char] += 1 c = code.decode(codec, 'strict')
if len(c) == 1:
if c not in unimap:
unimap[c] = 0
unimap[c] += 1
except KeyError:
pass
except UnicodeError: except UnicodeError:
pass pass
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True) return
return chars[0]
def _register(self, cid, key, value): def pick(unimap):
def put(dmap, code, cid, force=False): chars = unimap.items()
for b in code[:-1]: chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
b = ord(b) (c,_) = chars[0]
if b in dmap: return c
dmap = dmap[b]
else:
d = {}
dmap[b] = d
dmap = d
b = ord(code[-1])
if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid
return
(hmapname, vmapname) = self.get_canonicals(key) cid = None
if hmapname in self.code2cid: unimap_h = {}
hmap = self.code2cid[hmapname] unimap_v = {}
else: for (enc,value) in zip(encs, values):
hmap = {} if enc == 'CID':
self.code2cid[hmapname] = hmap cid = int(value)
vmap = None continue
if vmapname: assert cid is not None
self.is_vertical[vmapname] = True if value == '*':
if vmapname in self.code2cid: continue
vmap = self.code2cid[vmapname]
else:
vmap = {}
self.code2cid[vmapname] = vmap
hcodes = [] # hcodes, vcodes: encoded bytes for each writing mode.
vcodes = [] hcodes = []
for code in value.split(','): vcodes = []
vertical = code.endswith('v') for code in value.split(','):
if vertical: vertical = code.endswith('v')
code = code[:-1] if vertical:
try: code = code[:-1]
code = code.decode('hex') try:
except: code = code.decode('hex')
code = chr(int(code, 16)) except:
if vertical: code = chr(int(code, 16))
vcodes.append(code) if vertical:
else: vcodes.append(code)
hcodes.append(code) add(unimap_v, enc, code)
else:
if vcodes: hcodes.append(code)
assert vmap is not None add(unimap_h, enc, code)
for code in vcodes: # add cid to each map.
put(vmap, code, cid, True) (hmap, vmap) = self.get_maps(enc)
for code in hcodes:
put(hmap, code, cid, True)
if key.endswith('-UTF8'):
if hcodes:
self.cid2unichr_h[cid] = self.get_unichr(hcodes)
if vcodes: if vcodes:
self.cid2unichr_v[cid] = self.get_unichr(vcodes) assert vmap is not None
else: for code in vcodes:
for code in hcodes: put(vmap, code, cid, True)
put(hmap, code, cid) for code in hcodes:
put(vmap, code, cid) put(hmap, code, cid, True)
if key.endswith('-UTF8') and hcodes: else:
code = self.get_unichr(hcodes) for code in hcodes:
if cid not in self.cid2unichr_h: put(hmap, code, cid)
self.cid2unichr_h[cid] = code put(vmap, code, cid)
if cid not in self.cid2unichr_v:
self.cid2unichr_v[cid] = code # Determine the "most popular" candidate.
if unimap_h:
self.cid2unichr_h[cid] = pick(unimap_h)
self.cid2unichr_v[cid] = pick(unimap_v or unimap_h)
return
def dump_cmap(self, fp, enc):
    """Pickle the code-to-CID table of encoding *enc* into *fp*.

    The pickled object is a dict with two keys:
      * ``IS_VERTICAL`` -- the flag stored for *enc* in
        ``self.is_vertical`` (False when none is stored).
      * ``CODE2CID``    -- the table stored for *enc* in
        ``self.code2cid`` (None when *enc* is not registered).

    *fp* must be a writable binary file-like object.
    """
    data = {
        'IS_VERTICAL': self.is_vertical.get(enc, False),
        'CODE2CID': self.code2cid.get(enc),
    }
    # Stream straight into fp rather than building the whole pickle
    # string in memory first.
    pickle.dump(data, fp)
    return
def dump_unicodemap(self, fp):
    """Pickle the CID-to-Unicode tables into *fp*.

    The pickled object is a dict with two keys, ``CID2UNICHR_H`` and
    ``CID2UNICHR_V``, holding ``self.cid2unichr_h`` and
    ``self.cid2unichr_v`` respectively.

    *fp* must be a writable binary file-like object.
    """
    data = {
        'CID2UNICHR_H': self.cid2unichr_h,
        'CID2UNICHR_V': self.cid2unichr_v,
    }
    # Stream straight into fp rather than building the whole pickle
    # string in memory first.
    pickle.dump(data, fp)
return return
# main # main
def main(argv): def main(argv):
import getopt
import gzip
import os.path
def usage(): def usage():
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0] print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
return 100 return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'c:')
except getopt.GetoptError:
return usage()
enc2codec = {}
for (k, v) in opts:
if k == '-c':
(enc,_,codec) = v.partition('=')
enc2codec[enc] = codec
if not args: return usage()
outdir = args.pop(0)
if not args: return usage()
regname = args.pop(0)
args = argv[1:] converter = CMapConverter(enc2codec)
if len(args) < 3: return usage() for path in args:
(outdir, regname, src) = args[:3] print >>sys.stderr, 'reading: %r...' % path
check_codecs = args[3:] fp = file(path)
converter.load(fp)
fp.close()
print >>sys.stderr, 'reading %r...' % src for enc in converter.get_encs():
converter = CMapConverter(check_codecs) fname = '%s.pickle.gz' % enc
fp = file(src) path = os.path.join(outdir, fname)
converter.load(fp) print >>sys.stderr, 'writing: %r...' % path
fp.close() fp = gzip.open(path, 'wb')
converter.dump_cmap(fp, enc)
for (name, cmap) in converter.code2cid.iteritems():
fname = '%s.pickle.gz' % name
print >>sys.stderr, 'writing %r...' % fname
fp = gzip.open(os.path.join(outdir, fname), 'wb')
data = dict(
IS_VERTICAL=converter.is_vertical.get(name, False),
CODE2CID=cmap,
)
fp.write(pickle.dumps(data))
fp.close() fp.close()
fname = 'to-unicode-%s.pickle.gz' % regname fname = 'to-unicode-%s.pickle.gz' % regname
print >>sys.stderr, 'writing %r...' % fname path = os.path.join(outdir, fname)
fp = gzip.open(os.path.join(outdir, fname), 'wb') print >>sys.stderr, 'writing: %r...' % path
data = dict( fp = gzip.open(path, 'wb')
CID2UNICHR_H=converter.cid2unichr_h, converter.dump_unicodemap(fp)
CID2UNICHR_V=converter.cid2unichr_v,
)
fp.write(pickle.dumps(data))
fp.close() fp.close()
return return