Beating the codepage dragon.

2013-10-17 22:57:48 +09:00 · 2013-10-17 22:57:48 +09:00 · 6ad82e355c
parent 8d42eec94d
commit 6ad82e355c
2 changed files with 164 additions and 132 deletions
--- a/12
+++ b/12
@@ -41,13 +41,17 @@ cmap_clean:
 $(CMAPDST):
 	$(MKDIR) $(CMAPDST)
 $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
 $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
 		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
 $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
 $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
 test: cmap
 	cd samples && $(MAKE) test
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python2
 import sys
 import os.path
 import gzip
 import cPickle as pickle
@@ -9,55 +7,49 @@ import cPickle as pickle
 ##
 class CMapConverter(object):
-    def __init__(self, check_codecs=[]):
+    def __init__(self, enc2codec={}):
-        self.check_codecs = check_codecs
+        self.enc2codec = enc2codec
        self.code2cid = {} # {'cmapname': ...}
        self.is_vertical = {}
        self.cid2unichr_h = {} # {cid: unichr}
        self.cid2unichr_v = {} # {cid: unichr}
        return
    def get_encs(self):
        return self.code2cid.keys()
    def get_maps(self, enc):
        if enc.endswith('-H'):
            (hmapenc, vmapenc) = (enc, None)
        elif enc == 'H':
            (hmapenc, vmapenc) = ('H', 'V')
        else:
            (hmapenc, vmapenc) = (enc+'-H', enc+'-V')
        if hmapenc in self.code2cid:
            hmap = self.code2cid[hmapenc]
        else:
            hmap = {}
            self.code2cid[hmapenc] = hmap
        vmap = None
        if vmapenc:
            self.is_vertical[vmapenc] = True
            if vmapenc in self.code2cid:
                vmap = self.code2cid[vmapenc]
            else:
                vmap = {}
                self.code2cid[vmapenc] = vmap
        return (hmap, vmap)
    def load(self, fp):
-        names = []
+        encs = None
        for line in fp:
            (line,_,_) = line.strip().partition('#')
            if not line: continue
            values = line.split('\t')
-            if not names:
+            if encs is None:
-                names = values
+                encs = values
                continue
            d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' )
            cid = int(d['CID'])
            for (key,value) in d.iteritems():
                if key == 'CID': continue
                self._register(cid, key, value)
        return
    def get_canonicals(self, name):
        if name.endswith('-H'):
            return (name, None)
        elif name == 'H':
            return ('H', 'V')
        else:
            return (name+'-H', name+'-V')
    def get_unichr(self, codes):
        """Pick one Unicode string out of *codes*, favouring encodable ones.

        Each item of *codes* is decoded as UTF-8.  Every decoded string is
        scored by how many codecs in ``self.check_codecs`` can encode it
        in ``'strict'`` mode, and the highest-scoring string is returned.

        Raises IndexError if *codes* is empty.

        NOTE(review): this reads ``self.check_codecs``; confirm the
        constructor in use still sets that attribute (a variant that only
        stores ``enc2codec`` would make this method fail).
        """
        # determine the "most popular" candidate.
        d = {}  # decoded string -> number of codecs able to encode it
        for code in codes:
            char = unicode(code, 'utf-8')
            if char not in d:
                d[char] = 0
            # A string occurring more than once in codes is scored again
            # for each occurrence.
            for codec in self.check_codecs:
                try:
                    char.encode(codec, 'strict')
                    d[char] += 1
                except UnicodeError:
                    # This codec cannot represent the string; no point added.
                    pass
        # Highest score first; the order among equal scores follows the
        # dict's key order.
        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
        return chars[0]
    def _register(self, cid, key, value):
            def put(dmap, code, cid, force=False):
                for b in code[:-1]:
                    b = ord(b)
@@ -72,21 +64,38 @@ class CMapConverter(object):
                    dmap[b] = cid
                return
-        (hmapname, vmapname) = self.get_canonicals(key)
+            def add(unimap, enc, code):
-        if hmapname in self.code2cid:
+                try:
-            hmap = self.code2cid[hmapname]
+                    codec = self.enc2codec[enc]
-        else:
+                    c = code.decode(codec, 'strict')
-            hmap = {}
+                    if len(c) == 1:
-            self.code2cid[hmapname] = hmap
+                        if c not in unimap:
-        vmap = None
+                            unimap[c] = 0
-        if vmapname:
+                        unimap[c] += 1
-            self.is_vertical[vmapname] = True
+                except KeyError:
-            if vmapname in self.code2cid:
+                    pass
-                vmap = self.code2cid[vmapname]
+                except UnicodeError:
-            else:
+                    pass
-                vmap = {}
+                return
                self.code2cid[vmapname] = vmap
            def pick(unimap):
                chars = unimap.items()
                chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
                (c,_) = chars[0]
                return c
            cid = None
            unimap_h = {}
            unimap_v = {}
            for (enc,value) in zip(encs, values):
                if enc == 'CID':
                    cid = int(value)
                    continue
                assert cid is not None
                if value == '*':
                    continue
                # hcodes, vcodes: encoded bytes for each writing mode.
                hcodes = []
                vcodes = []
                for code in value.split(','):
@@ -99,69 +108,88 @@ class CMapConverter(object):
                        code = chr(int(code, 16))
                    if vertical:
                        vcodes.append(code)
                        add(unimap_v, enc, code)
                    else:
                        hcodes.append(code)
-        
+                        add(unimap_h, enc, code)
                # add cid to each map.
                (hmap, vmap) = self.get_maps(enc)
                if vcodes:
                    assert vmap is not None
                    for code in vcodes:
                        put(vmap, code, cid, True)
                    for code in hcodes:
                        put(hmap, code, cid, True)
            if key.endswith('-UTF8'):
                if hcodes:
                    self.cid2unichr_h[cid] = self.get_unichr(hcodes)
                if vcodes:
                    self.cid2unichr_v[cid] = self.get_unichr(vcodes)
                else:
                    for code in hcodes:
                        put(hmap, code, cid)
                        put(vmap, code, cid)
-            if key.endswith('-UTF8') and hcodes:
+
-                code = self.get_unichr(hcodes)
+            # Determine the "most popular" candidate.
-                if cid not in self.cid2unichr_h:
+            if unimap_h:
-                    self.cid2unichr_h[cid] = code
+                self.cid2unichr_h[cid] = pick(unimap_h)
-                if cid not in self.cid2unichr_v:
+                self.cid2unichr_v[cid] = pick(unimap_v or unimap_h)
-                    self.cid2unichr_v[cid] = code
+        return
    def dump_cmap(self, fp, enc):
        data = dict(
            IS_VERTICAL=self.is_vertical.get(enc, False),
            CODE2CID=self.code2cid.get(enc),
        )
        fp.write(pickle.dumps(data))
        return
    def dump_unicodemap(self, fp):
        data = dict(
            CID2UNICHR_H=self.cid2unichr_h,
            CID2UNICHR_V=self.cid2unichr_v,
        )
        fp.write(pickle.dumps(data))
        return
 # main
 def main(argv):
    import getopt
    import gzip
    import os.path
    def usage():
-        print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
+        print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    enc2codec = {}
    for (k, v) in opts:
        if k == '-c':
            (enc,_,codec) = v.partition('=')
            enc2codec[enc] = codec
    if not args: return usage()
    outdir = args.pop(0)
    if not args: return usage()
    regname = args.pop(0)
-    args = argv[1:]
+    converter = CMapConverter(enc2codec)
-    if len(args) < 3: return usage()
+    for path in args:
-    (outdir, regname, src) = args[:3]
+        print >>sys.stderr, 'reading: %r...' % path
-    check_codecs = args[3:]
+        fp = file(path)
    print >>sys.stderr, 'reading %r...' % src
    converter = CMapConverter(check_codecs)
    fp = file(src)
        converter.load(fp)
        fp.close()
-    for (name, cmap) in converter.code2cid.iteritems():
+    for enc in converter.get_encs():
-        fname = '%s.pickle.gz' % name
+        fname = '%s.pickle.gz' % enc
-        print >>sys.stderr, 'writing %r...' % fname
+        path = os.path.join(outdir, fname)
-        fp = gzip.open(os.path.join(outdir, fname), 'wb')
+        print >>sys.stderr, 'writing: %r...' % path
-        data = dict(
+        fp = gzip.open(path, 'wb')
-            IS_VERTICAL=converter.is_vertical.get(name, False),
+        converter.dump_cmap(fp, enc)
            CODE2CID=cmap,
        )
        fp.write(pickle.dumps(data))
        fp.close()
    fname = 'to-unicode-%s.pickle.gz' % regname
-    print >>sys.stderr, 'writing %r...' % fname
+    path = os.path.join(outdir, fname)
-    fp = gzip.open(os.path.join(outdir, fname), 'wb')
+    print >>sys.stderr, 'writing: %r...' % path
-    data = dict(
+    fp = gzip.open(path, 'wb')
-        CID2UNICHR_H=converter.cid2unichr_h,
+    converter.dump_unicodemap(fp)
        CID2UNICHR_V=converter.cid2unichr_v,
    )
    fp.write(pickle.dumps(data))
    fp.close()
    return