Beating the codepage dragon.

2013-10-17 22:57:48 +09:00 · 2013-10-17 22:57:48 +09:00 · 6ad82e355c
parent 8d42eec94d
commit 6ad82e355c
2 changed files with 164 additions and 132 deletions
--- a/12
+++ b/12
@@ -41,13 +41,17 @@ cmap_clean:
 $(CMAPDST):
 	$(MKDIR) $(CMAPDST)
 $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
 $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
 		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
 $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
 $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
 test: cmap
 	cd samples && $(MAKE) test
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python2
 import sys
 import os.path
 import gzip
 import cPickle as pickle
@@ -9,159 +7,189 @@ import cPickle as pickle
 ##
 class CMapConverter(object):
-    def __init__(self, check_codecs=[]):
+    def __init__(self, enc2codec={}):
-        self.check_codecs = check_codecs
+        self.enc2codec = enc2codec
        self.code2cid = {} # {'cmapname': ...}
        self.is_vertical = {}
        self.cid2unichr_h = {} # {cid: unichr}
        self.cid2unichr_v = {} # {cid: unichr}
        return
    def get_encs(self):
        return self.code2cid.keys()
    def get_maps(self, enc):
        if enc.endswith('-H'):
            (hmapenc, vmapenc) = (enc, None)
        elif enc == 'H':
            (hmapenc, vmapenc) = ('H', 'V')
        else:
            (hmapenc, vmapenc) = (enc+'-H', enc+'-V')
        if hmapenc in self.code2cid:
            hmap = self.code2cid[hmapenc]
        else:
            hmap = {}
            self.code2cid[hmapenc] = hmap
        vmap = None
        if vmapenc:
            self.is_vertical[vmapenc] = True
            if vmapenc in self.code2cid:
                vmap = self.code2cid[vmapenc]
            else:
                vmap = {}
                self.code2cid[vmapenc] = vmap
        return (hmap, vmap)
    def load(self, fp):
-        names = []
+        encs = None
        for line in fp:
            (line,_,_) = line.strip().partition('#')
            if not line: continue
            values = line.split('\t')
-            if not names:
+            if encs is None:
-                names = values
+                encs = values
                continue
-            d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' )
+            
-            cid = int(d['CID'])
+            def put(dmap, code, cid, force=False):
-            for (key,value) in d.iteritems():
+                for b in code[:-1]:
-                if key == 'CID': continue
+                    b = ord(b)
-                self._register(cid, key, value)
+                    if b in dmap:
-        return
+                        dmap = dmap[b]
-
+                    else:
-    def get_canonicals(self, name):
+                        d = {}
-        if name.endswith('-H'):
+                        dmap[b] = d
-            return (name, None)
+                        dmap = d
-        elif name == 'H':
+                b = ord(code[-1])
-            return ('H', 'V')
+                if force or ((b not in dmap) or dmap[b] == cid):
-        else:
+                    dmap[b] = cid
-            return (name+'-H', name+'-V')
+                return
-
+            
-    def get_unichr(self, codes):
+            def add(unimap, enc, code):
        # determine the "most popular" candidate.
        d = {}
        for code in codes:
            char = unicode(code, 'utf-8')
            if char not in d:
                d[char] = 0
            for codec in self.check_codecs:
                try:
-                    char.encode(codec, 'strict')
+                    codec = self.enc2codec[enc]
-                    d[char] += 1
+                    c = code.decode(codec, 'strict')
                    if len(c) == 1:
                        if c not in unimap:
                            unimap[c] = 0
                        unimap[c] += 1
                except KeyError:
                    pass
                except UnicodeError:
                    pass
-        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
+                return
-        return chars[0]
+                
            def pick(unimap):
                chars = unimap.items()
                chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
                (c,_) = chars[0]
                return c
            cid = None
            unimap_h = {}
            unimap_v = {}
            for (enc,value) in zip(encs, values):
                if enc == 'CID':
                    cid = int(value)
                    continue
                assert cid is not None
                if value == '*':
                    continue
-    def _register(self, cid, key, value):
+                # hcodes, vcodes: encoded bytes for each writing mode.
-        def put(dmap, code, cid, force=False):
+                hcodes = []
-            for b in code[:-1]:
+                vcodes = []
-                b = ord(b)
+                for code in value.split(','):
-                if b in dmap:
+                    vertical = code.endswith('v')
-                    dmap = dmap[b]
+                    if vertical:
-                else:
+                        code = code[:-1]
-                    d = {}
+                    try:
-                    dmap[b] = d
+                        code = code.decode('hex')
-                    dmap = d
+                    except:
-            b = ord(code[-1])
+                        code = chr(int(code, 16))
-            if force or ((b not in dmap) or dmap[b] == cid):
+                    if vertical:
-                dmap[b] = cid
+                        vcodes.append(code)
-            return
+                        add(unimap_v, enc, code)
-
+                    else:
-        (hmapname, vmapname) = self.get_canonicals(key)
+                        hcodes.append(code)
-        if hmapname in self.code2cid:
+                        add(unimap_h, enc, code)
-            hmap = self.code2cid[hmapname]
+                # add cid to each map.
-        else:
+                (hmap, vmap) = self.get_maps(enc)
            hmap = {}
            self.code2cid[hmapname] = hmap
        vmap = None
        if vmapname:
            self.is_vertical[vmapname] = True
            if vmapname in self.code2cid:
                vmap = self.code2cid[vmapname]
            else:
                vmap = {}
                self.code2cid[vmapname] = vmap
        hcodes = []
        vcodes = []
        for code in value.split(','):
            vertical = code.endswith('v')
            if vertical:
                code = code[:-1]
            try:
                code = code.decode('hex')
            except:
                code = chr(int(code, 16))
            if vertical:
                vcodes.append(code)
            else:
                hcodes.append(code)
        if vcodes:
            assert vmap is not None
            for code in vcodes:
                put(vmap, code, cid, True)
            for code in hcodes:
                put(hmap, code, cid, True)
            if key.endswith('-UTF8'):
                if hcodes:
                    self.cid2unichr_h[cid] = self.get_unichr(hcodes)
                if vcodes:
-                    self.cid2unichr_v[cid] = self.get_unichr(vcodes)
+                    assert vmap is not None
-        else:
+                    for code in vcodes:
-            for code in hcodes:
+                        put(vmap, code, cid, True)
-                put(hmap, code, cid)
+                    for code in hcodes:
-                put(vmap, code, cid)
+                        put(hmap, code, cid, True)
-            if key.endswith('-UTF8') and hcodes:
+                else:
-                code = self.get_unichr(hcodes)
+                    for code in hcodes:
-                if cid not in self.cid2unichr_h:
+                        put(hmap, code, cid)
-                    self.cid2unichr_h[cid] = code
+                        put(vmap, code, cid)
-                if cid not in self.cid2unichr_v:
+
-                    self.cid2unichr_v[cid] = code
+            # Determine the "most popular" candidate.
            if unimap_h:
                self.cid2unichr_h[cid] = pick(unimap_h)
                self.cid2unichr_v[cid] = pick(unimap_v or unimap_h)
        return
    def dump_cmap(self, fp, enc):
        data = dict(
            IS_VERTICAL=self.is_vertical.get(enc, False),
            CODE2CID=self.code2cid.get(enc),
        )
        fp.write(pickle.dumps(data))
        return
    def dump_unicodemap(self, fp):
        data = dict(
            CID2UNICHR_H=self.cid2unichr_h,
            CID2UNICHR_V=self.cid2unichr_v,
        )
        fp.write(pickle.dumps(data))
        return
 # main
 def main(argv):
-
+    import getopt
-    def usage():
+    import gzip
-        print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
+    import os.path
        return 100
-    args = argv[1:]
+    def usage():
-    if len(args) < 3: return usage()
+        print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
-    (outdir, regname, src) = args[:3]
+        return 100
-    check_codecs = args[3:]
+    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    enc2codec = {}
    for (k, v) in opts:
        if k == '-c':
            (enc,_,codec) = v.partition('=')
            enc2codec[enc] = codec
    if not args: return usage()
    outdir = args.pop(0)
    if not args: return usage()
    regname = args.pop(0)
-    print >>sys.stderr, 'reading %r...' % src
+    converter = CMapConverter(enc2codec)
-    converter = CMapConverter(check_codecs)
+    for path in args:
-    fp = file(src)
+        print >>sys.stderr, 'reading: %r...' % path
-    converter.load(fp)
+        fp = file(path)
-    fp.close()
+        converter.load(fp)
        fp.close()
-    for (name, cmap) in converter.code2cid.iteritems():
+    for enc in converter.get_encs():
-        fname = '%s.pickle.gz' % name
+        fname = '%s.pickle.gz' % enc
-        print >>sys.stderr, 'writing %r...' % fname
+        path = os.path.join(outdir, fname)
-        fp = gzip.open(os.path.join(outdir, fname), 'wb')
+        print >>sys.stderr, 'writing: %r...' % path
-        data = dict(
+        fp = gzip.open(path, 'wb')
-            IS_VERTICAL=converter.is_vertical.get(name, False),
+        converter.dump_cmap(fp, enc)
            CODE2CID=cmap,
        )
        fp.write(pickle.dumps(data))
        fp.close()
    fname = 'to-unicode-%s.pickle.gz' % regname
-    print >>sys.stderr, 'writing %r...' % fname
+    path = os.path.join(outdir, fname)
-    fp = gzip.open(os.path.join(outdir, fname), 'wb')
+    print >>sys.stderr, 'writing: %r...' % path
-    data = dict(
+    fp = gzip.open(path, 'wb')
-        CID2UNICHR_H=converter.cid2unichr_h,
+    converter.dump_unicodemap(fp)
        CID2UNICHR_V=converter.cid2unichr_v,
    )
    fp.write(pickle.dumps(data))
    fp.close()
    return