From 6ad82e355ca1499fc018380bdd2ada6b688e0e1f Mon Sep 17 00:00:00 2001
From: Yusuke Shinyama <yusuke@shinyama.jp>
Date: Thu, 17 Oct 2013 22:57:48 +0900
Subject: [PATCH] Beating the codepage dragon.

---
 Makefile           |  12 +-
 tools/conv_cmap.py | 284 +++++++++++++++++++++++++--------------------
 2 files changed, 164 insertions(+), 132 deletions(-)

diff --git a/Makefile b/Makefile
index 07966bd..c6953eb 100644
--- a/Makefile
+++ b/Makefile
@@ -41,13 +41,17 @@ cmap_clean:
 $(CMAPDST):
 	$(MKDIR) $(CMAPDST)
 $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
 $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
+		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
 $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
 $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
-	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
 
 test: cmap
 	cd samples && $(MAKE) test
diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py
index ee0e5f8..f7b81d1 100755
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python2
 import sys
-import os.path
-import gzip
 import cPickle as pickle
 
 
@@ -9,159 +7,189 @@ import cPickle as pickle
 ##
 class CMapConverter(object):
 
-    def __init__(self, check_codecs=[]):
-        self.check_codecs = check_codecs
+    def __init__(self, enc2codec=None):
+        self.enc2codec = {} if enc2codec is None else enc2codec  # {column name: Python codec}; fresh dict per instance, never a shared default
         self.code2cid = {} # {'cmapname': ...}
         self.is_vertical = {}
         self.cid2unichr_h = {} # {cid: unichr}
         self.cid2unichr_v = {} # {cid: unichr}
         return
 
+    def get_encs(self):
+        return self.code2cid.keys()  # names of every CMap registered in code2cid so far
+
+    def get_maps(self, enc):
+        """Return the (horizontal, vertical) code-to-CID maps for *enc*.
+
+        Maps are created in self.code2cid on first use.  A name ending
+        in '-H' has no vertical counterpart, so its vertical map is None;
+        the bare name 'H' pairs with 'V'; any other name is expanded to
+        enc+'-H' and enc+'-V'.  Every vertical name is flagged in
+        self.is_vertical.
+        """
+        if enc.endswith('-H'):
+            names = (enc, None)
+        elif enc == 'H':
+            names = ('H', 'V')
+        else:
+            names = (enc+'-H', enc+'-V')
+        (hname, vname) = names
+        hmap = self.code2cid.setdefault(hname, {})
+        if not vname:
+            return (hmap, None)
+        self.is_vertical[vname] = True
+        return (hmap, self.code2cid.setdefault(vname, {}))
+
     def load(self, fp):
-        names = []
+        encs = None
         for line in fp:
             (line,_,_) = line.strip().partition('#')
             if not line: continue
             values = line.split('\t')
-            if not names:
-                names = values
+            if encs is None:
+                encs = values
                 continue
-            d = dict( (k,v) for (k,v) in zip(names, values) if v != '*' )
-            cid = int(d['CID'])
-            for (key,value) in d.iteritems():
-                if key == 'CID': continue
-                self._register(cid, key, value)
-        return
-
-    def get_canonicals(self, name):
-        if name.endswith('-H'):
-            return (name, None)
-        elif name == 'H':
-            return ('H', 'V')
-        else:
-            return (name+'-H', name+'-V')
-
-    def get_unichr(self, codes):
-        # determine the "most popular" candidate.
-        d = {}
-        for code in codes:
-            char = unicode(code, 'utf-8')
-            if char not in d:
-                d[char] = 0
-            for codec in self.check_codecs:
+            
+            def put(dmap, code, cid, force=False):
+                # Store cid under the byte string code in the nested
+                # {byte: submap-or-cid} dict dmap.  An entry already held
+                # by a different cid is overwritten only when force is set.
+                # dmap is None when the encoding has no vertical map.
+                if dmap is None:
+                    return
+                for ch in code[:-1]:
+                    dmap = dmap.setdefault(ord(ch), {})
+                last = ord(code[-1])
+                if force or dmap.get(last, cid) == cid:
+                    dmap[last] = cid
+                return
+            
+            def add(unimap, enc, code):
                 try:
-                    char.encode(codec, 'strict')
-                    d[char] += 1
+                    codec = self.enc2codec[enc]
+                    c = code.decode(codec, 'strict')
+                    if len(c) == 1:
+                        if c not in unimap:
+                            unimap[c] = 0
+                        unimap[c] += 1
+                except KeyError:
+                    pass
                 except UnicodeError:
                     pass
-        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
-        return chars[0]
+                return
+                
+            def pick(unimap):
+                chars = unimap.items()
+                chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)  # highest count first; ties go to the lowest code point
+                (c,_) = chars[0]
+                return c
+                
+            cid = None
+            unimap_h = {}
+            unimap_v = {}
+            for (enc,value) in zip(encs, values):
+                if enc == 'CID':
+                    cid = int(value)
+                    continue
+                assert cid is not None
+                if value == '*':
+                    continue
 
-    def _register(self, cid, key, value):
-        def put(dmap, code, cid, force=False):
-            for b in code[:-1]:
-                b = ord(b)
-                if b in dmap:
-                    dmap = dmap[b]
-                else:
-                    d = {}
-                    dmap[b] = d
-                    dmap = d
-            b = ord(code[-1])
-            if force or ((b not in dmap) or dmap[b] == cid):
-                dmap[b] = cid
-            return
-
-        (hmapname, vmapname) = self.get_canonicals(key)
-        if hmapname in self.code2cid:
-            hmap = self.code2cid[hmapname]
-        else:
-            hmap = {}
-            self.code2cid[hmapname] = hmap
-        vmap = None
-        if vmapname:
-            self.is_vertical[vmapname] = True
-            if vmapname in self.code2cid:
-                vmap = self.code2cid[vmapname]
-            else:
-                vmap = {}
-                self.code2cid[vmapname] = vmap
-        
-        hcodes = []
-        vcodes = []
-        for code in value.split(','):
-            vertical = code.endswith('v')
-            if vertical:
-                code = code[:-1]
-            try:
-                code = code.decode('hex')
-            except:
-                code = chr(int(code, 16))
-            if vertical:
-                vcodes.append(code)
-            else:
-                hcodes.append(code)
-        
-        if vcodes:
-            assert vmap is not None
-            for code in vcodes:
-                put(vmap, code, cid, True)
-            for code in hcodes:
-                put(hmap, code, cid, True)
-            if key.endswith('-UTF8'):
-                if hcodes:
-                    self.cid2unichr_h[cid] = self.get_unichr(hcodes)
+                # hcodes, vcodes: encoded bytes for each writing mode.
+                hcodes = []
+                vcodes = []
+                for code in value.split(','):
+                    vertical = code.endswith('v')
+                    if vertical:
+                        code = code[:-1]
+                    try:
+                        code = code.decode('hex')
+                    except TypeError:  # Python 2 hex codec rejects e.g. an odd digit count; parse the value as one byte instead
+                        code = chr(int(code, 16))
+                    if vertical:
+                        vcodes.append(code)
+                        add(unimap_v, enc, code)
+                    else:
+                        hcodes.append(code)
+                        add(unimap_h, enc, code)
+                # add cid to each map.
+                (hmap, vmap) = self.get_maps(enc)
                 if vcodes:
-                    self.cid2unichr_v[cid] = self.get_unichr(vcodes)
-        else:
-            for code in hcodes:
-                put(hmap, code, cid)
-                put(vmap, code, cid)
-            if key.endswith('-UTF8') and hcodes:
-                code = self.get_unichr(hcodes)
-                if cid not in self.cid2unichr_h:
-                    self.cid2unichr_h[cid] = code
-                if cid not in self.cid2unichr_v:
-                    self.cid2unichr_v[cid] = code
+                    assert vmap is not None
+                    for code in vcodes:
+                        put(vmap, code, cid, True)
+                    for code in hcodes:
+                        put(hmap, code, cid, True)
+                else:
+                    for code in hcodes:
+                        put(hmap, code, cid)
+                        put(vmap, code, cid)
+
+            # Determine the "most popular" candidate.
+            if unimap_h:
+                self.cid2unichr_h[cid] = pick(unimap_h)
+                self.cid2unichr_v[cid] = pick(unimap_v or unimap_h)
+        return
+
+    def dump_cmap(self, fp, enc):
+        """Pickle the vertical flag and code-to-CID map registered
+        under the CMap name *enc* and write the result to *fp*."""
+        vertical = self.is_vertical.get(enc, False)
+        cmap = self.code2cid.get(enc)
+        payload = dict(IS_VERTICAL=vertical, CODE2CID=cmap)
+        fp.write(pickle.dumps(payload))
+        
+    def dump_unicodemap(self, fp):
+        """Pickle the CID-to-Unicode tables for both writing modes
+        (horizontal and vertical) and write the result to *fp*."""
+        horizontal = self.cid2unichr_h
+        vertical = self.cid2unichr_v
+        tables = dict(CID2UNICHR_H=horizontal, CID2UNICHR_V=vertical)
+        fp.write(pickle.dumps(tables))
 
 # main
 def main(argv):
-
-    def usage():
-        print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
-        return 100
+    import getopt
+    import gzip
+    import os.path
     
-    args = argv[1:]
-    if len(args) < 3: return usage()
-    (outdir, regname, src) = args[:3]
-    check_codecs = args[3:]
+    def usage():
+        print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
+        return 100
+    try:
+        (opts, args) = getopt.getopt(argv[1:], 'c:')
+    except getopt.GetoptError:
+        return usage()
+    enc2codec = {}
+    for (k, v) in opts:
+        if k == '-c':
+            (enc,_,codec) = v.partition('=')
+            enc2codec[enc] = codec
+    if not args: return usage()
+    outdir = args.pop(0)
+    if not args: return usage()
+    regname = args.pop(0)
 
-    print >>sys.stderr, 'reading %r...' % src
-    converter = CMapConverter(check_codecs)
-    fp = file(src)
-    converter.load(fp)
-    fp.close()
+    converter = CMapConverter(enc2codec)
+    for path in args:
+        print >>sys.stderr, 'reading: %r...' % path
+        fp = file(path)
+        converter.load(fp)
+        fp.close()
 
-    for (name, cmap) in converter.code2cid.iteritems():
-        fname = '%s.pickle.gz' % name
-        print >>sys.stderr, 'writing %r...' % fname
-        fp = gzip.open(os.path.join(outdir, fname), 'wb')
-        data = dict(
-            IS_VERTICAL=converter.is_vertical.get(name, False),
-            CODE2CID=cmap,
-        )
-        fp.write(pickle.dumps(data))
+    for enc in converter.get_encs():
+        fname = '%s.pickle.gz' % enc
+        path = os.path.join(outdir, fname)
+        print >>sys.stderr, 'writing: %r...' % path
+        fp = gzip.open(path, 'wb')
+        converter.dump_cmap(fp, enc)
         fp.close()
 
     fname = 'to-unicode-%s.pickle.gz' % regname
-    print >>sys.stderr, 'writing %r...' % fname
-    fp = gzip.open(os.path.join(outdir, fname), 'wb')
-    data = dict(
-        CID2UNICHR_H=converter.cid2unichr_h,
-        CID2UNICHR_V=converter.cid2unichr_v,
-    )
-    fp.write(pickle.dumps(data))
+    path = os.path.join(outdir, fname)
+    print >>sys.stderr, 'writing: %r...' % path
+    fp = gzip.open(path, 'wb')
+    converter.dump_unicodemap(fp)
     fp.close()
     return