pdfminer.six/tools/conv_cmap.py

196 lines
5.8 KiB
Python
Raw Normal View History

2013-10-17 14:05:27 +00:00
#!/usr/bin/env python
import binascii
import sys
try:
    import cPickle as pickle
except ImportError:
    import pickle
2013-10-12 04:20:40 +00:00
## CMapConverter
##
class CMapConverter(object):
    """Build code-to-CID and CID-to-Unicode tables from cid2code-style text.

    The input consumed by :meth:`load` is tab-separated text in which ``#``
    starts a comment.  The first non-empty row is a header whose first
    column is ``CID`` and whose remaining columns are encoding names.  Every
    following row starts with an integer CID; each other cell is either
    ``*`` (no mapping) or a comma-separated list of hexadecimal codes, where
    a trailing ``v`` marks a code that applies to vertical writing only.

    Attributes:
        enc2codec: ``{encoding name: Python codec name}`` used to decode
            codes into characters for the Unicode tables.
        code2cid: ``{cmap name: trie}``; each trie is a nested dict keyed by
            integer byte values whose leaves are CIDs.
        is_vertical: ``{cmap name: True}`` for every vertical cmap created.
        cid2unichr_h: ``{cid: character}`` for horizontal writing.
        cid2unichr_v: ``{cid: character}`` for vertical writing.
    """

    def __init__(self, enc2codec=None):
        """Create an empty converter.

        Args:
            enc2codec: optional ``{encoding name: codec name}`` mapping.  A
                mapping passed in is stored as-is (not copied).
        """
        # A fresh dict per instance: a mutable default argument would be
        # shared by every converter created without an explicit mapping.
        self.enc2codec = {} if enc2codec is None else enc2codec
        self.code2cid = {}  # {'cmapname': ...}
        self.is_vertical = {}
        self.cid2unichr_h = {}  # {cid: unichr}
        self.cid2unichr_v = {}  # {cid: unichr}

    def get_encs(self):
        """Return a list of the cmap names collected so far."""
        return list(self.code2cid)

    def get_maps(self, enc):
        """Return the ``(horizontal, vertical)`` tries for column *enc*.

        Missing tries are created and registered in ``code2cid``.  A column
        whose name ends in ``-H`` has no vertical counterpart, so ``None``
        is returned in its place; the column ``H`` pairs with ``V``; any
        other name ``X`` pairs ``X-H`` with ``X-V``.
        """
        if enc.endswith('-H'):
            (hmapenc, vmapenc) = (enc, None)
        elif enc == 'H':
            (hmapenc, vmapenc) = ('H', 'V')
        else:
            (hmapenc, vmapenc) = (enc + '-H', enc + '-V')
        hmap = self.code2cid.setdefault(hmapenc, {})
        vmap = None
        if vmapenc:
            self.is_vertical[vmapenc] = True
            vmap = self.code2cid.setdefault(vmapenc, {})
        return (hmap, vmap)

    def load(self, fp):
        """Read a cid2code-style table from *fp* and merge it into the maps.

        Args:
            fp: an iterable of text lines (for example an open text file).

        Raises:
            AssertionError: if the first data row is not the ``CID`` header,
                or a vertical code appears in a column with no vertical map.
            ValueError: if a CID or a code cannot be parsed as a number.
        """
        encs = None
        for line in fp:
            (line, _, _) = line.strip().partition('#')
            if not line:
                continue
            values = line.split('\t')
            if encs is None:
                assert values[0] == 'CID', (
                    'first data row must be the CID header, got %r'
                    % (values,))
                encs = values
                continue
            self._load_row(encs, values)

    def _load_row(self, encs, values):
        """Merge one data row (already split into cells) into the maps."""
        cid = int(values[0])
        unimap_h = {}
        unimap_v = {}
        for (enc, value) in zip(encs, values):
            if enc == 'CID' or value == '*':
                continue

            # hcodes, vcodes: encoded bytes for each writing mode.
            hcodes = []
            vcodes = []
            for code in value.split(','):
                vertical = code.endswith('v')
                if vertical:
                    code = code[:-1]
                code = self._decode_hex(code)
                if vertical:
                    vcodes.append(code)
                    self._add(unimap_v, enc, code)
                else:
                    hcodes.append(code)
                    self._add(unimap_h, enc, code)

            # add cid to each map.
            (hmap, vmap) = self.get_maps(enc)
            if vcodes:
                assert vmap is not None, (
                    'vertical code given for horizontal-only column %r'
                    % (enc,))
                # Explicit per-mode codes override earlier entries.
                for code in vcodes:
                    self._put(vmap, code, cid, True)
                for code in hcodes:
                    self._put(hmap, code, cid, True)
            else:
                # No vertical-specific code: the horizontal codes serve both
                # modes, but never displace an entry that already exists.
                for code in hcodes:
                    self._put(hmap, code, cid)
                    # "-H" columns have no vertical map (get_maps returns
                    # None for it); inserting into None raised TypeError.
                    if vmap is not None:
                        self._put(vmap, code, cid)

        # Determine the "most popular" candidate.
        if unimap_h:
            self.cid2unichr_h[cid] = self._pick(unimap_h)
        if unimap_v or unimap_h:
            self.cid2unichr_v[cid] = self._pick(unimap_v or unimap_h)

    @staticmethod
    def _decode_hex(code):
        """Turn a hexadecimal cell such as ``'8140'`` into a byte string."""
        try:
            return binascii.unhexlify(code)
        except (TypeError, ValueError):
            # Not a whole number of hex byte pairs (e.g. a single digit):
            # treat the text as the numeric value of one byte.
            return bytes(bytearray([int(code, 16)]))

    @staticmethod
    def _put(dmap, code, cid, force=False):
        """Store *cid* under the byte string *code* in the trie *dmap*.

        An existing entry is only replaced when *force* is true.
        """
        # bytearray yields integer byte values on both Python 2 and 3.
        codes = bytearray(code)
        for b in codes[:-1]:
            dmap = dmap.setdefault(b, {})
        b = codes[-1]
        if force or b not in dmap:
            dmap[b] = cid

    def _add(self, unimap, enc, code):
        """Count the character that *code* decodes to under *enc*'s codec.

        Nothing is counted when *enc* has no configured codec, when the
        bytes do not decode, or when they decode to anything other than
        exactly one character.
        """
        try:
            codec = self.enc2codec[enc]
        except KeyError:
            return
        try:
            char = code.decode(codec, 'strict')
        except UnicodeError:
            return
        if len(char) == 1:
            unimap[char] = unimap.get(char, 0) + 1

    @staticmethod
    def _pick(unimap):
        """Return the most frequent character; ties go to the lowest one."""
        (char, _) = max(unimap.items(),
                        key=lambda item: (item[1], -ord(item[0])))
        return char

    def dump_cmap(self, fp, enc):
        """Write the pickled cmap *enc* to the binary file object *fp*.

        The pickle holds a dict with ``IS_VERTICAL`` (bool) and ``CODE2CID``
        (the trie, or ``None`` when *enc* is unknown).
        """
        data = dict(
            IS_VERTICAL=self.is_vertical.get(enc, False),
            CODE2CID=self.code2cid.get(enc),
        )
        fp.write(pickle.dumps(data))

    def dump_unicodemap(self, fp):
        """Write the pickled CID-to-character tables to *fp*.

        The pickle holds a dict with ``CID2UNICHR_H`` and ``CID2UNICHR_V``.
        """
        data = dict(
            CID2UNICHR_H=self.cid2unichr_h,
            CID2UNICHR_V=self.cid2unichr_v,
        )
        fp.write(pickle.dumps(data))
# main
def main(argv):
    """Command-line entry point: convert cid2code tables to pickled cmaps.

    Usage: ``prog [-c enc=codec] output_dir regname [cid2code.txt ...]``

    Each input file is loaded into one converter.  Every resulting cmap is
    written to ``<output_dir>/<cmap>.pickle.gz`` and the Unicode tables to
    ``<output_dir>/to-unicode-<regname>.pickle.gz``.

    Args:
        argv: the full argument vector, program name first.

    Returns:
        100 after printing the usage line when the arguments are invalid,
        otherwise None.
    """
    import getopt
    import gzip
    import os.path

    def usage():
        print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    enc2codec = {}
    for (k, v) in opts:
        if k == '-c':
            (enc, _, codec) = v.partition('=')
            # A "-c" value without both sides would be stored as an empty
            # name and only fail later, deep inside the converter.
            if not enc or not codec:
                return usage()
            enc2codec[enc] = codec
    if len(args) < 2:
        return usage()
    outdir = args.pop(0)
    regname = args.pop(0)

    converter = CMapConverter(enc2codec)
    for path in args:
        print('reading: %r...' % path)
        # "with" closes the file even when load() raises.
        with open(path) as fp:
            converter.load(fp)

    for enc in converter.get_encs():
        fname = '%s.pickle.gz' % enc
        path = os.path.join(outdir, fname)
        print('writing: %r...' % path)
        with gzip.open(path, 'wb') as fp:
            converter.dump_cmap(fp, enc)

    fname = 'to-unicode-%s.pickle.gz' % regname
    path = os.path.join(outdir, fname)
    print('writing: %r...' % path)
    with gzip.open(path, 'wb') as fp:
        converter.dump_unicodemap(fp)
# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))