pdfminer.six/pdfminer/cmapdb.py

#!/usr/bin/env python

""" Adobe character mapping (CMap) support.

CMaps provide the mappings from character codes and Unicode
code points to character ids (CIDs).

More information is available on the Adobe website:

  http://opensource.adobe.com/wiki/display/cmap/CMap+Resources

"""

import sys
import re
import os
import os.path
from struct import pack, unpack
from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name
from fontmetrics import FONT_METRICS
from latin_enc import ENCODING
from glyphlist import charname2unicode
from utils import choplist, nunpack
try:
    import cdb
except ImportError:
    import pdfminer.pycdb as cdb


class CMapError(Exception):
    """Base exception type for CMap-related errors in this module."""


##  find_cmap_path
##
def find_cmap_path():
    """Returns the location of CMap directory.

    Two candidates are tried in order and the first one that is an
    existing directory is returned:

      1. the value of the CMAP_PATH environment variable ('.' when the
         variable is not set);
      2. the 'CMap' directory located next to this module.

    Raises IOError, naming the candidates that were tried, when neither
    of them is a directory.
    """
    candidates = (
        os.environ.get('CMAP_PATH', '.'),
        os.path.join(os.path.dirname(__file__), 'CMap'),
    )
    for path in candidates:
        if os.path.isdir(path):
            return path
    # Report what was searched so the failure can be diagnosed.
    tried = ', '.join([ repr(path) for path in candidates ])
    raise IOError('CMap directory not found (tried: %s)' % tried)


##  name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
# 'uni' followed by exactly four uppercase hexadecimal digits.
_UNI_NAME = re.compile(r'uni([0-9A-F]{4})\Z')
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    Lookup order:
      1. an exact entry in the charname2unicode table;
      2. a name of the form 'uniXXXX', whose four digits are read as a
         hexadecimal code point (e.g. 'uni20AC' -> 0x20AC);
      3. otherwise the first run of decimal digits found in the name
         (e.g. 'g123' -> 123).

    Returns the code point as an int.
    Raises KeyError(name) if none of the rules applies.
    """
    if name in charname2unicode:
        return charname2unicode[name]
    # The digits of a 'uniXXXX' name are hexadecimal; without this step the
    # decimal fallback below would turn 'uni20AC' into 20.
    m = _UNI_NAME.match(name)
    if m:
        return int(m.group(1), 16)
    m = STRIP_NAME.search(name)
    if not m: raise KeyError(name)
    return int(m.group(0))


##  CMap
##
class CMap(object):
    """In-memory character map.

    Keeps three dictionaries:
      code2cid -- character code (str) -> CID (int)
      cid2code -- CID (int) -> code (str)
      attrs    -- CMap attributes (e.g. 'CMapName', 'WMode')
    """

    debug = 0

    def __init__(self):
        # All tables start out empty.
        self.attrs = {}
        self.cid2code = {}
        self.code2cid = {}

    def __repr__(self):
        cmapname = self.attrs.get('CMapName')
        return '<CMap: %s>' % cmapname

    def update(self, code2cid=None, cid2code=None):
        """Merge the given mappings into this CMap; returns self."""
        pairs = ((self.code2cid, code2cid), (self.cid2code, cid2code))
        for (table, extra) in pairs:
            if extra:
                table.update(extra)
        return self

    def copycmap(self, cmap):
        """Copy every code->cid and cid->code entry of cmap; returns self."""
        # The first iterator is fully consumed before the second is requested.
        code_items = cmap.getall_code2cid()
        self.code2cid.update(code_items)
        cid_items = cmap.getall_cid2code()
        self.cid2code.update(cid_items)
        return self

    def register_code2cid(self, code, cid):
        """Record code -> cid when code is a str and cid an int; returns self."""
        if not (isinstance(code, str) and isinstance(cid, int)):
            return self
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        """Record cid -> code; returns self.

        A PSLiteral code is treated as a glyph name and stored as the
        2-byte big-endian packing of name2unicode(code.name); a str code
        is stored as is.  Anything else (or a non-int cid) is ignored.
        """
        if not isinstance(cid, int):
            return self
        if isinstance(code, PSLiteral):
            codepoint = name2unicode(code.name)
            self.cid2code[cid] = pack('>H', codepoint)
        elif isinstance(code, str):
            self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        """Generate the CIDs for a string of character codes.

        Each character is looked up on its own first; a character with no
        entry is held back and combined with the next character to form a
        two-character code.  Pairs without an entry yield nothing.
        """
        if self.debug:
            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
        pending = ''
        for ch in bytes:
            if pending:
                pair = pending+ch
                pending = ''
                if pair in self.code2cid:
                    yield self.code2cid[pair]
                continue
            if ch in self.code2cid:
                yield self.code2cid[ch]
            else:
                pending = ch
        return

    def is_vertical(self):
        """Return the 'WMode' attribute, or 0 when it is absent."""
        wmode = self.attrs.get('WMode', 0)
        return wmode

    def tocid(self, code):
        """Return the CID registered for code, or None."""
        return self.code2cid.get(code, None)

    def tocode(self, cid):
        """Return the code registered for cid, or None."""
        return self.cid2code.get(cid, None)

    def getall_attrs(self):
        """Iterate over (name, value) attribute pairs."""
        return self.attrs.iteritems()

    def getall_code2cid(self):
        """Iterate over (code, cid) pairs."""
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        """Iterate over (cid, code) pairs."""
        return self.cid2code.iteritems()


##  CDBCMap
##
class CDBCMap(CMap):
    """CMap whose entries are read on demand from a cdb file.

    Key layout used by this class:
      '/' + attribute name         -> attribute value
      'c' + character code         -> CID packed as 4-byte big-endian ('>L')
      'i' + CID packed with '>L'   -> character code

    Codes resolved by decode() are cached in the inherited code2cid dict.
    """

    def __init__(self, cdbname):
        """Open the cdb file cdbname."""
        CMap.__init__(self)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        """Return the CID (int) stored for code, or None if there is none."""
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() returns a 1-tuple; hand back the bare integer so the
        # result matches CMap.tocid() and what getall()/decode() produce.
        return unpack('>L', self.db[k])[0]
    def tocode(self, cid):
        """Return the code stored for cid, or None if there is none."""
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        """True when the '/WMode' entry exists and equals the string '1'."""
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def getall(self, c):
        """Generate (key-without-prefix, int) for every db key starting with c.

        Walks the database with db.each() until it returns a false value;
        each matching value is decoded as a 4-byte big-endian integer.
        """
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(c):
                yield (k[1:], unpack('>L', v)[0])
        return

    def getall_attrs(self):
        """Generate (name, value) for every db key starting with '/'."""
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('/'):
                # NOTE(security): eval() executes whatever expression is
                # stored in the cdb file; only use cdb files from a trusted
                # source.
                # NOTE(review): the [0] assumes every stored value evaluates
                # to an indexable object -- confirm against the writer.
                yield (k[1:], eval(v)[0])
        return

    def getall_cid2code(self):
        """Iterate over the 'i'-prefixed entries via getall()."""
        return self.getall('i')
    def getall_code2cid(self):
        """Iterate over the 'c'-prefixed entries via getall()."""
        return self.getall('c')

    def decode(self, bytes):
        """Generate the CIDs for a string of character codes.

        Same one-/two-character scheme as CMap.decode(), except that a
        code missing from the code2cid cache is looked up in the db and
        cached on success.
        """
        if self.debug:
            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                # x holds an unmatched first character; try the pair.
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return


##  CMapDB
##
class CMapDB(object):
    """Loads CMaps by name and caches them.

    For a given name the cdb file '<name>.cmap.cdb' in cdbdirname is
    preferred; otherwise the CMap source file '<name>' in dirname is parsed.
    """

    class CMapNotFound(CMapError): pass

    # Optional name -> name substitutions applied before lookup.
    CMAP_ALIAS = { }
    debug = 0

    def __init__(self, dirname=None, cdbdirname=None):
        """
        dirname    -- directory holding CMap source files; when false,
                      find_cmap_path() is used.
        cdbdirname -- directory holding .cmap.cdb files; defaults to dirname.
        """
        if not dirname:
            dirname = find_cmap_path()
        self.dirname = dirname
        self.cdbdirname = cdbdirname or dirname
        self.cmapdb = {}
        return

    def get_cmap(self, cmapname, strict=True):
        """Return the CMap called cmapname, loading it on first use.

        With strict true (the default) an unknown name raises
        CMapDB.CMapNotFound; otherwise an empty CMap is created.  Whatever
        is returned is cached under the (alias-resolved) name.
        """
        cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in self.cmapdb:
            cmap = self.cmapdb[cmapname]
        else:
            fname = os.path.join(self.dirname, cmapname)
            cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
            if os.path.exists(cdbname):
                if 1 <= self.debug:
                    print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname
                cmap = CDBCMap(cdbname)
            elif os.path.exists(fname):
                if 1 <= self.debug:
                    print >>sys.stderr, 'Reading: CMap %r...' % fname
                cmap = CMap()
                fp = open(fname, 'rb')
                # Close the file even when the parser raises.
                try:
                    CMapParser(self, cmap, fp).run()
                finally:
                    fp.close()
            elif not strict:
                cmap = CMap() # just create empty cmap
            else:
                raise CMapDB.CMapNotFound(cmapname)
            self.cmapdb[cmapname] = cmap
        return cmap


##  CMapParser
##
class CMapParser(PSStackParser):
    """Stack-based parser that fills a CMap object from CMap source text.

    Operands collect on the PSStackParser stack; do_keyword() consumes them
    when an operator keyword arrives and records the result in self.cmap.
    """

    def __init__(self, cmapdb, cmap, fp):
        """
        cmapdb -- CMapDB used to resolve 'usecmap' references; a false
                  value disables 'usecmap' handling.
        cmap   -- CMap object receiving the parsed entries.
        fp     -- file object passed on to PSStackParser.
        """
        PSStackParser.__init__(self, fp)
        self.cmapdb = cmapdb
        self.cmap = cmap
        self.in_cmap = False
        return

    def run(self):
        """Run the parser; hitting end of input (PSEOF) ends it normally."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    def do_keyword(self, pos, token):
        """Handle one keyword token.

        Keywords outside a begincmap/endcmap pair are ignored.  Inside,
        'def', 'usecmap' and the cidrange/cidchar/bfrange/bfchar sections
        update self.cmap; codespacerange and notdefrange sections are
        discarded.  Any other keyword is pushed back on the stack.
        Malformed section entries are skipped.
        """
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap: return
        #
        if name == 'def':
            # <key> <value> def  -> CMap attribute
            try:
                ((_,k),(_,v)) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            # <cmapname> usecmap -> copy all entries of the named CMap
            if self.cmapdb:
                try:
                    ((_,cmapname),) = self.pop(1)
                    self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname)))
                except PSSyntaxError:
                    pass
            return

        if name == 'begincodespacerange':
            self.popall()
            return
        if name == 'endcodespacerange':
            self.popall()
            return

        if name == 'begincidrange':
            self.popall()
            return
        if name == 'endcidrange':
            # Entries are (start code, end code, first cid) triples.
            objs = [ obj for (_,obj) in self.popall() ]
            for (s,e,cid) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                    not isinstance(cid, int) or len(s) != len(e)): continue
                # Only the last (up to) four bytes may vary across the
                # range; whatever precedes them must be identical.
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix: continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                # A reversed range (e1 < s1) gives an empty xrange.
                for i in xrange(e1-s1+1):
                    # Re-encode with the same byte width as the variable part.
                    x = sprefix+pack('>L',s1+i)[-vlen:]
                    self.cmap.register_code2cid(x, cid+i)
            return

        if name == 'begincidchar':
            self.popall()
            return
        if name == 'endcidchar':
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(code, str) and isinstance(cid, str):
                    self.cmap.register_code2cid(code, nunpack(cid))
            return

        if name == 'beginbfrange':
            self.popall()
            return
        if name == 'endbfrange':
            # Entries are (start, end, destination) triples; the destination
            # is either a list with one item per code or a single string
            # that is incremented across the range.
            objs = [ obj for (_,obj) in self.popall() ]
            for (s,e,code) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                    len(s) != len(e)): continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                # A reversed range (e1 < s1) gives an empty xrange.
                if isinstance(code, list):
                    # Never index past the end of a list that is shorter
                    # than the declared range.
                    for i in xrange(min(e1-s1+1, len(code))):
                        self.cmap.register_cid2code(s1+i, code[i])
                elif isinstance(code, str):
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
                        self.cmap.register_cid2code(s1+i, x)
                # Any other destination type is malformed; skip the entry.
            return

        if name == 'beginbfchar':
            self.popall()
            return
        if name == 'endbfchar':
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
                    self.cmap.register_cid2code(nunpack(cid), code)
            return

        if name == 'beginnotdefrange':
            self.popall()
            return
        if name == 'endnotdefrange':
            self.popall()
            return

        # Not an operator handled here: keep it on the stack.
        self.push((pos, token))
        return


##  FontMetricsDB
##
class FontMetricsDB(object):
    """Name-based access to the FONT_METRICS table."""

    @classmethod
    def get_metrics(klass, fontname):
        """Return the FONT_METRICS entry stored under fontname."""
        metrics = FONT_METRICS[fontname]
        return metrics


##  EncodingDB
##
class EncodingDB(object):
    """Character-code -> unicode tables for the predefined encodings.

    The four tables are filled once, while the class body executes, from
    the rows of ENCODING.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # Each row is (glyph name, std code, mac code, win code, pdf code);
    # a false code means the glyph is absent from that encoding.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
      'StandardEncoding': std2unicode,
      'MacRomanEncoding': mac2unicode,
      'WinAnsiEncoding': win2unicode,
      'PDFDocEncoding': pdf2unicode,
      }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Return the code -> unicode table for the encoding called name.

        Unknown names fall back to the standard encoding table.  When diff
        is given, a copy of the table is returned with the differences
        applied: an int sets the current code, and each PSLiteral glyph
        name is assigned to the current code, which then advances by one.
        Glyph names that name2unicode() rejects with KeyError are skipped
        but still advance the code.
        """
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        table = base.copy()
        cid = 0
        for item in diff:
            if isinstance(item, int):
                cid = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[cid] = unichr(name2unicode(item.name))
            except KeyError:
                pass
            cid += 1
        return table


##  CMap -> CMapCDB conversion
##
def dump_cdb(cmap, cdbfile, verbose=1):
    """Writes a CMap object into a cdb file.

    cmap    -- object providing getall_attrs(), getall_code2cid() and
               getall_cid2code().
    cdbfile -- output file name; cdbfile+'.tmp' is passed to cdbmake as
               the temporary file.
    verbose -- when true, a progress line is printed to stderr.
    """
    maker = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>sys.stderr, 'Writing: %r...' % cdbfile
    # Attributes: '/' + name -> repr(value)
    for (attrname, value) in cmap.getall_attrs():
        maker.add('/'+attrname, repr(value))
    # Code to CID: 'c' + code -> big-endian 32-bit CID
    for (charcode, cidvalue) in cmap.getall_code2cid():
        maker.add('c'+charcode, pack('>L',cidvalue))
    # CID to code: 'i' + big-endian 32-bit CID -> code
    for (cidvalue, charcode) in cmap.getall_cid2code():
        maker.add('i'+pack('>L',cidvalue), charcode)
    maker.finish()
    return

def convert_cmap(cmapdir, outputdir, force=False):
    """Convert all CMap source files in a directory into cdb files.

    Directory entries whose name contains a '.' are ignored.  An existing
    output file is left alone unless force is true.
    """
    cmapdb = CMapDB(cmapdir)
    for entry in os.listdir(cmapdir):
        if '.' in entry:
            continue
        cmapname = os.path.basename(entry)
        target = os.path.join(outputdir, cmapname+'.cmap.cdb')
        if force or not os.path.exists(target):
            print >>sys.stderr, 'Reading: %r...' % cmapname
            dump_cdb(cmapdb.get_cmap(cmapname), target)
        else:
            print >>sys.stderr, 'Skipping: %r' % cmapname
    return

def main(argv):
    """Converts CMap files into cdb files.

    usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
    """

    import getopt
    def usage():
        print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'f')
    except getopt.GetoptError:
        return usage()
    if args:
        cmapdir = args.pop(0)
    else:
        try:
            cmapdir = find_cmap_path()
        except IOError:
            print >>sys.stderr, 'cannot find CMap directory'
            return 1
    if args:
        outputdir = args.pop(0)
    else:
        outputdir = cmapdir
    force = False
    for (k, v) in opts:
        if k == '-f': force = True
    if not os.path.isdir(cmapdir):
        print >>sys.stderr, 'directory does not exist: %r' % cmapdir
        return 1
    if not os.path.isdir(outputdir):
        print >>sys.stderr, 'directory does not exist: %r' % outputdir
        return 1
    return convert_cmap(cmapdir, outputdir, force=force)

# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))