pdfminer.six/pdfminer/cmap.py

#!/usr/bin/env python
import sys
import re
import os
import os.path
from sys import stderr
from struct import pack
from struct import unpack
from fontmetrics import FONT_METRICS
from latin_enc import ENCODING
from glyphlist import charname2unicode
from psparser import PSException
from psparser import PSSyntaxError
from psparser import PSTypeError
from psparser import PSEOF
from psparser import PSLiteral
from psparser import PSKeyword
from psparser import literal_name
from psparser import keyword_name
from psparser import PSStackParser
from utils import choplist
from utils import nunpack
try:
    import cdb
except ImportError:
    import pdfminer.pycdb as cdb


class CMapError(Exception):
    """Base exception for CMap handling; CMapDB.CMapNotFound derives from it."""


##  find_cmap_path
##
def find_cmap_path():
    """Return the directory that holds the CMap files.

    The CMAP_PATH environment variable wins when it is set (even if it is
    empty); otherwise the 'CMap' directory next to this module is used.
    """
    if 'CMAP_PATH' in os.environ:
        return os.environ['CMAP_PATH']
    here = os.path.dirname(__file__)
    return os.path.join(here, 'CMap')


# Matches the first run of decimal digits inside a glyph name.
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
    """Translate a glyph name into an integer code.

    Names present in charname2unicode are looked up there.  Any other
    name is resolved from the first run of decimal digits it contains,
    interpreted as a base-10 integer.

    Raises:
        KeyError: the name is unknown and contains no digits.
    """
    try:
        return charname2unicode[name]
    except KeyError:
        pass
    digits = STRIP_NAME.search(name)
    if digits is None:
        raise KeyError(name)
    return int(digits.group(0))


##  CMap
##
class CMap(object):
    """In-memory character map.

    Attributes:
        code2cid: dict mapping character-code strings to integer CIDs.
        cid2code: dict mapping integer CIDs to code strings.
        attrs: dict of CMap attributes (e.g. 'CMapName', 'WMode').
    """

    # Non-zero enables a trace line on stderr for every decode() call.
    debug = 0

    def __init__(self):
        self.attrs = {}
        self.code2cid = {}
        self.cid2code = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        """Merge the given mappings into this CMap and return self.

        Arguments that are None or empty are ignored.
        """
        pairs = ((self.code2cid, code2cid), (self.cid2code, cid2code))
        for (table, extra) in pairs:
            if extra:
                table.update(extra)
        return self

    def copycmap(self, cmap):
        """Copy every code->cid and cid->code entry of cmap; return self."""
        # code2cid is drained first, then cid2code (same order as before).
        for (table, entries) in ((self.code2cid, cmap.getall_code2cid()),
                                 (self.cid2code, cmap.getall_cid2code())):
            table.update(entries)
        return self

    def register_code2cid(self, code, cid):
        """Record code -> cid; silently ignored unless code is str and cid is int."""
        if not isinstance(code, str) or not isinstance(cid, int):
            return self
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        """Record cid -> code; returns self.

        A PSLiteral code is treated as a glyph name and stored as its
        name2unicode() value packed as a big-endian 16-bit string; a str
        code is stored unchanged.  Anything else (or a non-int cid) is
        ignored.
        """
        if not isinstance(cid, int):
            return self
        if isinstance(code, PSLiteral):
            self.cid2code[cid] = pack('>H', name2unicode(code.name))
        elif isinstance(code, str):
            self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        """Yield the CID of each code found in bytes.

        Each element is first tried as a one-unit code.  If that fails
        it is held back and combined with the following element; when the
        pair is not a known code either, both elements are dropped.
        """
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        pending = ''
        for unit in bytes:
            if pending:
                pair = pending+unit
                pending = ''
                if pair in self.code2cid:
                    yield self.code2cid[pair]
                continue
            if unit in self.code2cid:
                yield self.code2cid[unit]
            else:
                pending = unit
        return

    def is_vertical(self):
        """Return the 'WMode' attribute, or 0 when it is not set."""
        return self.attrs.get('WMode', 0)

    def tocid(self, code):
        """Return the CID registered for code, or None."""
        return self.code2cid.get(code)
    def tocode(self, cid):
        """Return the code registered for cid, or None."""
        return self.cid2code.get(cid)

    def getall_attrs(self):
        """Iterate over (name, value) attribute pairs."""
        return self.attrs.iteritems()
    def getall_code2cid(self):
        """Iterate over (code, cid) pairs."""
        return self.code2cid.iteritems()
    def getall_cid2code(self):
        """Iterate over (cid, code) pairs."""
        return self.cid2code.iteritems()


##  CDBCMap
##
class CDBCMap(CMap):
    """CMap whose tables live in a constant database (cdb) file.

    Key layout used by this class (and written by dumpcdb()):
      '/'+name             -> attribute value
      'c'+code             -> CID packed as big-endian unsigned 32-bit
      'i'+pack('>L', cid)  -> code string

    Codes resolved by decode() are cached in the inherited code2cid dict.
    """

    def __init__(self, cdbname):
        CMap.__init__(self)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        """Return the integer CID stored for code, or None if absent."""
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() always returns a tuple; callers expect the bare integer,
        # matching CMap.tocid() and the values yielded by decode().
        (cid,) = unpack('>L', self.db[k])
        return cid
    def tocode(self, cid):
        """Return the code string stored for cid, or None if absent."""
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        """Return True when the stored '/WMode' value is the string '1'."""
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def getall(self, c):
        """Yield (key-without-prefix, unpacked-int) for keys starting with c.

        Walks the database with each() until it returns a false value.
        """
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(c):
                yield (k[1:], unpack('>L', v)[0])
        return

    def getall_attrs(self):
        """Yield (name, value) for every '/'-prefixed record."""
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('/'):
                # NOTE(review): eval() on database contents is only safe for
                # trusted cdb files.  The trailing [0] also looks inconsistent
                # with dumpcdb(), which stores repr(value) -- confirm before
                # relying on this method.
                yield (k[1:], eval(v)[0])
        return

    def getall_cid2code(self):
        return self.getall('i')
    def getall_code2cid(self):
        return self.getall('c')

    def _lookup(self, code):
        """Return the CID for code from the cache or the db, else None.

        Database hits are stored in self.code2cid for later calls.
        """
        if code in self.code2cid:
            return self.code2cid[code]
        k = 'c'+code
        if self.db.has_key(k):
            (dest,) = unpack('>L', self.db[k])
            self.code2cid[code] = dest
            return dest
        return None

    def decode(self, bytes):
        """Yield the CID of each code found in bytes.

        Same scanning rule as CMap.decode(): try one unit, otherwise
        combine it with the next unit; unknown pairs are dropped.
        """
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                dest = self._lookup(x+c)
                x = ''
                if dest is not None:
                    yield dest
            else:
                dest = self._lookup(c)
                if dest is not None:
                    yield dest
                else:
                    x = c
        return


##  CMapDB
##
class CMapDB(object):
    """Class-level loader and cache for CMaps.

    Class attributes:
        CMAP_ALIAS: maps alternative CMap names to canonical ones.
        debug: when >= 1, load operations are reported on stderr.
        dirname: directory holding CMap source files.
        cdbdirname: directory holding '<name>.cmap.cdb' files.
        cmapdb: cache of already loaded CMaps, keyed by name.
    """

    class CMapNotFound(CMapError): pass

    CMAP_ALIAS = {
      }

    debug = 0
    dirname = None
    cdbdirname = None
    cmapdb = {}

    @classmethod
    def initialize(klass, dirname=None, cdbdirname=None):
        """Set the search directories.

        dirname defaults to find_cmap_path(); cdbdirname defaults to
        dirname.
        """
        if not dirname:
            dirname = find_cmap_path()
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        return

    @classmethod
    def get_cmap(klass, cmapname, strict=True):
        """Return the CMap called cmapname, loading and caching it if needed.

        A '<name>.cmap.cdb' file in cdbdirname is preferred; otherwise the
        CMap source file in dirname is parsed.  When neither exists an
        empty CMap is returned if strict is false.

        Raises:
            CMapDB.CMapNotFound: nothing was found and strict is true.
        """
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            return klass.cmapdb[cmapname]
        if klass.dirname is None:
            # initialize() was never called: fall back to the default
            # locations instead of failing inside os.path.join(None, ...).
            klass.initialize(cdbdirname=klass.cdbdirname)
        fname = os.path.join(klass.dirname, cmapname)
        cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
        if os.path.exists(cdbname):
            if 1 <= klass.debug:
                print >>stderr, 'Opening: CDBCMap %r...' % cdbname
            cmap = CDBCMap(cdbname)
        elif os.path.exists(fname):
            if 1 <= klass.debug:
                print >>stderr, 'Reading: CMap %r...' % fname
            cmap = CMap()
            fp = open(fname, 'rb')
            try:
                CMapParser(cmap, fp).run()
            finally:
                # Close the file even when parsing raises.
                fp.close()
        elif not strict:
            cmap = CMap() # just create empty cmap
        else:
            raise CMapDB.CMapNotFound(cmapname)
        klass.cmapdb[cmapname] = cmap
        return cmap


##  CMapParser
##
class CMapParser(PSStackParser):
    """PostScript stack parser that fills a CMap from CMap operators.

    Operands accumulate on the parser stack; do_keyword() consumes them
    when the matching 'end...' operator is seen.
    """

    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # True between 'begincmap' and 'endcmap'; keywords seen outside
        # that region are ignored.
        self.in_cmap = False
        return

    def run(self):
        """Call nextobject() once, treating PSEOF as normal termination."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    def do_keyword(self, pos, token):
        """Handle one keyword token.

        Recognized CMap operators update self.cmap; any other keyword seen
        inside the begincmap/endcmap region is pushed onto the stack.
        """
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap: return
        #
        if name == 'def':
            # /Key value def  ->  CMap attribute
            try:
                ((_,k),(_,v)) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            # /Name usecmap  ->  import all entries of another CMap
            try:
                ((_,cmapname),) = self.pop(1)
                self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            return

        if name == 'begincodespacerange':
            self.popall()
            return
        if name == 'endcodespacerange':
            # Code space ranges are discarded.
            self.popall()
            return

        if name == 'begincidrange':
            self.popall()
            return
        if name == 'endcidrange':
            # Triples: <startCode> <endCode> firstCID
            objs = [ obj for (_,obj) in self.popall() ]
            for (s,e,cid) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                    not isinstance(cid, int) or len(s) != len(e)): continue
                # Only the last (up to) four bytes may vary within a range;
                # everything before them must be identical.
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix: continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                #assert s1 <= e1
                for i in xrange(e1-s1+1):
                    x = sprefix+pack('>L',s1+i)[-vlen:]
                    self.cmap.register_code2cid(x, cid+i)
            return

        if name == 'begincidchar':
            self.popall()
            return
        if name == 'endcidchar':
            # Pairs: <code> cid.  The CMap specification gives the CID as an
            # integer, which the previous string-only check never accepted.
            objs = [ obj for (_,obj) in self.popall() ]
            for (first,second) in choplist(2, objs):
                if not isinstance(first, str): continue
                if isinstance(second, int):
                    self.cmap.register_code2cid(first, second)
                elif isinstance(second, str):
                    # Legacy behavior kept for compatibility: with two
                    # strings, the second is the code and the first is the
                    # CID in packed form.
                    self.cmap.register_code2cid(second, nunpack(first))
            return

        if name == 'beginbfrange':
            self.popall()
            return
        if name == 'endbfrange':
            # Triples: <start> <end> <dstString>  or  <start> <end> [dst ...]
            objs = [ obj for (_,obj) in self.popall() ]
            for (s,e,code) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                    len(s) != len(e)): continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                #assert s1 <= e1
                if isinstance(code, list):
                    # Never read past the end of a short destination array.
                    for i in xrange(min(e1-s1+1, len(code))):
                        self.cmap.register_cid2code(s1+i, code[i])
                elif isinstance(code, str):
                    # Increment the last (up to) four bytes of the
                    # destination string across the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
                        self.cmap.register_cid2code(s1+i, x)
                # Any other destination type is malformed and skipped.
            return

        if name == 'beginbfchar':
            self.popall()
            return
        if name == 'endbfchar':
            # Pairs: <srcCode> <dstString>
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
                    self.cmap.register_cid2code(nunpack(cid), code)
            return

        if name == 'beginnotdefrange':
            self.popall()
            return
        if name == 'endnotdefrange':
            # notdef ranges are discarded.
            self.popall()
            return

        self.push((pos, token))
        return


##  FontMetricsDB
##
class FontMetricsDB(object):
    """Lookup facade over the FONT_METRICS table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the FONT_METRICS entry stored under fontname.

        Whatever the lookup raises for an unknown name is propagated.
        """
        metrics = FONT_METRICS[fontname]
        return metrics


##  EncodingDB
##
class EncodingDB(object):
    """Character-code to Unicode tables for the predefined encodings.

    The four tables are built from ENCODING while the class body runs.
    Each ENCODING row is (glyph name, std, mac, win, pdf); a false code
    means the glyph is absent from that encoding.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # The loop variable names are kept unchanged: they remain visible as
    # class attributes once the class body has run.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
      'StandardEncoding': std2unicode,
      'MacRomanEncoding': mac2unicode,
      'WinAnsiEncoding': win2unicode,
      'PDFDocEncoding': pdf2unicode,
      }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Return the code -> unicode table for encoding name.

        Unknown names fall back to the StandardEncoding table.  When diff
        is given, a copy of the table is patched: an int sets the next
        code, and each PSLiteral glyph name is assigned to the current
        code, which then advances by one.  Glyph names that name2unicode()
        rejects with KeyError are skipped but still advance the code.
        Without diff the shared table itself is returned.
        """
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        table = base.copy()
        cid = 0
        for item in diff:
            if isinstance(item, int):
                cid = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[cid] = unichr(name2unicode(item.name))
            except KeyError:
                pass
            cid += 1
        return table


##  CMap -> CMapCDB conversion
##
def dumpcdb(cmap, cdbfile, verbose=1):
    """Write cmap into the cdb file cdbfile.

    Records are added in this order: attributes under '/'+name as
    repr(value), code->cid under 'c'+code, and cid->code under
    'i'+packed cid.  CIDs are packed as big-endian unsigned 32-bit.
    The database is built via cdbfile+'.tmp'.  When verbose is true a
    progress line goes to stderr.
    """
    maker = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    add = maker.add
    for (attr, value) in cmap.getall_attrs():
        add('/'+attr, repr(value))
    for (code, cid) in cmap.getall_code2cid():
        add('c'+code, pack('>L',cid))
    for (cid, code) in cmap.getall_cid2code():
        add('i'+pack('>L',cid), code)
    maker.finish()
    return

def convert_cmap(cmapdir, outputdir, force=False):
    """Convert every CMap file in cmapdir to '<name>.cmap.cdb' in outputdir.

    Directory entries whose name contains a dot are skipped, as are
    entries that are not regular files.  An existing output file is left
    alone unless force is true.  Progress is reported on stderr.
    """
    CMapDB.initialize(cmapdir)
    for cmapname in os.listdir(cmapdir):
        if '.' in cmapname: continue
        # A subdirectory would otherwise reach the parser and fail on open.
        if not os.path.isfile(os.path.join(cmapdir, cmapname)): continue
        cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
        if not force and os.path.exists(cdbname):
            print >>stderr, 'Skipping: %r' % cmapname
            continue
        print >>stderr, 'Reading: %r...' % cmapname
        cmap = CMapDB.get_cmap(cmapname)
        dumpcdb(cmap, cdbname)
    return

def main(argv):
    """Command-line entry point: convert CMap files into cdb files.

    Options:
        -C cmapdir    directory to read CMap files from (overrides the
                      positional argument)
        -D outputdir  directory to write the cdb files to
        -f            overwrite existing cdb files

    The optional positional cmap_dir (default: find_cmap_path()) is the
    initial value of both the input and the output directory.

    Returns 100 on a usage error, 111 when a directory does not exist,
    otherwise the result of convert_cmap().
    """
    import getopt
    def usage():
        # Written with write() so the text is identical on Python 2 and the
        # statement is also valid on Python 3; '-C' is accepted by getopt
        # below and is therefore listed here.
        sys.stdout.write('usage: %s [-C cmapdir] [-D outputdir] [-f] [cmap_dir]\n'
                         % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'C:D:f')
    except getopt.GetoptError:
        return usage()
    if args:
        cmapdir = args.pop(0)
    else:
        cmapdir = find_cmap_path()
    outputdir = cmapdir
    force = False
    for (k, v) in opts:
        if k == '-f': force = True
        elif k == '-C': cmapdir = v
        elif k == '-D': outputdir = v
    # The input directory is checked first, then the output directory.
    for path in (cmapdir, outputdir):
        if not os.path.isdir(path):
            sys.stderr.write('directory does not exist: %r\n' % path)
            return 111
    return convert_cmap(cmapdir, outputdir, force=force)

# Run the converter when executed as a script; main()'s return value is
# passed to sys.exit() as the exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))