include cmap

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@162 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-12-19 14:17:00 +00:00 · 2009-12-19 14:17:00 +00:00 · e4b089e327
parent ed8a5362b9
commit e4b089e327
20 changed files with 92216 additions and 362 deletions
--- a/6
+++ b/6
@ -6,7 +6,8 @@ pdfminer/Makefile
 pdfminer/__init__.py
 pdfminer/arcfour.py
 pdfminer/ascii85.py
-pdfminer/cmap.py
+pdfminer/cmapdb.py
+pdfminer/encodingdb.py
 pdfminer/converter.py
 pdfminer/fontmetrics.py
 pdfminer/glyphlist.py
@ -24,6 +25,8 @@ pdfminer/psparser.py
 pdfminer/pycdb.py
 pdfminer/rijndael.py
 pdfminer/utils.py
+pdfminer/cmap/Makefile
+pdfminer/cmap/__init__.py
 tools/Makefile
 tools/dumppdf.py
 tools/pdf2txt.py
@ -40,3 +43,4 @@ samples/i1040nr.pdf
 samples/kampo.pdf
 samples/naacl06-shinyama.pdf
 samples/nlp2004slides.pdf
+cmaprsrc/README.txt
--- a/12
+++ b/12
@ -36,3 +36,15 @@ register: clean
 WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
 publish:
 	$(CP) docs/*.html $(WEBDIR)
+
+CONV_CMAP=$(PYTHON) tools/conv_cmap.py
+CMAPDIR=pdfminer/cmap
+CMAPRSRC=cmaprsrc
+cmap: cmaprsrc
+	$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+	$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+	$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+	$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+
+cmap_clean:  # delegates to the cmap_clean rule in $(CMAPDIR)/Makefile
+	cd $(CMAPDIR) && $(MAKE) cmap_clean
--- a/2
+++ b/2
@ -4,4 +4,4 @@ TODOs:
  - Better API Documentation.
  - Robust error handling.
  - Any special handling for linearized PDFs?
-  - Handle crypt filter. (I need more samples!)
+  - Handle crypt filter. (More sample documents are needed!)
--- a/cmaprsrc/README.txt
+++ b/cmaprsrc/README.txt
@ -0,0 +1,60 @@
+README.txt for cmaprsrc
+
+This directory contains Adobe CMap resources. CMaps are required 
+to decode text data written in Chinese, Japanese or Korean language.
+CMap resources are now available freely from Adobe web site:
+http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
+
+The following files were extracted from the downloadable tarballs:
+
+cid2code_Adobe_CNS1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
+
+cid2code_Adobe_GB1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
+
+cid2code_Adobe_Japan1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
+
+cid2code_Adobe_Korea1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
+
+
+Here is the license information in the original files:
+
+%%Copyright: -----------------------------------------------------------
+%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
+%%Copyright: All rights reserved.
+%%Copyright:
+%%Copyright: Redistribution and use in source and binary forms, with or
+%%Copyright: without modification, are permitted provided that the
+%%Copyright: following conditions are met:
+%%Copyright:
+%%Copyright: Redistributions of source code must retain the above
+%%Copyright: copyright notice, this list of conditions and the following
+%%Copyright: disclaimer.
+%%Copyright:
+%%Copyright: Redistributions in binary form must reproduce the above
+%%Copyright: copyright notice, this list of conditions and the following
+%%Copyright: disclaimer in the documentation and/or other materials
+%%Copyright: provided with the distribution.
+%%Copyright:
+%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
+%%Copyright: of its contributors may be used to endorse or promote
+%%Copyright: products derived from this software without specific prior
+%%Copyright: written permission.
+%%Copyright:
+%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%%Copyright: -----------------------------------------------------------
--- a/cmaprsrc/cid2code_Adobe_CNS1.txt
+++ b/cmaprsrc/cid2code_Adobe_CNS1.txt
--- a/cmaprsrc/cid2code_Adobe_GB1.txt
+++ b/cmaprsrc/cid2code_Adobe_GB1.txt
--- a/cmaprsrc/cid2code_Adobe_Japan1.txt
+++ b/cmaprsrc/cid2code_Adobe_Japan1.txt
--- a/cmaprsrc/cid2code_Adobe_Korea1.txt
+++ b/cmaprsrc/cid2code_Adobe_Korea1.txt
--- a/pdfminer/Makefile
+++ b/pdfminer/Makefile
@ -6,6 +6,7 @@ all:

 clean:
 	-rm *.pyc *.pyo
+	cd cmap && $(MAKE) clean

 check:
 	$(PYCHECKER) *.py
--- a/pdfminer/cmap/Makefile
+++ b/pdfminer/cmap/Makefile
@ -0,0 +1,10 @@
+# Makefile for pdfminer.cmap
+
+all:
+
+clean:
+	-rm *.pyc *.pyo
+
+cmap_clean:
+	-rm *.py
+	touch __init__.py
--- a/pdfminer/cmap/__init__.py
+++ b/pdfminer/cmap/__init__.py
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -20,240 +20,229 @@ from psparser import PSStackParser
 from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
 from psparser import PSLiteral, PSKeyword
 from psparser import literal_name, keyword_name
-from fontmetrics import FONT_METRICS
-from latin_enc import ENCODING
-from glyphlist import charname2unicode
+from encodingdb import name2unicode
 from utils import choplist, nunpack
-try:
-    import cdb
-except ImportError:
-    import pdfminer.pycdb as cdb


 class CMapError(Exception): pass


-##  find_cmap_path
-##
-def find_cmap_path():
-    """Returns the location of CMap directory."""
-    for path in (os.environ.get('CMAP_PATH', '.'),
-                 os.path.join(os.path.dirname(__file__), 'CMap')):
-        if os.path.isdir(path):
-            return path
-    raise IOError
-
-
-##  name2unicode
-##
-STRIP_NAME = re.compile(r'[0-9]+')
-def name2unicode(name):
-    """Converts Adobe glyph names to Unicode numbers."""
-    if name in charname2unicode:
-        return charname2unicode[name]
-    m = STRIP_NAME.search(name)
-    if not m: raise KeyError(name)
-    return int(m.group(0))
-
-
 ##  CMap
 ##
 class CMap(object):

    debug = 0

+    def __init__(self, code2cid=None):
+        self.code2cid = code2cid or {}
+        return
+
+    def is_vertical(self):
+        return False
+
+    def use_cmap(self, cmap):
+        assert isinstance(cmap, CMap)
+        def copy(dst, src):
+            for (k,v) in src.iteritems():
+                if isinstance(v, dict):
+                    d = {}
+                    dst[k] = d
+                    copy(d, v)
+                else:
+                    dst[k] = v
+        copy(self.code2cid, cmap.code2cid)
+        return
+
+    def decode(self, code):
+        if self.debug:
+            print >>sys.stderr, 'decode: %r, %r' % (self, code)
+        d = self.code2cid
+        for c in code:
+            c = ord(c)
+            if c in d:
+                d = d[c]
+                if isinstance(d, int):
+                    yield d
+                    d = self.code2cid
+            else:
+                d = self.code2cid
+        return
+
+
+##  IdentityCMap
+##
+class IdentityCMap(object):
+
+    def __init__(self, vertical):
+        self.vertical = vertical
+        return
+
+    def is_vertical(self):
+        return self.vertical
+
+    def decode(self, code):
+        return unpack('>%dH' % (len(code)/2), code)
+            
+
+##  UnicodeMap
+##
+class UnicodeMap(object):
+
+    debug = 0
+
+    def __init__(self, cid2unicode=None):
+        self.cid2unicode = cid2unicode or {}
+        return
+
+    def get_unicode(self, cid):
+        if self.debug:
+            print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
+        return self.cid2unicode.get(cid)
+
+
+##  FileCMap
+##
+class FileCMap(CMap):
+
    def __init__(self):
-        self.code2cid = {}
-        self.cid2code = {}
+        CMap.__init__(self)
        self.attrs = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

-    def update(self, code2cid=None, cid2code=None):
-        if code2cid:
-            self.code2cid.update(code2cid)
-        if cid2code:
-            self.cid2code.update(cid2code)
-        return self
-
-    def copycmap(self, cmap):
-        self.code2cid.update(cmap.getall_code2cid())
-        self.cid2code.update(cmap.getall_cid2code())
-        return self
-
-    def register_code2cid(self, code, cid):
-        if isinstance(code, str) and isinstance(cid, int):
-            self.code2cid[code] = cid
-        return self
-
-    def register_cid2code(self, cid, code):
-        if isinstance(cid, int):
-            if isinstance(code, PSLiteral):
-                self.cid2code[cid] = pack('>H', name2unicode(code.name))
-            elif isinstance(code, str):
-                self.cid2code[cid] = code
-        return self
-
-    def decode(self, bytes):
-        if self.debug:
-            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
-        x = ''
-        for c in bytes:
-            if x:
-                if x+c in self.code2cid:
-                    yield self.code2cid[x+c]
-                x = ''
-            elif c in self.code2cid:
-                yield self.code2cid[c]
-            else:
-                x = c
-        return
-
    def is_vertical(self):
        return self.attrs.get('WMode', 0)

-    def tocid(self, code):
-        return self.code2cid.get(code)
-    def tocode(self, cid):
-        return self.cid2code.get(cid)
+    def set_attr(self, k, v):
+        self.attrs[k] = v
+        return

-    def getall_attrs(self):
-        return self.attrs.iteritems()
-    def getall_code2cid(self):
-        return self.code2cid.iteritems()
-    def getall_cid2code(self):
-        return self.cid2code.iteritems()
+    def add_code2cid(self, code, cid):
+        assert isinstance(code, str) and isinstance(cid, int)
+        d = self.code2cid
+        for c in code[:-1]:
+            c = ord(c)
+            if c in d:
+                d = d[c]
+            else:
+                t = {}
+                d[c] = t
+                d =t
+        c = ord(code[-1])
+        d[c] = cid
+        return


-##  CDBCMap
+##  FileUnicodeMap
 ##
-class CDBCMap(CMap):
+class FileUnicodeMap(UnicodeMap):
    
-    def __init__(self, cdbname):
-        CMap.__init__(self)
-        self.cdbname = cdbname
-        self.db = cdb.init(cdbname)
+    def __init__(self):
+        UnicodeMap.__init__(self)
+        self.attrs = {}
        return

    def __repr__(self):
-        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
+        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')

-    def tocid(self, code):
-        k = 'c'+code
-        if not self.db.has_key(k):
-            return None
-        return unpack('>L', self.db[k])
-    def tocode(self, cid):
-        k = 'i'+pack('>L', cid)
-        if not self.db.has_key(k):
-            return None
-        return self.db[k]
+    def set_attr(self, k, v):
+        self.attrs[k] = v
+        return
+
+    def add_cid2unicode(self, cid, code):
+        assert isinstance(cid, int)
+        if isinstance(code, PSLiteral):
+            # Interpret as an Adobe glyph name.
+            self.cid2unicode[cid] = name2unicode(code.name)
+        elif isinstance(code, str):
+            # Interpret as UTF-16BE.
+            self.cid2unicode[cid] = unpack('>H', code)[0]
+        elif isinstance(code, int):
+            self.cid2unicode[cid] = code
+        else:
+            raise TypeError(code)
+        return
+
+
+##  PyCMap
+##
+class PyCMap(CMap):
+
+    def __init__(self, name, module):
+        CMap.__init__(self, module.CODE2CID)
+        self.name = name
+        self._is_vertical = module.IS_VERTICAL
+        return
+
+    def __repr__(self):
+        return '<PyCMap: %s>' % (self.name)

    def is_vertical(self):
-        return (self.db.has_key('/WMode') and
-                self.db['/WMode'] == '1')
+        return self._is_vertical
    
-    def getall(self, c):
-        while 1:
-            x = self.db.each()
-            if not x: break
-            (k,v) = x
-            if k.startswith(c):
-                yield (k[1:], unpack('>L', v)[0])
+
+##  PyUnicodeMap
+##
+class PyUnicodeMap(UnicodeMap):
+    
+    def __init__(self, name, module, vertical):
+        if vertical:
+            cid2unicode = module.CID2UNICODE_V
+        else:
+            cid2unicode = module.CID2UNICODE_H
+        UnicodeMap.__init__(self, cid2unicode)
+        self.name = name
        return

-    def getall_attrs(self):
-        while 1:
-            x = self.db.each()
-            if not x: break
-            (k,v) = x
-            if k.startswith('/'):
-                yield (k[1:], eval(v)[0])
-        return
-
-    def getall_cid2code(self):
-        return self.getall('i')
-    def getall_code2cid(self):
-        return self.getall('c')
-
-    def decode(self, bytes):
-        if self.debug:
-            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
-        x = ''
-        for c in bytes:
-            if x:
-                if x+c in self.code2cid:
-                    yield self.code2cid[x+c]
-                elif self.db.has_key('c'+x+c):
-                    (dest,) = unpack('>L', self.db['c'+x+c])
-                    self.code2cid[x+c] = dest
-                    yield dest
-                x = ''
-            elif c in self.code2cid:
-                yield self.code2cid[c]
-            elif self.db.has_key('c'+c):
-                (dest,) = unpack('>L', self.db['c'+c])
-                self.code2cid[c] = dest
-                yield dest
-            else:
-                x = c
-        return
+    def __repr__(self):
+        return '<PyUnicodeMap: %s>' % (self.name)


 ##  CMapDB
 ##
 class CMapDB(object):

-    class CMapNotFound(CMapError): pass
-
-    CMAP_ALIAS = { }
    debug = 0
    
-    def __init__(self, dirname=None, cdbdirname=None):
-        if not dirname:
-            dirname = find_cmap_path()
-        self.dirname = dirname
-        self.cdbdirname = cdbdirname or dirname
-        self.cmapdb = {}
-        return
+    class CMapNotFound(CMapError): pass

-    def get_cmap(self, cmapname, strict=True):
-        cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
-        if cmapname in self.cmapdb:
-            cmap = self.cmapdb[cmapname]
-        else:
-            fname = os.path.join(self.dirname, cmapname)
-            cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
-            if os.path.exists(cdbname):
-                if 1 <= self.debug:
-                    print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname
-                cmap = CDBCMap(cdbname)
-            elif os.path.exists(fname):
-                if 1 <= self.debug:
-                    print >>sys.stderr, 'Reading: CMap %r...' % fname
-                cmap = CMap()
-                fp = file(fname, 'rb')
-                CMapParser(self, cmap, fp).run()
-                fp.close()
-            elif not strict:
-                cmap = CMap() # just create empty cmap
-            else:
-                raise CMapDB.CMapNotFound(cmapname)
-            self.cmapdb[cmapname] = cmap
-        return cmap
+    @classmethod
+    def get_cmap(klass, name):
+        if name == 'Identity-H':
+            return IdentityCMap(False)
+        elif name == 'Identity-V':
+            return IdentityCMap(True)
+        modname = 'pdfminer.cmap.%s' % name.replace('-','_')
+        if klass.debug:
+            print >>sys.stderr, 'loading:', modname
+        try:
+            module = __import__(modname, fromlist=['pdfminer.cmap'])
+        except ImportError:
+            raise CMapDB.CMapNotFound(name)
+        return PyCMap(name, module)
+
+    @classmethod
+    def get_unicode_map(klass, name, vertical=False):
+        modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
+        if klass.debug:
+            print >>sys.stderr, 'loading:', modname, vertical
+        try:
+            module = __import__(modname, fromlist=['pdfminer.cmap'])
+        except ImportError:
+            raise CMapDB.CMapNotFound(name)
+        return PyUnicodeMap(name, module, vertical)


 ##  CMapParser
 ##
 class CMapParser(PSStackParser):

-    def __init__(self, cmapdb, cmap, fp):
+    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
-        self.cmapdb = cmapdb
        self.cmap = cmap
-        self.in_cmap = False
+        self._in_cmap = False
        return

    def run(self):
@ -266,29 +255,30 @@ class CMapParser(PSStackParser):
    def do_keyword(self, pos, token):
        name = token.name
        if name == 'begincmap':
-            self.in_cmap = True
+            self._in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
-            self.in_cmap = False
+            self._in_cmap = False
            return
-        if not self.in_cmap: return
+        if not self._in_cmap: return
        #
        if name == 'def':
            try:
                ((_,k),(_,v)) = self.pop(2)
-                self.cmap.attrs[literal_name(k)] = v
+                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
-            if self.cmapdb:
-                try:
-                    ((_,cmapname),) = self.pop(1)
-                    self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname)))
-                except PSSyntaxError:
-                    pass
+            try:
+                ((_,cmapname),) = self.pop(1)
+                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
+            except PSSyntaxError:
+                pass
+            except CMapDB.CMapNotFound:
+                pass
            return

        if name == 'begincodespacerange':
@ -317,7 +307,7 @@ class CMapParser(PSStackParser):
                #assert s1 <= e1
                for i in xrange(e1-s1+1):
                    x = sprefix+pack('>L',s1+i)[-vlen:]
-                    self.cmap.register_code2cid(x, cid+i)
+                    self.cmap.add_code2cid(x, cid+i)
            return

        if name == 'begincidchar':
@ -327,7 +317,7 @@ class CMapParser(PSStackParser):
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(code, str) and isinstance(cid, str):
-                    self.cmap.register_code2cid(code, nunpack(cid))
+                    self.cmap.add_code2cid(code, nunpack(cid))
            return

        if name == 'beginbfrange':
@ -343,7 +333,7 @@ class CMapParser(PSStackParser):
                #assert s1 <= e1
                if isinstance(code, list):
                    for i in xrange(e1-s1+1):
-                        self.cmap.register_cid2code(s1+i, code[i])
+                        self.cmap.add_cid2unicode(s1+i, code[i])
                else:
                    var = code[-4:]
                    base = nunpack(var)
@ -351,7 +341,7 @@ class CMapParser(PSStackParser):
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
-                        self.cmap.register_cid2code(s1+i, x)
+                        self.cmap.add_cid2unicode(s1+i, x)
            return

        if name == 'beginbfchar':
@ -361,7 +351,7 @@ class CMapParser(PSStackParser):
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
-                    self.cmap.register_cid2code(nunpack(cid), code)
+                    self.cmap.add_cid2unicode(nunpack(cid), code)
            return

        if name == 'beginnotdefrange':
@ -373,123 +363,3 @@ class CMapParser(PSStackParser):

        self.push((pos, token))
        return
-
-
-##  FontMetricsDB
-##
-class FontMetricsDB(object):
-
-    @classmethod
-    def get_metrics(klass, fontname):
-        return FONT_METRICS[fontname]
-
-
-##  EncodingDB
-##
-class EncodingDB(object):
-
-    std2unicode = {}
-    mac2unicode = {}
-    win2unicode = {}
-    pdf2unicode = {}
-    for (name,std,mac,win,pdf) in ENCODING:
-        c = unichr(name2unicode(name))
-        if std: std2unicode[std] = c
-        if mac: mac2unicode[mac] = c
-        if win: win2unicode[win] = c
-        if pdf: pdf2unicode[pdf] = c
-
-    encodings = {
-      'StandardEncoding': std2unicode,
-      'MacRomanEncoding': mac2unicode,
-      'WinAnsiEncoding': win2unicode,
-      'PDFDocEncoding': pdf2unicode,
-      }
-
-    @classmethod
-    def get_encoding(klass, name, diff=None):
-        cid2unicode = klass.encodings.get(name, klass.std2unicode)
-        if diff:
-            cid2unicode = cid2unicode.copy()
-            cid = 0
-            for x in diff:
-                if isinstance(x, int):
-                    cid = x
-                elif isinstance(x, PSLiteral):
-                    try:
-                        cid2unicode[cid] = unichr(name2unicode(x.name))
-                    except KeyError:
-                        pass
-                    cid += 1
-        return cid2unicode
-
-
-##  CMap -> CMapCDB conversion
-##
-def dump_cdb(cmap, cdbfile, verbose=1):
-    """Writes a CMap object into a cdb file."""
-    m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
-    if verbose:
-        print >>sys.stderr, 'Writing: %r...' % cdbfile
-    for (k,v) in cmap.getall_attrs():
-        m.add('/'+k, repr(v))
-    for (code,cid) in cmap.getall_code2cid():
-        m.add('c'+code, pack('>L',cid))
-    for (cid,code) in cmap.getall_cid2code():
-        m.add('i'+pack('>L',cid), code)
-    m.finish()
-    return
-
-def convert_cmap(cmapdir, outputdir, force=False):
-    """Convert all CMap source files in a directory into cdb files."""
-    cmapdb = CMapDB(cmapdir)
-    for fname in os.listdir(cmapdir):
-        if '.' in fname: continue
-        cmapname = os.path.basename(fname)
-        cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
-        if not force and os.path.exists(cdbname):
-            print >>sys.stderr, 'Skipping: %r' % cmapname
-            continue
-        print >>sys.stderr, 'Reading: %r...' % cmapname
-        cmap = cmapdb.get_cmap(cmapname)
-        dump_cdb(cmap, cdbname)
-    return
-
-def main(argv):
-    """Converts CMap files into cdb files.
-
-    usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
-    """
-    
-    import getopt
-    def usage():
-        print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
-        return 100
-    try:
-        (opts, args) = getopt.getopt(argv[1:], 'f')
-    except getopt.GetoptError:
-        return usage()
-    if args:
-        cmapdir = args.pop(0)
-    else:
-        try:
-            cmapdir = find_cmap_path()
-        except IOError:
-            print >>sys.stderr, 'cannot find CMap directory'
-            return 1
-    if args:
-        outputdir = args.pop(0)
-    else:
-        outputdir = cmapdir
-    force = False
-    for (k, v) in opts:
-        if k == '-f': force = True
-    if not os.path.isdir(cmapdir):
-        print >>sys.stderr, 'directory does not exist: %r' % cmapdir
-        return 1
-    if not os.path.isdir(outputdir):
-        print >>sys.stderr, 'directory does not exist: %r' % outputdir
-        return 1
-    return convert_cmap(cmapdir, outputdir, force=force)
-
-if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -29,7 +29,7 @@ class TagExtractor(PDFDevice):
            chars = font.decode(obj)
            for cid in chars:
                try:
-                    char = font.to_unicode(cid)
+                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
                    pass
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+import re
+from psparser import PSLiteral
+from glyphlist import charname2unicode
+from latin_enc import ENCODING
+
+
+##  name2unicode
+##
+STRIP_NAME = re.compile(r'[0-9]+')
+def name2unicode(name):
+    """Converts Adobe glyph names to Unicode numbers."""
+    if name in charname2unicode:
+        return charname2unicode[name]
+    m = STRIP_NAME.search(name)
+    if not m: raise KeyError(name)
+    return int(m.group(0))
+
+
+##  EncodingDB
+##
+class EncodingDB(object):
+
+    std2unicode = {}
+    mac2unicode = {}
+    win2unicode = {}
+    pdf2unicode = {}
+    for (name,std,mac,win,pdf) in ENCODING:
+        c = unichr(name2unicode(name))
+        if std: std2unicode[std] = c
+        if mac: mac2unicode[mac] = c
+        if win: win2unicode[win] = c
+        if pdf: pdf2unicode[pdf] = c
+
+    encodings = {
+      'StandardEncoding': std2unicode,
+      'MacRomanEncoding': mac2unicode,
+      'WinAnsiEncoding': win2unicode,
+      'PDFDocEncoding': pdf2unicode,
+      }
+
+    @classmethod
+    def get_encoding(klass, name, diff=None):
+        cid2unicode = klass.encodings.get(name, klass.std2unicode)
+        if diff:
+            cid2unicode = cid2unicode.copy()
+            cid = 0
+            for x in diff:
+                if isinstance(x, int):
+                    cid = x
+                elif isinstance(x, PSLiteral):
+                    try:
+                        cid2unicode[cid] = unichr(name2unicode(x.name))
+                    except KeyError:
+                        pass
+                    cid += 1
+        return cid2unicode
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -89,7 +89,7 @@ class PDFTextDevice(PDFDevice):
            else:
                for cid in font.decode(obj):
                    try:
-                        char = font.to_unicode(cid)
+                        char = font.to_unichr(cid)
                    except PDFUnicodeNotDefined, e:
                        (cidcoding, cid) = e.args
                        char = self.handle_undefined_char(cidcoding, cid)
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -4,17 +4,27 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
-from cmapdb import CMap, CMapDB, CMapParser
-from cmapdb import FontMetricsDB, EncodingDB
+from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
+from encodingdb import EncodingDB
 from struct import pack, unpack
 from psparser import LIT, STRICT
 from psparser import PSLiteral, literal_name
 from pdftypes import PDFException, resolve1
 from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
+from fontmetrics import FONT_METRICS
 from utils import apply_matrix_norm, nunpack


+##  FontMetricsDB
+##
+class FontMetricsDB(object):
+
+    @classmethod
+    def get_metrics(klass, fontname):
+        return FONT_METRICS[fontname]
+
+
 ##  CFFFont
 ##  (Format specified in Adobe Technical Note: #5176
 ##   "The Compact Font Format Specification")
@ -249,7 +259,7 @@ class TrueTypeFont(object):
            self.tables[name] = (offset, length)
        return

-    def create_cmap(self):
+    def create_unicode_map(self):
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
@ -302,9 +312,11 @@ class TrueTypeFont(object):
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
-        gid2char = dict( (gid, pack('>H', char))
-                         for (char,gid) in char2gid.iteritems() )
-        return CMap().update(char2gid, gid2char)
+        # Build the glyph-id -> Unicode map (char is an integer code point here).
+        unicode_map = FileUnicodeMap()
+        for (char,gid) in char2gid.iteritems():
+            unicode_map.add_cid2unicode(gid, char)
+        return unicode_map


 ##  Fonts
@ -383,20 +395,19 @@ class PDFSimpleFont(PDFFont):
            self.encoding = EncodingDB.get_encoding(name, diff)
        else:
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
-        self.ucs2_cmap = None
+        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
-            self.ucs2_cmap = CMap()
-            CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
+            self.unicode_map = FileUnicodeMap()
+            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)
        return

-    def to_unicode(self, cid):
-        if self.ucs2_cmap:
-            code = self.ucs2_cmap.tocode(cid)
-            if code:
-                chars = unpack('>%dH' % (len(code)/2), code)
-                return ''.join( unichr(c) for c in chars )
+    def to_unichr(self, cid):
+        if self.unicode_map:
+            code = self.unicode_map.get_unicode(cid)
+            if code is not None:
+                return unichr(code)
        try:
            return self.encoding[cid]
        except KeyError:
@ -476,9 +487,11 @@ class PDFCIDFont(PDFFont):
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
-            self.cmap = rsrc.get_cmap(name, strict=STRICT)
+            self.cmap = CMapDB.get_cmap(name)
        except CMapDB.CMapNotFound, e:
-            raise PDFFontError(e)
+            if STRICT:
+                raise PDFFontError(e)
+            self.cmap = CMap()
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
@ -490,21 +503,20 @@ class PDFCIDFont(PDFFont):
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
-        self.ucs2_cmap = None
+        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
-            self.ucs2_cmap = CMap()
-            CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
+            self.unicode_map = FileUnicodeMap()
+            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
-                    self.ucs2_cmap = ttf.create_cmap()
+                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
-                self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
-                                               strict=STRICT)
+                self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
            except CMapDB.CMapNotFound, e:
                raise PDFFontError(e)

@ -558,14 +570,13 @@ class PDFCIDFont(PDFFont):
    def char_disp(self, cid):
        return self.disps.get(cid, self.default_disp)

-    def to_unicode(self, cid):
-        if not self.ucs2_cmap:
+    def to_unichr(self, cid):
+        if not self.unicode_map:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
-        code = self.ucs2_cmap.tocode(cid)
-        if not code:
-            raise PDFUnicodeNotDefined(self.cidcoding, cid)
-        chars = unpack('>%dH' % (len(code)/2), code)
-        return ''.join( unichr(c) for c in chars )
+        code = self.unicode_map.get_unicode(cid)
+        if code is not None:
+            return unichr(code)
+        raise PDFUnicodeNotDefined(self.cidcoding, cid)


 # main
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -6,7 +6,7 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
-from cmapdb import CMapDB
+from cmapdb import CMapDB, CMap
 from psparser import PSException, PSTypeError, PSEOF
 from psparser import PSKeyword, literal_name, keyword_name
 from psparser import PSStackParser
@ -106,9 +106,8 @@ class PDFResourceManager(object):
    '''
    debug = 0

-    def __init__(self, cmapdb):
+    def __init__(self):  # no cmapdb argument any more: get_cmap() calls CMapDB directly
        self.fonts = {}
-        self.cmapdb = cmapdb
        return

    def get_procset(self, procs):
@ -123,7 +122,11 @@ class PDFResourceManager(object):
        return

    def get_cmap(self, cmapname, strict=False):
-        return self.cmapdb.get_cmap(cmapname, strict=strict)
+        try:
+            return CMapDB.get_cmap(cmapname)
+        except CMapDB.CMapNotFound:
+            if strict: raise  # strict mode: let the lookup failure reach the caller
+            return CMap()  # non-strict: default-constructed CMap (imported from cmapdb), not CMapDB.CMap

    def get_font(self, objid, spec):
        if objid and objid in self.fonts:
--- a/setup.py
+++ b/setup.py
@ -19,7 +19,8 @@ PDF parser that can be used for other purposes instead of text analysis.''',
    author_email='yusuke at cs dot nyu dot edu',
    url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
    packages=[
-    'pdfminer'
+    'pdfminer',
+    'pdfminer.cmap'
    ],
    scripts=[
    'tools/pdf2txt.py',
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@ -0,0 +1,155 @@
+#!/usr/bin/env python
+import sys
+import os.path
+
+def process_cid2code(fp, check_codecs=[]):
+    """Read a tab-separated cid2code table from fp; build code->CID nested dicts and CID->Unicode maps."""
+    def get_canonicals(name):  # column name -> (horizontal map name, vertical map name or None)
+        if name.endswith('-H'):
+            return (name, None)
+        elif name == 'H':
+            return ('H', 'V')
+        else:
+            return (name+'-H', name+'-V')
+
+    def get_unicode(codes):
+        # Return the code point of the UTF-8 candidate that the most check_codecs can encode.
+        d = {}
+        for code in codes:
+            char = unicode(code, 'utf-8')
+            if char not in d:
+                d[char] = 0
+            for codec in check_codecs:
+                try:
+                    char.encode(codec, 'strict')
+                    d[char] += 1
+                except UnicodeError:
+                    pass
+        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
+        return ord(chars[0])
+
+    def put(dmap, code, cid, force=False):  # store cid under code in a nested dict, one level per byte
+        for b in code[:-1]:
+            b = ord(b)
+            if b in dmap:
+                dmap = dmap[b]
+            else:
+                d = {}
+                dmap[b] = d
+                dmap = d
+        b = ord(code[-1])
+        if force or ((b not in dmap) or dmap[b] == cid):  # an existing entry wins unless force is set
+            dmap[b] = cid
+        return
+
+    names = []
+    code2cid = {} # {'cmapname': ...}
+    is_vertical = {}
+    cid2unicode_h = {} # {cid: unicode}
+    cid2unicode_v = {} # {cid: unicode}
+
+    for line in fp:
+        line = line.strip()
+        if not line or line.startswith('#'): continue  # split('\t') never returns [], so test the line itself
+        if line.startswith('CID'):
+            names = line.split('\t')[1:]
+            continue
+        f = line.split('\t')
+        # f[0] is the CID; f[1:] pairs up with the column names taken from the 'CID' header line.
+        cid = int(f[0])
+        for (x,name) in zip(f[1:], names):
+            if x == '*': continue
+            (hmapname, vmapname) = get_canonicals(name)
+            if hmapname in code2cid:
+                hmap = code2cid[hmapname]
+            else:
+                hmap = {}
+                code2cid[hmapname] = hmap
+            vmap = None
+            if vmapname:
+                is_vertical[vmapname] = True
+                if vmapname in code2cid:
+                    vmap = code2cid[vmapname]
+                else:
+                    vmap = {}
+                    code2cid[vmapname] = vmap
+            hcodes = []
+            vcodes = []
+            for code in x.split(','):
+                vertical = code.endswith('v')
+                if vertical:
+                    code = code[:-1]
+                try:
+                    code = code.decode('hex')
+                except TypeError:  # Python 2 hex codec rejects e.g. odd-length input; parse it as a number
+                    code = chr(int(code, 16))
+                if vertical:
+                    vcodes.append(code)
+                else:
+                    hcodes.append(code)
+            if vcodes:
+                assert vmap is not None
+                for code in vcodes:
+                    put(vmap, code, cid, True)
+                for code in hcodes:
+                    put(hmap, code, cid, True)
+                if name.endswith('-UTF8'):
+                    if hcodes:
+                        cid2unicode_h[cid] = get_unicode(hcodes)
+                    if vcodes:
+                        cid2unicode_v[cid] = get_unicode(vcodes)
+            else:
+                for code in hcodes:
+                    put(hmap, code, cid)
+                    if vmap is not None: put(vmap, code, cid)  # columns named '*-H' have no vertical map
+                if name.endswith('-UTF8') and hcodes:
+                    code = get_unicode(hcodes)
+                    if cid not in cid2unicode_h:
+                        cid2unicode_h[cid] = code
+                    if cid not in cid2unicode_v:
+                        cid2unicode_v[cid] = code
+
+    return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
+
+# main
+def main(argv):
+    """Write one module per CMap plus a TO_UNICODE_<regname> module into output_dir; returns an exit code."""
+    def usage():
+        print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
+        return 100
+    
+    def pyname(name):  # module file name: '-' replaced by '_', '.py' appended
+        return name.replace('-','_')+'.py'
+
+    args = argv[1:]
+    if len(args) < 3: return usage()
+    (outdir, regname, src) = args[:3]
+    check_codecs = args[3:]  # any remaining arguments are codec names handed to process_cid2code
+
+    print >>sys.stderr, 'reading %r...' % src
+    fp = file(src)
+    (code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
+    fp.close()
+
+    for (name, cmap) in code2cid.iteritems():  # one generated file per entry of code2cid
+        fname = pyname(name)
+        print >>sys.stderr, 'writing %r...' % fname
+        fp = file(os.path.join(outdir, fname), 'w')
+        print >>fp, '#!/usr/bin/env python'
+        print >>fp, '#', fname
+        print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)  # False for names never marked vertical
+        print >>fp, 'CODE2CID = %r' % cmap  # the nested dict is written out as its repr()
+        fp.close()
+
+    fname = 'TO_UNICODE_'+pyname(regname)
+    print >>sys.stderr, 'writing %r...' % fname
+    fp = file(os.path.join(outdir, fname), 'w')
+    print >>fp, '#!/usr/bin/env python'
+    print >>fp, '#', fname
+    print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
+    print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
+    fp.close()
+
+    return 0
+
+if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
-from pdfminer.cmapdb import CMapDB, find_cmap_path
+from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams

 # main
@ -22,8 +22,6 @@ def main(argv):
    if not args: return usage()
    # debug option
    debug = 0
-    # path option
-    cmapdir = find_cmap_path()
    # input option
    password = ''
    pagenos = set()
@ -38,7 +36,6 @@ def main(argv):
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
-        elif k == '-C': cmapdir = v
        elif k == '-P': password = v
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
@ -59,8 +56,7 @@ def main(argv):
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
-    cmapdb = CMapDB(cmapdir)
-    rsrc = PDFResourceManager(cmapdb)
+    rsrc = PDFResourceManager()
    if not outtype:
        outtype = 'text'
        if outfile: