include cmap

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@162 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-12-19 14:17:00 +00:00 · 2009-12-19 14:17:00 +00:00 · e4b089e327
parent ed8a5362b9
commit e4b089e327
20 changed files with 92216 additions and 362 deletions
--- a/6
+++ b/6
@ -6,7 +6,8 @@ pdfminer/Makefile
 pdfminer/__init__.py
 pdfminer/arcfour.py
 pdfminer/ascii85.py
-pdfminer/cmap.py
+pdfminer/cmapdb.py
 pdfminer/encodingdb.py
 pdfminer/converter.py
 pdfminer/fontmetrics.py
 pdfminer/glyphlist.py
@ -24,6 +25,8 @@ pdfminer/psparser.py
 pdfminer/pycdb.py
 pdfminer/rijndael.py
 pdfminer/utils.py
 pdfminer/cmap/Makefile
 pdfminer/cmap/__init__.py
 tools/Makefile
 tools/dumppdf.py
 tools/pdf2txt.py
@ -40,3 +43,4 @@ samples/i1040nr.pdf
 samples/kampo.pdf
 samples/naacl06-shinyama.pdf
 samples/nlp2004slides.pdf
 cmaprsrc/README.txt
--- a/12
+++ b/12
@ -36,3 +36,15 @@ register: clean
 WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
 publish:
 	$(CP) docs/*.html $(WEBDIR)
 CONV_CMAP=$(PYTHON) tools/conv_cmap.py
 CMAPDIR=pdfminer/cmap
 CMAPRSRC=cmaprsrc
 # Run tools/conv_cmap.py once per Adobe character collection, passing it
 # $(CMAPDIR), the collection name and the matching cid2code table from
 # $(CMAPRSRC).  The trailing words on each line are handed to the script
 # unchanged (presumably codec names -- confirm against tools/conv_cmap.py).
 cmap: cmaprsrc
 	$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
 	$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
 	$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
 	$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
 # Delegate to the cmap_clean target of the Makefile in $(CMAPDIR).
 # $(MAKE) is used instead of a literal `make` so that command-line flags
 # (-n, -j, -k, ...) and the jobserver are passed on to the sub-make.
 cmap_clean:
 	cd $(CMAPDIR) && $(MAKE) cmap_clean
--- a/2
+++ b/2
@ -4,4 +4,4 @@ TODOs:
  - Better API Documentation.
  - Robust error handling.
  - Any special handling for linearized PDFs?
-  - Handle crypt filter. (I need more samples!)
+  - Handle crypt filter. (More sample documents are needed!)
--- a/cmaprsrc/README.txt
+++ b/cmaprsrc/README.txt
@ -0,0 +1,60 @@
 README.txt for cmaprsrc
 This directory contains Adobe CMap resources. CMaps are required
 to decode text data written in the Chinese, Japanese or Korean languages.
 CMap resources are now freely available from the Adobe web site:
 http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
 The following files were extracted from the downloadable tarballs:
 cid2code_Adobe_CNS1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
 cid2code_Adobe_GB1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
 cid2code_Adobe_Japan1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
 cid2code_Adobe_Korea1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
 Here is the license information in the original files:
 %%Copyright: -----------------------------------------------------------
 %%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
 %%Copyright: All rights reserved.
 %%Copyright:
 %%Copyright: Redistribution and use in source and binary forms, with or
 %%Copyright: without modification, are permitted provided that the
 %%Copyright: following conditions are met:
 %%Copyright:
 %%Copyright: Redistributions of source code must retain the above
 %%Copyright: copyright notice, this list of conditions and the following
 %%Copyright: disclaimer.
 %%Copyright:
 %%Copyright: Redistributions in binary form must reproduce the above
 %%Copyright: copyright notice, this list of conditions and the following
 %%Copyright: disclaimer in the documentation and/or other materials
 %%Copyright: provided with the distribution.
 %%Copyright:
 %%Copyright: Neither the name of Adobe Systems Incorporated nor the names
 %%Copyright: of its contributors may be used to endorse or promote
 %%Copyright: products derived from this software without specific prior
 %%Copyright: written permission.
 %%Copyright:
 %%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 %%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 %%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 %%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 %%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
 %%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 %%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 %%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 %%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 %%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 %%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 %%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 %%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 %%Copyright: -----------------------------------------------------------
--- a/cmaprsrc/cid2code_Adobe_CNS1.txt
+++ b/cmaprsrc/cid2code_Adobe_CNS1.txt
--- a/cmaprsrc/cid2code_Adobe_GB1.txt
+++ b/cmaprsrc/cid2code_Adobe_GB1.txt
--- a/cmaprsrc/cid2code_Adobe_Japan1.txt
+++ b/cmaprsrc/cid2code_Adobe_Japan1.txt
--- a/cmaprsrc/cid2code_Adobe_Korea1.txt
+++ b/cmaprsrc/cid2code_Adobe_Korea1.txt
--- a/pdfminer/Makefile
+++ b/pdfminer/Makefile
@ -6,6 +6,7 @@ all:
 # Remove compiled Python files here, then clean the cmap subdirectory.
 # The leading `-` lets the rule continue when no *.pyc/*.pyo files exist.
 # $(MAKE) rather than a literal `make` keeps flags such as -n and -j
 # flowing into the sub-make.
 clean:
 	-rm *.pyc *.pyo
 	cd cmap && $(MAKE) clean
 check:
 	$(PYCHECKER) *.py
--- a/pdfminer/cmap/Makefile
+++ b/pdfminer/cmap/Makefile
@ -0,0 +1,10 @@
 # Makefile for pdfminer.cmap
 # None of these targets produces a file of the same name; declaring them
 # phony keeps a stray file called `all`, `clean` or `cmap_clean` from
 # silently disabling the rule.
 .PHONY: all clean cmap_clean
 all:
 # Remove compiled Python files; `-` ignores the error when none exist.
 clean:
 	-rm *.pyc *.pyo
 # Remove every *.py file in this directory, then recreate __init__.py
 # (which the wildcard also deleted) as an empty file.
 cmap_clean:
 	-rm *.py
 	touch __init__.py
--- a/pdfminer/cmap/init.py
+++ b/pdfminer/cmap/init.py
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -20,240 +20,229 @@ from psparser import PSStackParser
 from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
 from psparser import PSLiteral, PSKeyword
 from psparser import literal_name, keyword_name
-from fontmetrics import FONT_METRICS
+from encodingdb import name2unicode
 from latin_enc import ENCODING
 from glyphlist import charname2unicode
 from utils import choplist, nunpack
 try:
    import cdb
 except ImportError:
    import pdfminer.pycdb as cdb
 class CMapError(Exception): pass
 ##  find_cmap_path
 ##
 def find_cmap_path():
    """Returns the location of CMap directory.

    Two candidates are tried in order: the value of the CMAP_PATH
    environment variable ('.' when it is unset), then a 'CMap' directory
    located next to this module.  The first candidate that is an existing
    directory is returned; IOError is raised when neither one is.
    """
    for path in (os.environ.get('CMAP_PATH', '.'),
                 os.path.join(os.path.dirname(__file__), 'CMap')):
        if os.path.isdir(path):
            return path
    raise IOError
 ##  name2unicode
 ##
 STRIP_NAME = re.compile(r'[0-9]+')
 def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    Names present in charname2unicode are looked up directly.  Any other
    name falls back to the first run of decimal digits found in it
    (STRIP_NAME), returned as an int.  KeyError(name) is raised when the
    name is unknown and contains no digits.
    """
    if name in charname2unicode:
        return charname2unicode[name]
    m = STRIP_NAME.search(name)
    if not m: raise KeyError(name)
    return int(m.group(0))
 ##  CMap
 ##
 class CMap(object):
    """Maps character-code strings to CIDs.

    code2cid is a nested dict walked one character at a time: each level
    is keyed by ord() of a character, and a value is either another dict
    (the code continues) or an int CID (the code is complete).
    """
    # When true, decode() prints a trace line to stderr.
    debug = 0
    def __init__(self, code2cid=None):
        """Uses the given code2cid mapping, or a new empty dict if it is falsy."""
        self.code2cid = code2cid or {}
        return
    def is_vertical(self):
        """Always returns False for the base class."""
        return False
    def use_cmap(self, cmap):
        """Copies all entries of cmap.code2cid into this map.

        Nested dicts are rebuilt rather than shared with cmap.  A key that
        already exists here is overwritten (a nested dict is replaced by a
        fresh copy, not merged with the existing one).
        """
        assert isinstance(cmap, CMap)
        def copy(dst, src):
            # Recursive copy: dict values get a new dict, anything else is
            # assigned as-is.
            for (k,v) in src.iteritems():
                if isinstance(v, dict):
                    d = {}
                    dst[k] = d
                    copy(d, v)
                else:
                    dst[k] = v
        copy(self.code2cid, cmap.code2cid)
        return
    def decode(self, code):
        """Generator yielding the CID for each complete code found in code.

        The walk starts at the root of code2cid and descends by ord() of
        each character.  Reaching an int yields it and restarts at the root.
        A character with no entry at the current level also restarts the
        walk at the root; that character itself is not re-examined.
        """
        if self.debug:
            print >>sys.stderr, 'decode: %r, %r' % (self, code)
        d = self.code2cid
        for c in code:
            c = ord(c)
            if c in d:
                d = d[c]
                if isinstance(d, int):
                    yield d
                    d = self.code2cid
            else:
                d = self.code2cid
        return
 ##  IdentityCMap
 ##
 class IdentityCMap(object):
    """CMap whose decode() reads the code string as 16-bit values directly.

    CMapDB.get_cmap() returns IdentityCMap(False) for 'Identity-H' and
    IdentityCMap(True) for 'Identity-V'.
    """
    def __init__(self, vertical):
        """Stores the vertical flag reported by is_vertical()."""
        self.vertical = vertical
        return
    def is_vertical(self):
        """Returns the flag given to the constructor."""
        return self.vertical
    def decode(self, code):
        """Unpacks code as big-endian unsigned 16-bit integers.

        Returns the tuple produced by unpack(), one value per two
        characters of code.
        NOTE(review): the format is built from len(code)/2, so a code of
        odd length will not match it -- confirm callers never pass one.
        """
        return unpack('>%dH' % (len(code)/2), code)
 ##  UnicodeMap
 ##
 class UnicodeMap(object):
    """Lookup table from CID to Unicode value, held in cid2unicode."""
    # When true, get_unicode() prints a trace line to stderr.
    debug = 0
    def __init__(self, cid2unicode=None):
        """Uses the given cid2unicode mapping, or a new empty dict if it is falsy."""
        self.cid2unicode = cid2unicode or {}
        return
    def get_unicode(self, cid):
        """Returns the entry stored for cid, or None when there is none."""
        if self.debug:
            print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
        return self.cid2unicode.get(cid)
 ##  FileCMap
 ##
 class FileCMap(CMap):
    def __init__(self):
-        self.code2cid = {}
+        CMap.__init__(self)
        self.cid2code = {}
        self.attrs = {}
        return
    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')
    def update(self, code2cid=None, cid2code=None):
        if code2cid:
            self.code2cid.update(code2cid)
        if cid2code:
            self.cid2code.update(cid2code)
        return self
    def copycmap(self, cmap):
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self
    def register_code2cid(self, code, cid):
        if isinstance(code, str) and isinstance(cid, int):
            self.code2cid[code] = cid
        return self
    def register_cid2code(self, cid, code):
        if isinstance(cid, int):
            if isinstance(code, PSLiteral):
                self.cid2code[cid] = pack('>H', name2unicode(code.name))
            elif isinstance(code, str):
                self.cid2code[cid] = code
        return self
    def decode(self, bytes):
        if self.debug:
            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            else:
                x = c
        return
    def is_vertical(self):
        return self.attrs.get('WMode', 0)
-    def tocid(self, code):
+    def set_attr(self, k, v):
-        return self.code2cid.get(code)
+        self.attrs[k] = v
-    def tocode(self, cid):
+        return
        return self.cid2code.get(cid)
-    def getall_attrs(self):
+    def add_code2cid(self, code, cid):
-        return self.attrs.iteritems()
+        assert isinstance(code, str) and isinstance(cid, int)
-    def getall_code2cid(self):
+        d = self.code2cid
-        return self.code2cid.iteritems()
+        for c in code[:-1]:
-    def getall_cid2code(self):
+            c = ord(c)
-        return self.cid2code.iteritems()
+            if c in d:
                d = d[c]
            else:
                t = {}
                d[c] = t
                d =t
        c = ord(code[-1])
        d[c] = cid
        return
-##  CDBCMap
+##  FileUnicodeMap
 ##
-class CDBCMap(CMap):
+class FileUnicodeMap(UnicodeMap):
-    def __init__(self, cdbname):
+    def __init__(self):
-        CMap.__init__(self)
+        UnicodeMap.__init__(self)
-        self.cdbname = cdbname
+        self.attrs = {}
        self.db = cdb.init(cdbname)
        return
    def __repr__(self):
-        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
+        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
-    def tocid(self, code):
+    def set_attr(self, k, v):
-        k = 'c'+code
+        self.attrs[k] = v
-        if not self.db.has_key(k):
+        return
-            return None
+
-        return unpack('>L', self.db[k])
+    def add_cid2unicode(self, cid, code):
-    def tocode(self, cid):
+        assert isinstance(cid, int)
-        k = 'i'+pack('>L', cid)
+        if isinstance(code, PSLiteral):
-        if not self.db.has_key(k):
+            # Interpret as an Adobe glyph name.
-            return None
+            self.cid2unicode[cid] = name2unicode(code.name)
-        return self.db[k]
+        elif isinstance(code, str):
            # Interpret as UTF-16BE.
            self.cid2unicode[cid] = unpack('>H', code)[0]
        elif isinstance(code, int):
            self.cid2unicode[cid] = code
        else:
            raise TypeError(code)
        return
 ##  PyCMap
 ##
 class PyCMap(CMap):
    def __init__(self, name, module):
        CMap.__init__(self, module.CODE2CID)
        self.name = name
        self._is_vertical = module.IS_VERTICAL
        return
    def __repr__(self):
        return '<PyCMap: %s>' % (self.name)
    def is_vertical(self):
-        return (self.db.has_key('/WMode') and
+        return self._is_vertical
                self.db['/WMode'] == '1')
-    def getall(self, c):
+
-        while 1:
+##  PyUnicodeMap
-            x = self.db.each()
+##
-            if not x: break
+class PyUnicodeMap(UnicodeMap):
-            (k,v) = x
+    
-            if k.startswith(c):
+    def __init__(self, name, module, vertical):
-                yield (k[1:], unpack('>L', v)[0])
+        if vertical:
            cid2unicode = module.CID2UNICODE_V
        else:
            cid2unicode = module.CID2UNICODE_H
        UnicodeMap.__init__(self, cid2unicode)
        self.name = name
        return
-    def getall_attrs(self):
+    def __repr__(self):
-        while 1:
+        return '<PyUnicodeMap: %s>' % (self.name)
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('/'):
                yield (k[1:], eval(v)[0])
        return
    def getall_cid2code(self):
        return self.getall('i')
    def getall_code2cid(self):
        return self.getall('c')
    def decode(self, bytes):
        if self.debug:
            print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
 ##  CMapDB
 ##
 class CMapDB(object):
    class CMapNotFound(CMapError): pass
    CMAP_ALIAS = { }
    debug = 0
-    def __init__(self, dirname=None, cdbdirname=None):
+    class CMapNotFound(CMapError): pass
        if not dirname:
            dirname = find_cmap_path()
        self.dirname = dirname
        self.cdbdirname = cdbdirname or dirname
        self.cmapdb = {}
        return
-    def get_cmap(self, cmapname, strict=True):
+    @classmethod
-        cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
+    def get_cmap(klass, name):
-        if cmapname in self.cmapdb:
+        if name == 'Identity-H':
-            cmap = self.cmapdb[cmapname]
+            return IdentityCMap(False)
-        else:
+        elif name == 'Identity-V':
-            fname = os.path.join(self.dirname, cmapname)
+            return IdentityCMap(True)
-            cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
+        modname = 'pdfminer.cmap.%s' % name.replace('-','_')
-            if os.path.exists(cdbname):
+        if klass.debug:
-                if 1 <= self.debug:
+            print >>sys.stderr, 'loading:', modname
-                    print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname
+        try:
-                cmap = CDBCMap(cdbname)
+            module = __import__(modname, fromlist=['pdfminer.cmap'])
-            elif os.path.exists(fname):
+        except ImportError:
-                if 1 <= self.debug:
+            raise CMapDB.CMapNotFound(name)
-                    print >>sys.stderr, 'Reading: CMap %r...' % fname
+        return PyCMap(name, module)
-                cmap = CMap()
+
-                fp = file(fname, 'rb')
+    @classmethod
-                CMapParser(self, cmap, fp).run()
+    def get_unicode_map(klass, name, vertical=False):
-                fp.close()
+        modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
-            elif not strict:
+        if klass.debug:
-                cmap = CMap() # just create empty cmap
+            print >>sys.stderr, 'loading:', modname, vertical
-            else:
+        try:
-                raise CMapDB.CMapNotFound(cmapname)
+            module = __import__(modname, fromlist=['pdfminer.cmap'])
-            self.cmapdb[cmapname] = cmap
+        except ImportError:
-        return cmap
+            raise CMapDB.CMapNotFound(name)
        return PyUnicodeMap(name, module, vertical)
 ##  CMapParser
 ##
 class CMapParser(PSStackParser):
-    def __init__(self, cmapdb, cmap, fp):
+    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
        self.cmapdb = cmapdb
        self.cmap = cmap
-        self.in_cmap = False
+        self._in_cmap = False
        return
    def run(self):
@ -266,29 +255,30 @@ class CMapParser(PSStackParser):
    def do_keyword(self, pos, token):
        name = token.name
        if name == 'begincmap':
-            self.in_cmap = True
+            self._in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
-            self.in_cmap = False
+            self._in_cmap = False
            return
-        if not self.in_cmap: return
+        if not self._in_cmap: return
        #
        if name == 'def':
            try:
                ((_,k),(_,v)) = self.pop(2)
-                self.cmap.attrs[literal_name(k)] = v
+                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return
        if name == 'usecmap':
-            if self.cmapdb:
+            try:
-                try:
+                ((_,cmapname),) = self.pop(1)
-                    ((_,cmapname),) = self.pop(1)
+                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
-                    self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname)))
+            except PSSyntaxError:
-                except PSSyntaxError:
+                pass
-                    pass
+            except CMapDB.CMapNotFound:
                pass
            return
        if name == 'begincodespacerange':
@ -317,7 +307,7 @@ class CMapParser(PSStackParser):
                #assert s1 <= e1
                for i in xrange(e1-s1+1):
                    x = sprefix+pack('>L',s1+i)[-vlen:]
-                    self.cmap.register_code2cid(x, cid+i)
+                    self.cmap.add_code2cid(x, cid+i)
            return
        if name == 'begincidchar':
@ -327,7 +317,7 @@ class CMapParser(PSStackParser):
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(code, str) and isinstance(cid, str):
-                    self.cmap.register_code2cid(code, nunpack(cid))
+                    self.cmap.add_code2cid(code, nunpack(cid))
            return
        if name == 'beginbfrange':
@ -343,7 +333,7 @@ class CMapParser(PSStackParser):
                #assert s1 <= e1
                if isinstance(code, list):
                    for i in xrange(e1-s1+1):
-                        self.cmap.register_cid2code(s1+i, code[i])
+                        self.cmap.add_cid2unicode(s1+i, code[i])
                else:
                    var = code[-4:]
                    base = nunpack(var)
@ -351,7 +341,7 @@ class CMapParser(PSStackParser):
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
-                        self.cmap.register_cid2code(s1+i, x)
+                        self.cmap.add_cid2unicode(s1+i, x)
            return
        if name == 'beginbfchar':
@ -361,7 +351,7 @@ class CMapParser(PSStackParser):
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
-                    self.cmap.register_cid2code(nunpack(cid), code)
+                    self.cmap.add_cid2unicode(nunpack(cid), code)
            return
        if name == 'beginnotdefrange':
@ -373,123 +363,3 @@ class CMapParser(PSStackParser):
        self.push((pos, token))
        return
 ##  FontMetricsDB
 ##
 class FontMetricsDB(object):
    @classmethod
    def get_metrics(klass, fontname):
        return FONT_METRICS[fontname]
 ##  EncodingDB
 ##
 class EncodingDB(object):
    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std: std2unicode[std] = c
        if mac: mac2unicode[mac] = c
        if win: win2unicode[win] = c
        if pdf: pdf2unicode[pdf] = c
    encodings = {
      'StandardEncoding': std2unicode,
      'MacRomanEncoding': mac2unicode,
      'WinAnsiEncoding': win2unicode,
      'PDFDocEncoding': pdf2unicode,
      }
    @classmethod
    def get_encoding(klass, name, diff=None):
        cid2unicode = klass.encodings.get(name, klass.std2unicode)
        if diff:
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = unichr(name2unicode(x.name))
                    except KeyError:
                        pass
                    cid += 1
        return cid2unicode
 ##  CMap -> CMapCDB conversion
 ##
 def dump_cdb(cmap, cdbfile, verbose=1):
    """Writes a CMap object into a cdb file."""
    m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>sys.stderr, 'Writing: %r...' % cdbfile
    for (k,v) in cmap.getall_attrs():
        m.add('/'+k, repr(v))
    for (code,cid) in cmap.getall_code2cid():
        m.add('c'+code, pack('>L',cid))
    for (cid,code) in cmap.getall_cid2code():
        m.add('i'+pack('>L',cid), code)
    m.finish()
    return
 def convert_cmap(cmapdir, outputdir, force=False):
    """Convert all CMap source files in a directory into cdb files."""
    cmapdb = CMapDB(cmapdir)
    for fname in os.listdir(cmapdir):
        if '.' in fname: continue
        cmapname = os.path.basename(fname)
        cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
        if not force and os.path.exists(cdbname):
            print >>sys.stderr, 'Skipping: %r' % cmapname
            continue
        print >>sys.stderr, 'Reading: %r...' % cmapname
        cmap = cmapdb.get_cmap(cmapname)
        dump_cdb(cmap, cdbname)
    return
 def main(argv):
    """Converts CMap files into cdb files.
    usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
    """
    import getopt
    def usage():
        print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'f')
    except getopt.GetoptError:
        return usage()
    if args:
        cmapdir = args.pop(0)
    else:
        try:
            cmapdir = find_cmap_path()
        except IOError:
            print >>sys.stderr, 'cannot find CMap directory'
            return 1
    if args:
        outputdir = args.pop(0)
    else:
        outputdir = cmapdir
    force = False
    for (k, v) in opts:
        if k == '-f': force = True
    if not os.path.isdir(cmapdir):
        print >>sys.stderr, 'directory does not exist: %r' % cmapdir
        return 1
    if not os.path.isdir(outputdir):
        print >>sys.stderr, 'directory does not exist: %r' % outputdir
        return 1
    return convert_cmap(cmapdir, outputdir, force=force)
 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -29,7 +29,7 @@ class TagExtractor(PDFDevice):
            chars = font.decode(obj)
            for cid in chars:
                try:
-                    char = font.to_unicode(cid)
+                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
                    pass
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@ -0,0 +1,58 @@
 #!/usr/bin/env python
 import re
 from psparser import PSLiteral
 from glyphlist import charname2unicode
 from latin_enc import ENCODING
 ##  name2unicode
 ##
 STRIP_NAME = re.compile(r'[0-9]+')
 def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    Names present in charname2unicode are looked up directly.  Any other
    name falls back to the first run of decimal digits found in it
    (STRIP_NAME), returned as an int.  KeyError(name) is raised when the
    name is unknown and contains no digits.
    """
    if name in charname2unicode:
        return charname2unicode[name]
    m = STRIP_NAME.search(name)
    if not m: raise KeyError(name)
    return int(m.group(0))
 ##  EncodingDB
 ##
 class EncodingDB(object):
    """Character-code to unicode-character tables for the named encodings.

    The four tables are filled once, while the class body executes, from
    the rows of ENCODING.
    """
    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # Each ENCODING row is (glyph name, std, mac, win, pdf); a falsy entry
    # in a column means the glyph gets no entry in that table.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std: std2unicode[std] = c
        if mac: mac2unicode[mac] = c
        if win: win2unicode[win] = c
        if pdf: pdf2unicode[pdf] = c
    # Encoding name -> table, consulted by get_encoding().
    encodings = {
      'StandardEncoding': std2unicode,
      'MacRomanEncoding': mac2unicode,
      'WinAnsiEncoding': win2unicode,
      'PDFDocEncoding': pdf2unicode,
      }
    @classmethod
    def get_encoding(klass, name, diff=None):
        """Returns the table for the encoding name, with diff applied.

        An unrecognized name falls back to std2unicode.  Without diff the
        shared class-level table itself is returned; with diff a copy is
        modified and returned.  diff is iterated in order: an int sets the
        current code, and each PSLiteral assigns its glyph to the current
        code and then advances the code by one.  A glyph name that
        name2unicode() rejects with KeyError is skipped, but the code
        still advances.
        """
        cid2unicode = klass.encodings.get(name, klass.std2unicode)
        if diff:
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = unichr(name2unicode(x.name))
                    except KeyError:
                        pass
                    cid += 1
        return cid2unicode
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -89,7 +89,7 @@ class PDFTextDevice(PDFDevice):
            else:
                for cid in font.decode(obj):
                    try:
-                        char = font.to_unicode(cid)
+                        char = font.to_unichr(cid)
                    except PDFUnicodeNotDefined, e:
                        (cidcoding, cid) = e.args
                        char = self.handle_undefined_char(cidcoding, cid)
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -4,17 +4,27 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
-from cmapdb import CMap, CMapDB, CMapParser
+from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
-from cmapdb import FontMetricsDB, EncodingDB
+from encodingdb import EncodingDB
 from struct import pack, unpack
 from psparser import LIT, STRICT
 from psparser import PSLiteral, literal_name
 from pdftypes import PDFException, resolve1
 from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
 from fontmetrics import FONT_METRICS
 from utils import apply_matrix_norm, nunpack
 ##  FontMetricsDB
 ##
 class FontMetricsDB(object):
    """Thin accessor around the FONT_METRICS table."""
    @classmethod
    def get_metrics(klass, fontname):
        """Returns FONT_METRICS[fontname]; an unknown name raises the lookup's own error (KeyError if FONT_METRICS is a dict)."""
        return FONT_METRICS[fontname]
 ##  CFFFont
 ##  (Format specified in Adobe Technical Note: #5176
 ##   "The Compact Font Format Specification")
@ -249,7 +259,7 @@ class TrueTypeFont(object):
            self.tables[name] = (offset, length)
        return
-    def create_cmap(self):
+    def create_unicode_map(self):
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
@ -302,9 +312,11 @@ class TrueTypeFont(object):
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
-        gid2char = dict( (gid, pack('>H', char))
+        # create unicode map
-                         for (char,gid) in char2gid.iteritems() )
+        unicode_map = FileUnicodeMap()
-        return CMap().update(char2gid, gid2char)
+        for (char,gid) in char2gid.iteritems():
            unicode_map.add_cid2code(gid, char)
        return unicode_map
 ##  Fonts
@ -383,20 +395,19 @@ class PDFSimpleFont(PDFFont):
            self.encoding = EncodingDB.get_encoding(name, diff)
        else:
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
-        self.ucs2_cmap = None
+        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
-            self.ucs2_cmap = CMap()
+            self.unicode_map = FileUnicodeMap()
-            CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
+            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)
        return
-    def to_unicode(self, cid):
+    def to_unichr(self, cid):
-        if self.ucs2_cmap:
+        if self.unicode_map:
-            code = self.ucs2_cmap.tocode(cid)
+            code = self.unicode_map.get_unicode(cid)
-            if code:
+            if code is not None:
-                chars = unpack('>%dH' % (len(code)/2), code)
+                return unichr(code)
                return ''.join( unichr(c) for c in chars )
        try:
            return self.encoding[cid]
        except KeyError:
@ -476,9 +487,11 @@ class PDFCIDFont(PDFFont):
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
-            self.cmap = rsrc.get_cmap(name, strict=STRICT)
+            self.cmap = CMapDB.get_cmap(name)
        except CMapDB.CMapNotFound, e:
-            raise PDFFontError(e)
+            if STRICT:
                raise PDFFontError(e)
            self.cmap = CMap()
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
@ -490,21 +503,20 @@ class PDFCIDFont(PDFFont):
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
-        self.ucs2_cmap = None
+        self.unicode_map = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
-            self.ucs2_cmap = CMap()
+            self.unicode_map = FileUnicodeMap()
-            CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
+            CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
-                    self.ucs2_cmap = ttf.create_cmap()
+                    self.unicode_map = ttf.create_unicode_map()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
-                self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
+                self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
                                               strict=STRICT)
            except CMapDB.CMapNotFound, e:
                raise PDFFontError(e)
@ -558,14 +570,13 @@ class PDFCIDFont(PDFFont):
    def char_disp(self, cid):
        return self.disps.get(cid, self.default_disp)
-    def to_unicode(self, cid):
+    def to_unichr(self, cid):
-        if not self.ucs2_cmap:
+        if not self.unicode_map:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
-        code = self.ucs2_cmap.tocode(cid)
+        code = self.unicode_map.get_unicode(cid)
-        if not code:
+        if code is not None:
-            raise PDFUnicodeNotDefined(self.cidcoding, cid)
+            return unichr(code)
-        chars = unpack('>%dH' % (len(code)/2), code)
+        raise PDFUnicodeNotDefined(self.cidcoding, cid)
        return ''.join( unichr(c) for c in chars )
 # main
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -6,7 +6,7 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from StringIO import StringIO
-from cmapdb import CMapDB
+from cmapdb import CMapDB, CMap
 from psparser import PSException, PSTypeError, PSEOF
 from psparser import PSKeyword, literal_name, keyword_name
 from psparser import PSStackParser
@ -106,9 +106,8 @@ class PDFResourceManager(object):
    '''
    debug = 0
-    def __init__(self, cmapdb):
+    def __init__(self):
        self.fonts = {}
        self.cmapdb = cmapdb
        return
    def get_procset(self, procs):
@ -123,7 +122,11 @@ class PDFResourceManager(object):
        return
    def get_cmap(self, cmapname, strict=False):
-        return self.cmapdb.get_cmap(cmapname, strict=strict)
+        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict: raise
            return CMapDB.CMap()
    def get_font(self, objid, spec):
        if objid and objid in self.fonts:
--- a/setup.py
+++ b/setup.py
@ -19,7 +19,8 @@ PDF parser that can be used for other purposes instead of text analysis.''',
    author_email='yusuke at cs dot nyu dot edu',
    url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
    packages=[
-    'pdfminer'
+    'pdfminer',
    'pdfminer.cmap'
    ],
    scripts=[
    'tools/pdf2txt.py',
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@ -0,0 +1,155 @@
 #!/usr/bin/env python
 import sys
 import os.path
 def process_cid2code(fp, check_codecs=[]):
    '''Parse a cid2code table into code->CID and CID->Unicode maps.

    fp is iterated line by line.  Blank lines and lines starting
    with '#' are skipped.  A line starting with 'CID' is the header:
    its tab-separated fields after the first one are the column names.
    Every other line is an integer CID followed by one tab-separated
    field per column.  A field of '*' means "no entry"; otherwise it
    is a comma-separated list of hex codes, where a trailing 'v'
    marks a code as a vertical one.

    check_codecs is a sequence of codec names used to rank the
    candidate characters of '-UTF8' columns (see get_unicode).

    Returns a tuple (code2cid, is_vertical, cid2unicode_h, cid2unicode_v):
      code2cid       {cmapname: nested dicts keyed by ord() of each
                      character of the code, with the cid as leaf}
      is_vertical    {cmapname: True} for each vertical map name seen
      cid2unicode_h  {cid: code point} built from '-UTF8' columns
      cid2unicode_v  {cid: code point} built from '-UTF8' columns
    '''

    def get_canonicals(name):
        # Map a column name to its (horizontal, vertical) cmap names.
        # A name that already ends in '-H' gets no vertical counterpart.
        if name.endswith('-H'):
            return (name, None)
        elif name == 'H':
            return ('H', 'V')
        else:
            return (name+'-H', name+'-V')

    def get_unicode(codes):
        # determine the "most popular" candidate: each distinct
        # character scores one point per codec in check_codecs that
        # can encode it, and the highest score is returned as a code point.
        d = {}
        for code in codes:
            char = unicode(code, 'utf-8')
            if char not in d:
                d[char] = 0
            for codec in check_codecs:
                try:
                    char.encode(codec, 'strict')
                    d[char] += 1
                except UnicodeError:
                    # this codec cannot represent the character: no point.
                    pass
        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
        return ord(chars[0])

    def put(dmap, code, cid, force=False):
        # Walk down one nested dict per leading character of code
        # (creating missing levels), then store cid under the last one.
        for b in code[:-1]:
            dmap = dmap.setdefault(ord(b), {})
        b = ord(code[-1])
        # Without force, a slot already holding a different cid is kept.
        if force or ((b not in dmap) or dmap[b] == cid):
            dmap[b] = cid
        return

    names = []
    code2cid = {} # {'cmapname': ...}
    is_vertical = {}
    cid2unicode_h = {} # {cid: unicode}
    cid2unicode_v = {} # {cid: unicode}
    for line in fp:
        line = line.strip()
        # A blank line must be dropped here: splitting it yields ['']
        # and int('') below would raise ValueError.
        if not line or line.startswith('#'): continue
        if line.startswith('CID'):
            names = line.split('\t')[1:]
            continue
        f = line.split('\t')
        cid = int(f[0])
        for (x,name) in zip(f[1:], names):
            if x == '*': continue
            (hmapname, vmapname) = get_canonicals(name)
            hmap = code2cid.setdefault(hmapname, {})
            vmap = None
            if vmapname:
                is_vertical[vmapname] = True
                vmap = code2cid.setdefault(vmapname, {})
            hcodes = []
            vcodes = []
            for code in x.split(','):
                vertical = code.endswith('v')
                if vertical:
                    code = code[:-1]
                try:
                    code = code.decode('hex')
                except Exception:
                    # hex decoding failed: fall back to a single
                    # character whose ordinal is the hex value.
                    code = chr(int(code, 16))
                if vertical:
                    vcodes.append(code)
                else:
                    hcodes.append(code)
            if vcodes:
                # Explicit vertical codes: they override whatever was
                # stored before in both maps (force=True).
                assert vmap is not None
                for code in vcodes:
                    put(vmap, code, cid, True)
                for code in hcodes:
                    put(hmap, code, cid, True)
                if name.endswith('-UTF8'):
                    if hcodes:
                        cid2unicode_h[cid] = get_unicode(hcodes)
                    if vcodes:
                        cid2unicode_v[cid] = get_unicode(vcodes)
            else:
                # No vertical codes: the horizontal codes are shared by
                # both maps, without overriding existing entries.
                for code in hcodes:
                    put(hmap, code, cid)
                    # '-H' columns have no vertical map to fill in.
                    if vmap is not None:
                        put(vmap, code, cid)
                if name.endswith('-UTF8') and hcodes:
                    code = get_unicode(hcodes)
                    if cid not in cid2unicode_h:
                        cid2unicode_h[cid] = code
                    if cid not in cid2unicode_v:
                        cid2unicode_v[cid] = code
    return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
 # main
 def main(argv):
    def usage():
        print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
        return 100
    def pyname(name):
        return name.replace('-','_')+'.py'
    args = argv[1:]
    if len(args) < 3: return usage()
    (outdir, regname, src) = args[:3]
    check_codecs = args[3:]
    print >>sys.stderr, 'reading %r...' % src
    fp = file(src)
    (code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
    fp.close()
    for (name, cmap) in code2cid.iteritems():
        fname = pyname(name)
        print >>sys.stderr, 'writing %r...' % fname
        fp = file(os.path.join(outdir, fname), 'w')
        print >>fp, '#!/usr/bin/env python'
        print >>fp, '#', fname
        print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
        print >>fp, 'CODE2CID = %r' % cmap
        fp.close()
    fname = 'TO_UNICODE_'+pyname(regname)
    print >>sys.stderr, 'writing %r...' % fname
    fp = file(os.path.join(outdir, fname), 'w')
    print >>fp, '#!/usr/bin/env python'
    print >>fp, '#', fname
    print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
    print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
    fp.close()
    return 0
 # Script entry point: run main() and use its return value as the exit status.
 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
-from pdfminer.cmapdb import CMapDB, find_cmap_path
+from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 # main
@ -22,8 +22,6 @@ def main(argv):
    if not args: return usage()
    # debug option
    debug = 0
    # path option
    cmapdir = find_cmap_path()
    # input option
    password = ''
    pagenos = set()
@ -38,7 +36,6 @@ def main(argv):
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-C': cmapdir = v
        elif k == '-P': password = v
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
@ -59,8 +56,7 @@ def main(argv):
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
-    cmapdb = CMapDB(cmapdir)
+    rsrc = PDFResourceManager()
    rsrc = PDFResourceManager(cmapdb)
    if not outtype:
        outtype = 'text'
        if outfile: