From a0dd46bd8e38c725cad91e2e9a3d3ddf240ada02 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 13 Jun 2010 13:50:24 +0000 Subject: [PATCH] cmap compression patch. thanks to Jakub Wilk git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@228 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 12 +++++------ pdfminer/cmap/Makefile | 3 +-- pdfminer/cmapdb.py | 47 +++++++++++++++++++++++++++++------------- setup.py | 3 +++ tools/conv_cmap.py | 31 ++++++++++++++-------------- 5 files changed, 59 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index ae574c4..96cb1c2 100644 --- a/Makefile +++ b/Makefile @@ -33,17 +33,17 @@ publish: CONV_CMAP=$(PYTHON) tools/conv_cmap.py CMAPSRC=cmaprsrc CMAPDST=pdfminer/cmap -cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \ - $(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py +cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \ + $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz cmap_clean: cd $(CMAPDST) && make cmap_clean -$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py: +$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5 -$(CMAPDST)/TO_UNICODE_Adobe_GB1.py: +$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312 -$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py: +$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp -$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: +$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr test: cmap diff --git a/pdfminer/cmap/Makefile b/pdfminer/cmap/Makefile index 044f49c..117e9f4 100644 --- a/pdfminer/cmap/Makefile +++ b/pdfminer/cmap/Makefile @@ -6,5 +6,4 @@ clean: -rm *.pyc *.pyo cmap_clean: - -rm *.py - touch __init__.py + rm -f *.pickle.gz diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 0ef4750..2902cb9 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -15,6 +15,9 @@ import sys import re import os import os.path +import gzip +import cPickle as pickle +import cmap from struct import pack, unpack from psparser import PSStackParser from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF @@ -210,34 +213,50 @@ class PyUnicodeMap(UnicodeMap): class CMapDB(object): debug = 0 + _cmap_cache = {} + _umap_cache = {} class CMapNotFound(CMapError): pass + @classmethod + def _load_data(klass, name): + filename = '%s.pickle.gz' % name + if klass.debug: + print >>sys.stderr, 'loading:', name + for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/': + path = os.path.join(directory, filename) + if os.path.exists(path): + gzfile = gzip.open(path) + try: + return type(name, (), pickle.loads(gzfile.read())) + finally: + gzfile.close() + else: + raise CMapDB.CMapNotFound(name) + @classmethod def get_cmap(klass, name): if name == 'Identity-H': return IdentityCMap(False) elif name == 'Identity-V': return IdentityCMap(True) - modname = 'pdfminer.cmap.%s' % name.replace('-','_') - if klass.debug: - print >>sys.stderr, 'loading:', modname try: - module = __import__(modname, fromlist=['pdfminer.cmap']) - except ImportError: - raise CMapDB.CMapNotFound(name) - return PyCMap(name, module) + return klass._cmap_cache[name] + except KeyError: + pass + data = klass._load_data(name) + klass._cmap_cache[name] = cmap = PyCMap(name, data) + return cmap @classmethod def get_unicode_map(klass, name, vertical=False): - modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_') - if klass.debug: - print >>sys.stderr, 'loading:', modname, vertical try: - module = __import__(modname, fromlist=['pdfminer.cmap']) - except ImportError: - raise CMapDB.CMapNotFound(name) - return PyUnicodeMap(name, module, vertical) + return klass._umap_cache[name][vertical] + except KeyError: + pass + data = klass._load_data('to-unicode-%s' % name) + klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)] + return umaps[vertical] ## CMapParser diff --git a/setup.py b/setup.py index baca2fe..fe20e37 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,9 @@ PDF parser that can be used for other purposes instead of text analysis.''', 'pdfminer', 'pdfminer.cmap', ], + package_data={ + 'pdfminer.cmap': ['*.pickle.gz'], + }, scripts=[ 'tools/pdf2txt.py', 'tools/dumppdf.py', diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index dcb190e..0420aa9 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import sys import os.path +import gzip +import cPickle as pickle def process_cid2code(fp, check_codecs=[]): @@ -118,9 +120,6 @@ def main(argv): print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0] return 100 - def pyname(name): - return name.replace('-','_')+'.py' - args = argv[1:] if len(args) < 3: return usage() (outdir, regname, src) = args[:3] @@ -132,22 +131,24 @@ def main(argv): fp.close() for (name, cmap) in code2cid.iteritems(): - fname = pyname(name) + fname = '%s.pickle.gz' % name print >>sys.stderr, 'writing %r...' % fname - fp = file(os.path.join(outdir, fname), 'w') - print >>fp, '#!/usr/bin/env python' - print >>fp, '#', fname - print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False) - print >>fp, 'CODE2CID = %r' % cmap + fp = gzip.open(os.path.join(outdir, fname), 'wb') + data = dict( + IS_VERTICAL=is_vertical.get(name, False), + CODE2CID=cmap, + ) + fp.write(pickle.dumps(data)) fp.close() - fname = 'TO_UNICODE_'+pyname(regname) + fname = 'to-unicode-%s.pickle.gz' % regname print >>sys.stderr, 'writing %r...' % fname - fp = file(os.path.join(outdir, fname), 'w') - print >>fp, '#!/usr/bin/env python' - print >>fp, '#', fname - print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h - print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v + fp = gzip.open(os.path.join(outdir, fname), 'wb') + data = dict( + CID2UNICHR_H=cid2unichr_h, + CID2UNICHR_V=cid2unichr_v, + ) + fp.write(pickle.dumps(data)) fp.close() return 0