CMap compression patch. Thanks to Jakub Wilk.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@228 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-06-13 13:50:24 +00:00
parent f2005bee55
commit a0dd46bd8e
5 changed files with 59 additions and 37 deletions

View File

@ -33,17 +33,17 @@ publish:
CONV_CMAP=$(PYTHON) tools/conv_cmap.py CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap CMAPDST=pdfminer/cmap
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \ cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
cmap_clean: cmap_clean:
cd $(CMAPDST) && make cmap_clean cd $(CMAPDST) && make cmap_clean
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5 $(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py: $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312 $(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py: $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp $(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap test: cmap

View File

@ -6,5 +6,4 @@ clean:
-rm *.pyc *.pyo -rm *.pyc *.pyo
cmap_clean: cmap_clean:
-rm *.py rm -f *.pickle.gz
touch __init__.py

View File

@ -15,6 +15,9 @@ import sys
import re import re
import os import os
import os.path import os.path
import gzip
import cPickle as pickle
import cmap
from struct import pack, unpack from struct import pack, unpack
from psparser import PSStackParser from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
@ -210,34 +213,50 @@ class PyUnicodeMap(UnicodeMap):
class CMapDB(object): class CMapDB(object):
debug = 0 debug = 0
_cmap_cache = {}
_umap_cache = {}
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError): pass
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
path = os.path.join(directory, filename)
if os.path.exists(path):
gzfile = gzip.open(path)
try:
return type(name, (), pickle.loads(gzfile.read()))
finally:
gzfile.close()
else:
raise CMapDB.CMapNotFound(name)
@classmethod @classmethod
def get_cmap(klass, name): def get_cmap(klass, name):
if name == 'Identity-H': if name == 'Identity-H':
return IdentityCMap(False) return IdentityCMap(False)
elif name == 'Identity-V': elif name == 'Identity-V':
return IdentityCMap(True) return IdentityCMap(True)
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname
try: try:
module = __import__(modname, fromlist=['pdfminer.cmap']) return klass._cmap_cache[name]
except ImportError: except KeyError:
raise CMapDB.CMapNotFound(name) pass
return PyCMap(name, module) data = klass._load_data(name)
klass._cmap_cache[name] = cmap = PyCMap(name, data)
return cmap
@classmethod @classmethod
def get_unicode_map(klass, name, vertical=False): def get_unicode_map(klass, name, vertical=False):
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname, vertical
try: try:
module = __import__(modname, fromlist=['pdfminer.cmap']) return klass._umap_cache[name][vertical]
except ImportError: except KeyError:
raise CMapDB.CMapNotFound(name) pass
return PyUnicodeMap(name, module, vertical) data = klass._load_data('to-unicode-%s' % name)
klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
return umaps[vertical]
## CMapParser ## CMapParser

View File

@ -22,6 +22,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
'pdfminer', 'pdfminer',
'pdfminer.cmap', 'pdfminer.cmap',
], ],
package_data={
'pdfminer.cmap': ['*.pickle.gz'],
},
scripts=[ scripts=[
'tools/pdf2txt.py', 'tools/pdf2txt.py',
'tools/dumppdf.py', 'tools/dumppdf.py',

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
import os.path import os.path
import gzip
import cPickle as pickle
def process_cid2code(fp, check_codecs=[]): def process_cid2code(fp, check_codecs=[]):
@ -118,9 +120,6 @@ def main(argv):
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0] print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
return 100 return 100
def pyname(name):
return name.replace('-','_')+'.py'
args = argv[1:] args = argv[1:]
if len(args) < 3: return usage() if len(args) < 3: return usage()
(outdir, regname, src) = args[:3] (outdir, regname, src) = args[:3]
@ -132,22 +131,24 @@ def main(argv):
fp.close() fp.close()
for (name, cmap) in code2cid.iteritems(): for (name, cmap) in code2cid.iteritems():
fname = pyname(name) fname = '%s.pickle.gz' % name
print >>sys.stderr, 'writing %r...' % fname print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w') fp = gzip.open(os.path.join(outdir, fname), 'wb')
print >>fp, '#!/usr/bin/env python' data = dict(
print >>fp, '#', fname IS_VERTICAL=is_vertical.get(name, False),
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False) CODE2CID=cmap,
print >>fp, 'CODE2CID = %r' % cmap )
fp.write(pickle.dumps(data))
fp.close() fp.close()
fname = 'TO_UNICODE_'+pyname(regname) fname = 'to-unicode-%s.pickle.gz' % regname
print >>sys.stderr, 'writing %r...' % fname print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w') fp = gzip.open(os.path.join(outdir, fname), 'wb')
print >>fp, '#!/usr/bin/env python' data = dict(
print >>fp, '#', fname CID2UNICHR_H=cid2unichr_h,
print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h CID2UNICHR_V=cid2unichr_v,
print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v )
fp.write(pickle.dumps(data))
fp.close() fp.close()
return 0 return 0