cmap compression patch. thanks to Jakub Wilk

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@228 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-06-13 13:50:24 +00:00
parent f2005bee55
commit a0dd46bd8e
5 changed files with 59 additions and 37 deletions

View File

@ -33,17 +33,17 @@ publish:
# CONV_CMAP is the command that runs tools/conv_cmap.py with $(PYTHON);
# CMAPSRC and CMAPDST are the directories passed to it by the rules that
# build the cmap data files.
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
# `cmap` depends on one to-unicode file per registry
# (Adobe-CNS1, Adobe-GB1, Adobe-Japan1, Adobe-Korea1).
# NOTE(review): two prerequisite lists follow, one naming TO_UNICODE_*.py
# files and one naming to-unicode-*.pickle.gz files. This looks like the
# removed and added lines of a change shown together - confirm which list
# is current.
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
# Run the cmap_clean target of the Makefile located in $(CMAPDST).
# $(MAKE) is used instead of a literal `make` so that command-line flags
# such as -n and -j (and the jobserver) are passed on to the sub-make.
# The target names a command, not a file, so it is declared phony.
.PHONY: cmap_clean
cmap_clean:
	cd $(CMAPDST) && $(MAKE) cmap_clean
# One rule per registry. Each recipe runs $(CONV_CMAP) with, in order: the
# output directory, the registry name, the cid2code table from $(CMAPSRC),
# and the codec names to use.
# NOTE(review): every recipe is preceded by two target lines, a
# TO_UNICODE_*.py one and a to-unicode-*.pickle.gz one. This looks like the
# removed and added lines of a change shown together - confirm which target
# name is current.
# Adobe-CNS1 (codecs cp950, big5)
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
# Adobe-GB1 (codecs cp936, gb2312)
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
# Adobe-Japan1 (codecs cp932, euc-jp)
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
# Adobe-Korea1 (codecs cp949, euc-kr)
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap

View File

@ -6,5 +6,4 @@ clean:
-rm *.pyc *.pyo
# Delete generated files from the current directory.
# As listed, the recipe removes every *.py file (errors ignored because of
# the leading `-`), re-creates __init__.py with touch, and removes every
# *.pickle.gz file.
# NOTE(review): the *.py/touch lines and the *.pickle.gz line look like the
# removed and added lines of a change shown together - confirm which
# recipe lines are current.
cmap_clean:
-rm *.py
touch __init__.py
rm -f *.pickle.gz

View File

@ -15,6 +15,9 @@ import sys
import re
import os
import os.path
import gzip
import cPickle as pickle
import cmap
from struct import pack, unpack
from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
@ -210,34 +213,50 @@ class PyUnicodeMap(UnicodeMap):
# CMapDB: class-level lookup of CMap and Unicode-map objects by name.
# NOTE(review): this listing has no indentation, and get_cmap and
# get_unicode_map each contain both an __import__-based lookup and a
# cache/_load_data-based lookup. It looks like the removed and added lines
# of a change shown together - confirm which statements are current
# before editing.
class CMapDB(object):
# When true, the loaders print a 'loading:' line to stderr.
debug = 0
# name -> PyCMap object, filled in by get_cmap.
_cmap_cache = {}
# name -> [PyUnicodeMap built with False, PyUnicodeMap built with True],
# filled in by get_unicode_map and indexed there by `vertical`.
_umap_cache = {}
# Raised when no data can be found for a requested name.
class CMapNotFound(CMapError): pass
@classmethod
def _load_data(klass, name):
# Looks for '<name>.pickle.gz' first in the directory of the imported
# `cmap` module and then in '/usr/share/pdfminer/'. The first file that
# exists is read through gzip and unpickled, and the unpickled object is
# used as the attribute dictionary of a new class named `name`, which is
# returned.
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
path = os.path.join(directory, filename)
if os.path.exists(path):
gzfile = gzip.open(path)
try:
return type(name, (), pickle.loads(gzfile.read()))
finally:
# Close the gzip file whether or not unpickling succeeded.
gzfile.close()
# NOTE(review): presumably this `else` belongs to the `for` loop, so
# CMapNotFound is raised only after both directories were tried - confirm
# against the indented source.
else:
raise CMapDB.CMapNotFound(name)
@classmethod
def get_cmap(klass, name):
# Returns a CMap object for `name`.
# 'Identity-H' and 'Identity-V' are answered directly with
# IdentityCMap(False) and IdentityCMap(True).
if name == 'Identity-H':
return IdentityCMap(False)
elif name == 'Identity-V':
return IdentityCMap(True)
# Module-import lookup: imports 'pdfminer.cmap.<name with - replaced by _>'
# and wraps the module in PyCMap; ImportError becomes CMapNotFound.
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname
try:
module = __import__(modname, fromlist=['pdfminer.cmap'])
except ImportError:
raise CMapDB.CMapNotFound(name)
return PyCMap(name, module)
# Cache lookup: a cached PyCMap is returned if present; otherwise the data
# is loaded with _load_data, wrapped in PyCMap, stored in _cmap_cache and
# returned.
return klass._cmap_cache[name]
except KeyError:
pass
data = klass._load_data(name)
# The local name `cmap` assigned here is distinct from the `cmap` module
# that _load_data reads in its own scope.
klass._cmap_cache[name] = cmap = PyCMap(name, data)
return cmap
@classmethod
def get_unicode_map(klass, name, vertical=False):
# Returns a Unicode map object for `name`; `vertical` selects which
# variant is returned.
# Module-import lookup: imports 'pdfminer.cmap.TO_UNICODE_<name with -
# replaced by _>' and wraps the module in PyUnicodeMap; ImportError becomes
# CMapNotFound.
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
if klass.debug:
print >>sys.stderr, 'loading:', modname, vertical
try:
module = __import__(modname, fromlist=['pdfminer.cmap'])
except ImportError:
raise CMapDB.CMapNotFound(name)
return PyUnicodeMap(name, module, vertical)
# Cache lookup: the cached pair is indexed by `vertical` if present;
# otherwise 'to-unicode-<name>' is loaded with _load_data and both variants
# (False and True) are built and cached together.
return klass._umap_cache[name][vertical]
except KeyError:
pass
data = klass._load_data('to-unicode-%s' % name)
klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
return umaps[vertical]
## CMapParser

View File

@ -22,6 +22,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
'pdfminer',
'pdfminer.cmap',
],
package_data={
'pdfminer.cmap': ['*.pickle.gz'],
},
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py',

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python
import sys
import os.path
import gzip
import cPickle as pickle
def process_cid2code(fp, check_codecs=[]):
@ -118,9 +120,6 @@ def main(argv):
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
return 100
def pyname(name):
return name.replace('-','_')+'.py'
args = argv[1:]
if len(args) < 3: return usage()
(outdir, regname, src) = args[:3]
@ -132,22 +131,24 @@ def main(argv):
fp.close()
for (name, cmap) in code2cid.iteritems():
fname = pyname(name)
fname = '%s.pickle.gz' % name
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
print >>fp, 'CODE2CID = %r' % cmap
fp = gzip.open(os.path.join(outdir, fname), 'wb')
data = dict(
IS_VERTICAL=is_vertical.get(name, False),
CODE2CID=cmap,
)
fp.write(pickle.dumps(data))
fp.close()
fname = 'TO_UNICODE_'+pyname(regname)
fname = 'to-unicode-%s.pickle.gz' % regname
print >>sys.stderr, 'writing %r...' % fname
fp = file(os.path.join(outdir, fname), 'w')
print >>fp, '#!/usr/bin/env python'
print >>fp, '#', fname
print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
fp = gzip.open(os.path.join(outdir, fname), 'wb')
data = dict(
CID2UNICHR_H=cid2unichr_h,
CID2UNICHR_V=cid2unichr_v,
)
fp.write(pickle.dumps(data))
fp.close()
return 0