cmap compression patch. thanks to Jakub Wilk
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@228 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f2005bee55
commit
a0dd46bd8e
12
Makefile
12
Makefile
|
@ -33,17 +33,17 @@ publish:
|
|||
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
||||
CMAPSRC=cmaprsrc
|
||||
CMAPDST=pdfminer/cmap
|
||||
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
|
||||
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
|
||||
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
|
||||
cmap_clean:
|
||||
cd $(CMAPDST) && make cmap_clean
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
|
||||
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
|
||||
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
|
||||
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
|
||||
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
|
||||
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
|
||||
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
||||
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
|
||||
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
|
||||
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||
|
||||
test: cmap
|
||||
|
|
|
@ -6,5 +6,4 @@ clean:
|
|||
-rm *.pyc *.pyo
|
||||
|
||||
cmap_clean:
|
||||
-rm *.py
|
||||
touch __init__.py
|
||||
rm -f *.pickle.gz
|
||||
|
|
|
@ -15,6 +15,9 @@ import sys
|
|||
import re
|
||||
import os
|
||||
import os.path
|
||||
import gzip
|
||||
import cPickle as pickle
|
||||
import cmap
|
||||
from struct import pack, unpack
|
||||
from psparser import PSStackParser
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
||||
|
@ -210,34 +213,50 @@ class PyUnicodeMap(UnicodeMap):
|
|||
class CMapDB(object):
|
||||
|
||||
debug = 0
|
||||
_cmap_cache = {}
|
||||
_umap_cache = {}
|
||||
|
||||
class CMapNotFound(CMapError): pass
|
||||
|
||||
@classmethod
|
||||
def _load_data(klass, name):
|
||||
filename = '%s.pickle.gz' % name
|
||||
if klass.debug:
|
||||
print >>sys.stderr, 'loading:', name
|
||||
for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
|
||||
path = os.path.join(directory, filename)
|
||||
if os.path.exists(path):
|
||||
gzfile = gzip.open(path)
|
||||
try:
|
||||
return type(name, (), pickle.loads(gzfile.read()))
|
||||
finally:
|
||||
gzfile.close()
|
||||
else:
|
||||
raise CMapDB.CMapNotFound(name)
|
||||
|
||||
@classmethod
|
||||
def get_cmap(klass, name):
|
||||
if name == 'Identity-H':
|
||||
return IdentityCMap(False)
|
||||
elif name == 'Identity-V':
|
||||
return IdentityCMap(True)
|
||||
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
|
||||
if klass.debug:
|
||||
print >>sys.stderr, 'loading:', modname
|
||||
try:
|
||||
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
||||
except ImportError:
|
||||
raise CMapDB.CMapNotFound(name)
|
||||
return PyCMap(name, module)
|
||||
return klass._cmap_cache[name]
|
||||
except KeyError:
|
||||
pass
|
||||
data = klass._load_data(name)
|
||||
klass._cmap_cache[name] = cmap = PyCMap(name, data)
|
||||
return cmap
|
||||
|
||||
@classmethod
|
||||
def get_unicode_map(klass, name, vertical=False):
|
||||
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
|
||||
if klass.debug:
|
||||
print >>sys.stderr, 'loading:', modname, vertical
|
||||
try:
|
||||
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
||||
except ImportError:
|
||||
raise CMapDB.CMapNotFound(name)
|
||||
return PyUnicodeMap(name, module, vertical)
|
||||
return klass._umap_cache[name][vertical]
|
||||
except KeyError:
|
||||
pass
|
||||
data = klass._load_data('to-unicode-%s' % name)
|
||||
klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
|
||||
return umaps[vertical]
|
||||
|
||||
|
||||
## CMapParser
|
||||
|
|
3
setup.py
3
setup.py
|
@ -22,6 +22,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
|||
'pdfminer',
|
||||
'pdfminer.cmap',
|
||||
],
|
||||
package_data={
|
||||
'pdfminer.cmap': ['*.pickle.gz'],
|
||||
},
|
||||
scripts=[
|
||||
'tools/pdf2txt.py',
|
||||
'tools/dumppdf.py',
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
import os.path
|
||||
import gzip
|
||||
import cPickle as pickle
|
||||
|
||||
def process_cid2code(fp, check_codecs=[]):
|
||||
|
||||
|
@ -118,9 +120,6 @@ def main(argv):
|
|||
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
|
||||
return 100
|
||||
|
||||
def pyname(name):
|
||||
return name.replace('-','_')+'.py'
|
||||
|
||||
args = argv[1:]
|
||||
if len(args) < 3: return usage()
|
||||
(outdir, regname, src) = args[:3]
|
||||
|
@ -132,22 +131,24 @@ def main(argv):
|
|||
fp.close()
|
||||
|
||||
for (name, cmap) in code2cid.iteritems():
|
||||
fname = pyname(name)
|
||||
fname = '%s.pickle.gz' % name
|
||||
print >>sys.stderr, 'writing %r...' % fname
|
||||
fp = file(os.path.join(outdir, fname), 'w')
|
||||
print >>fp, '#!/usr/bin/env python'
|
||||
print >>fp, '#', fname
|
||||
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
|
||||
print >>fp, 'CODE2CID = %r' % cmap
|
||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||
data = dict(
|
||||
IS_VERTICAL=is_vertical.get(name, False),
|
||||
CODE2CID=cmap,
|
||||
)
|
||||
fp.write(pickle.dumps(data))
|
||||
fp.close()
|
||||
|
||||
fname = 'TO_UNICODE_'+pyname(regname)
|
||||
fname = 'to-unicode-%s.pickle.gz' % regname
|
||||
print >>sys.stderr, 'writing %r...' % fname
|
||||
fp = file(os.path.join(outdir, fname), 'w')
|
||||
print >>fp, '#!/usr/bin/env python'
|
||||
print >>fp, '#', fname
|
||||
print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
|
||||
print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
|
||||
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||
data = dict(
|
||||
CID2UNICHR_H=cid2unichr_h,
|
||||
CID2UNICHR_V=cid2unichr_v,
|
||||
)
|
||||
fp.write(pickle.dumps(data))
|
||||
fp.close()
|
||||
|
||||
return 0
|
||||
|
|
Loading…
Reference in New Issue