cmap compression patch. Thanks to Jakub Wilk.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@228 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f2005bee55
commit
a0dd46bd8e
12
Makefile
12
Makefile
|
@ -33,17 +33,17 @@ publish:
|
||||||
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
||||||
CMAPSRC=cmaprsrc
|
CMAPSRC=cmaprsrc
|
||||||
CMAPDST=pdfminer/cmap
|
CMAPDST=pdfminer/cmap
|
||||||
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
|
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
|
||||||
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
|
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
|
||||||
cmap_clean:
|
cmap_clean:
|
||||||
cd $(CMAPDST) && make cmap_clean
|
cd $(CMAPDST) && make cmap_clean
|
||||||
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
|
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
||||||
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
|
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
||||||
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
|
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
||||||
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
|
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||||
|
|
||||||
test: cmap
|
test: cmap
|
||||||
|
|
|
@ -6,5 +6,4 @@ clean:
|
||||||
-rm *.pyc *.pyo
|
-rm *.pyc *.pyo
|
||||||
|
|
||||||
cmap_clean:
|
cmap_clean:
|
||||||
-rm *.py
|
rm -f *.pickle.gz
|
||||||
touch __init__.py
|
|
||||||
|
|
|
@ -15,6 +15,9 @@ import sys
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
|
import gzip
|
||||||
|
import cPickle as pickle
|
||||||
|
import cmap
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from psparser import PSStackParser
|
from psparser import PSStackParser
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
||||||
|
@ -210,34 +213,50 @@ class PyUnicodeMap(UnicodeMap):
|
||||||
class CMapDB(object):
|
class CMapDB(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
_cmap_cache = {}
|
||||||
|
_umap_cache = {}
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
class CMapNotFound(CMapError): pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _load_data(klass, name):
|
||||||
|
filename = '%s.pickle.gz' % name
|
||||||
|
if klass.debug:
|
||||||
|
print >>sys.stderr, 'loading:', name
|
||||||
|
for directory in os.path.dirname(cmap.__file__), '/usr/share/pdfminer/':
|
||||||
|
path = os.path.join(directory, filename)
|
||||||
|
if os.path.exists(path):
|
||||||
|
gzfile = gzip.open(path)
|
||||||
|
try:
|
||||||
|
return type(name, (), pickle.loads(gzfile.read()))
|
||||||
|
finally:
|
||||||
|
gzfile.close()
|
||||||
|
else:
|
||||||
|
raise CMapDB.CMapNotFound(name)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_cmap(klass, name):
|
def get_cmap(klass, name):
|
||||||
if name == 'Identity-H':
|
if name == 'Identity-H':
|
||||||
return IdentityCMap(False)
|
return IdentityCMap(False)
|
||||||
elif name == 'Identity-V':
|
elif name == 'Identity-V':
|
||||||
return IdentityCMap(True)
|
return IdentityCMap(True)
|
||||||
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
|
|
||||||
if klass.debug:
|
|
||||||
print >>sys.stderr, 'loading:', modname
|
|
||||||
try:
|
try:
|
||||||
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
return klass._cmap_cache[name]
|
||||||
except ImportError:
|
except KeyError:
|
||||||
raise CMapDB.CMapNotFound(name)
|
pass
|
||||||
return PyCMap(name, module)
|
data = klass._load_data(name)
|
||||||
|
klass._cmap_cache[name] = cmap = PyCMap(name, data)
|
||||||
|
return cmap
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_unicode_map(klass, name, vertical=False):
|
def get_unicode_map(klass, name, vertical=False):
|
||||||
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
|
|
||||||
if klass.debug:
|
|
||||||
print >>sys.stderr, 'loading:', modname, vertical
|
|
||||||
try:
|
try:
|
||||||
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
return klass._umap_cache[name][vertical]
|
||||||
except ImportError:
|
except KeyError:
|
||||||
raise CMapDB.CMapNotFound(name)
|
pass
|
||||||
return PyUnicodeMap(name, module, vertical)
|
data = klass._load_data('to-unicode-%s' % name)
|
||||||
|
klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
|
||||||
|
return umaps[vertical]
|
||||||
|
|
||||||
|
|
||||||
## CMapParser
|
## CMapParser
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -22,6 +22,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
'pdfminer',
|
'pdfminer',
|
||||||
'pdfminer.cmap',
|
'pdfminer.cmap',
|
||||||
],
|
],
|
||||||
|
package_data={
|
||||||
|
'pdfminer.cmap': ['*.pickle.gz'],
|
||||||
|
},
|
||||||
scripts=[
|
scripts=[
|
||||||
'tools/pdf2txt.py',
|
'tools/pdf2txt.py',
|
||||||
'tools/dumppdf.py',
|
'tools/dumppdf.py',
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
import os.path
|
import os.path
|
||||||
|
import gzip
|
||||||
|
import cPickle as pickle
|
||||||
|
|
||||||
def process_cid2code(fp, check_codecs=[]):
|
def process_cid2code(fp, check_codecs=[]):
|
||||||
|
|
||||||
|
@ -118,9 +120,6 @@ def main(argv):
|
||||||
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
|
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
|
|
||||||
def pyname(name):
|
|
||||||
return name.replace('-','_')+'.py'
|
|
||||||
|
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
if len(args) < 3: return usage()
|
if len(args) < 3: return usage()
|
||||||
(outdir, regname, src) = args[:3]
|
(outdir, regname, src) = args[:3]
|
||||||
|
@ -132,22 +131,24 @@ def main(argv):
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
for (name, cmap) in code2cid.iteritems():
|
for (name, cmap) in code2cid.iteritems():
|
||||||
fname = pyname(name)
|
fname = '%s.pickle.gz' % name
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
fp = file(os.path.join(outdir, fname), 'w')
|
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||||
print >>fp, '#!/usr/bin/env python'
|
data = dict(
|
||||||
print >>fp, '#', fname
|
IS_VERTICAL=is_vertical.get(name, False),
|
||||||
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
|
CODE2CID=cmap,
|
||||||
print >>fp, 'CODE2CID = %r' % cmap
|
)
|
||||||
|
fp.write(pickle.dumps(data))
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
fname = 'TO_UNICODE_'+pyname(regname)
|
fname = 'to-unicode-%s.pickle.gz' % regname
|
||||||
print >>sys.stderr, 'writing %r...' % fname
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
fp = file(os.path.join(outdir, fname), 'w')
|
fp = gzip.open(os.path.join(outdir, fname), 'wb')
|
||||||
print >>fp, '#!/usr/bin/env python'
|
data = dict(
|
||||||
print >>fp, '#', fname
|
CID2UNICHR_H=cid2unichr_h,
|
||||||
print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
|
CID2UNICHR_V=cid2unichr_v,
|
||||||
print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
|
)
|
||||||
|
fp.write(pickle.dumps(data))
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
Loading…
Reference in New Issue