include cmap
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@162 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
ed8a5362b9
commit
e4b089e327
6
MANIFEST
6
MANIFEST
|
@ -6,7 +6,8 @@ pdfminer/Makefile
|
||||||
pdfminer/__init__.py
|
pdfminer/__init__.py
|
||||||
pdfminer/arcfour.py
|
pdfminer/arcfour.py
|
||||||
pdfminer/ascii85.py
|
pdfminer/ascii85.py
|
||||||
pdfminer/cmap.py
|
pdfminer/cmapdb.py
|
||||||
|
pdfminer/encodingdb.py
|
||||||
pdfminer/converter.py
|
pdfminer/converter.py
|
||||||
pdfminer/fontmetrics.py
|
pdfminer/fontmetrics.py
|
||||||
pdfminer/glyphlist.py
|
pdfminer/glyphlist.py
|
||||||
|
@ -24,6 +25,8 @@ pdfminer/psparser.py
|
||||||
pdfminer/pycdb.py
|
pdfminer/pycdb.py
|
||||||
pdfminer/rijndael.py
|
pdfminer/rijndael.py
|
||||||
pdfminer/utils.py
|
pdfminer/utils.py
|
||||||
|
pdfminer/cmap/Makefile
|
||||||
|
pdfminer/cmap/__init__.py
|
||||||
tools/Makefile
|
tools/Makefile
|
||||||
tools/dumppdf.py
|
tools/dumppdf.py
|
||||||
tools/pdf2txt.py
|
tools/pdf2txt.py
|
||||||
|
@ -40,3 +43,4 @@ samples/i1040nr.pdf
|
||||||
samples/kampo.pdf
|
samples/kampo.pdf
|
||||||
samples/naacl06-shinyama.pdf
|
samples/naacl06-shinyama.pdf
|
||||||
samples/nlp2004slides.pdf
|
samples/nlp2004slides.pdf
|
||||||
|
cmaprsrc/README.txt
|
||||||
|
|
12
Makefile
12
Makefile
|
@ -36,3 +36,15 @@ register: clean
|
||||||
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
||||||
publish:
|
publish:
|
||||||
$(CP) docs/*.html $(WEBDIR)
|
$(CP) docs/*.html $(WEBDIR)
|
||||||
|
|
||||||
|
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
|
||||||
|
CMAPDIR=pdfminer/cmap
|
||||||
|
CMAPRSRC=cmaprsrc
|
||||||
|
cmap: cmaprsrc
|
||||||
|
$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
|
||||||
|
$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
|
||||||
|
$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
|
||||||
|
$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||||
|
|
||||||
|
cmap_clean:
|
||||||
|
cd $(CMAPDIR) && make cmap_clean
|
||||||
|
|
2
TODO
2
TODO
|
@ -4,4 +4,4 @@ TODOs:
|
||||||
- Better API Documentation.
|
- Better API Documentation.
|
||||||
- Robust error handling.
|
- Robust error handling.
|
||||||
- Any special handling for linearized PDFs?
|
- Any special handling for linearized PDFs?
|
||||||
- Handle crypt filter. (I need more samples!)
|
- Handle crypt filter. (More sample documents are needed!)
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
README.txt for cmaprsrc
|
||||||
|
|
||||||
|
This directory contains Adobe CMap resources. CMaps are required
|
||||||
|
to decode text data written in the Chinese, Japanese or Korean languages.
|
||||||
|
CMap resources are now freely available from the Adobe web site:
|
||||||
|
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
|
||||||
|
|
||||||
|
The following files were extracted from the downloadable tarballs:
|
||||||
|
|
||||||
|
cid2code_Adobe_CNS1.txt:
|
||||||
|
http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
|
||||||
|
|
||||||
|
cid2code_Adobe_GB1.txt:
|
||||||
|
http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
|
||||||
|
|
||||||
|
cid2code_Adobe_Japan1.txt:
|
||||||
|
http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
|
||||||
|
|
||||||
|
cid2code_Adobe_Korea1.txt:
|
||||||
|
http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
|
||||||
|
|
||||||
|
|
||||||
|
Here is the license information in the original files:
|
||||||
|
|
||||||
|
%%Copyright: -----------------------------------------------------------
|
||||||
|
%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
|
||||||
|
%%Copyright: All rights reserved.
|
||||||
|
%%Copyright:
|
||||||
|
%%Copyright: Redistribution and use in source and binary forms, with or
|
||||||
|
%%Copyright: without modification, are permitted provided that the
|
||||||
|
%%Copyright: following conditions are met:
|
||||||
|
%%Copyright:
|
||||||
|
%%Copyright: Redistributions of source code must retain the above
|
||||||
|
%%Copyright: copyright notice, this list of conditions and the following
|
||||||
|
%%Copyright: disclaimer.
|
||||||
|
%%Copyright:
|
||||||
|
%%Copyright: Redistributions in binary form must reproduce the above
|
||||||
|
%%Copyright: copyright notice, this list of conditions and the following
|
||||||
|
%%Copyright: disclaimer in the documentation and/or other materials
|
||||||
|
%%Copyright: provided with the distribution.
|
||||||
|
%%Copyright:
|
||||||
|
%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
|
||||||
|
%%Copyright: of its contributors may be used to endorse or promote
|
||||||
|
%%Copyright: products derived from this software without specific prior
|
||||||
|
%%Copyright: written permission.
|
||||||
|
%%Copyright:
|
||||||
|
%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||||
|
%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||||
|
%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
||||||
|
%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||||
|
%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||||
|
%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
%%Copyright: -----------------------------------------------------------
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -6,6 +6,7 @@ all:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm *.pyc *.pyo
|
-rm *.pyc *.pyo
|
||||||
|
cd cmap && make clean
|
||||||
|
|
||||||
check:
|
check:
|
||||||
$(PYCHECKER) *.py
|
$(PYCHECKER) *.py
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Makefile for pdfminer.cmap
|
||||||
|
|
||||||
|
all:
|
||||||
|
|
||||||
|
clean:
|
||||||
|
-rm *.pyc *.pyo
|
||||||
|
|
||||||
|
cmap_clean:
|
||||||
|
-rm *.py
|
||||||
|
touch __init__.py
|
|
@ -20,240 +20,229 @@ from psparser import PSStackParser
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
|
||||||
from psparser import PSLiteral, PSKeyword
|
from psparser import PSLiteral, PSKeyword
|
||||||
from psparser import literal_name, keyword_name
|
from psparser import literal_name, keyword_name
|
||||||
from fontmetrics import FONT_METRICS
|
from encodingdb import name2unicode
|
||||||
from latin_enc import ENCODING
|
|
||||||
from glyphlist import charname2unicode
|
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
try:
|
|
||||||
import cdb
|
|
||||||
except ImportError:
|
|
||||||
import pdfminer.pycdb as cdb
|
|
||||||
|
|
||||||
|
|
||||||
class CMapError(Exception): pass
|
class CMapError(Exception): pass
|
||||||
|
|
||||||
|
|
||||||
## find_cmap_path
|
|
||||||
##
|
|
||||||
def find_cmap_path():
|
|
||||||
"""Returns the location of CMap directory."""
|
|
||||||
for path in (os.environ.get('CMAP_PATH', '.'),
|
|
||||||
os.path.join(os.path.dirname(__file__), 'CMap')):
|
|
||||||
if os.path.isdir(path):
|
|
||||||
return path
|
|
||||||
raise IOError
|
|
||||||
|
|
||||||
|
|
||||||
## name2unicode
|
|
||||||
##
|
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
|
||||||
def name2unicode(name):
|
|
||||||
"""Converts Adobe glyph names to Unicode numbers."""
|
|
||||||
if name in charname2unicode:
|
|
||||||
return charname2unicode[name]
|
|
||||||
m = STRIP_NAME.search(name)
|
|
||||||
if not m: raise KeyError(name)
|
|
||||||
return int(m.group(0))
|
|
||||||
|
|
||||||
|
|
||||||
## CMap
|
## CMap
|
||||||
##
|
##
|
||||||
class CMap(object):
|
class CMap(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
|
def __init__(self, code2cid=None):
|
||||||
|
self.code2cid = code2cid or {}
|
||||||
|
return
|
||||||
|
|
||||||
|
def is_vertical(self):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def use_cmap(self, cmap):
|
||||||
|
assert isinstance(cmap, CMap)
|
||||||
|
def copy(dst, src):
|
||||||
|
for (k,v) in src.iteritems():
|
||||||
|
if isinstance(v, dict):
|
||||||
|
d = {}
|
||||||
|
dst[k] = d
|
||||||
|
copy(d, v)
|
||||||
|
else:
|
||||||
|
dst[k] = v
|
||||||
|
copy(self.code2cid, cmap.code2cid)
|
||||||
|
return
|
||||||
|
|
||||||
|
def decode(self, code):
|
||||||
|
if self.debug:
|
||||||
|
print >>sys.stderr, 'decode: %r, %r' % (self, code)
|
||||||
|
d = self.code2cid
|
||||||
|
for c in code:
|
||||||
|
c = ord(c)
|
||||||
|
if c in d:
|
||||||
|
d = d[c]
|
||||||
|
if isinstance(d, int):
|
||||||
|
yield d
|
||||||
|
d = self.code2cid
|
||||||
|
else:
|
||||||
|
d = self.code2cid
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## IdentityCMap
|
||||||
|
##
|
||||||
|
class IdentityCMap(object):
|
||||||
|
|
||||||
|
def __init__(self, vertical):
|
||||||
|
self.vertical = vertical
|
||||||
|
return
|
||||||
|
|
||||||
|
def is_vertical(self):
|
||||||
|
return self.vertical
|
||||||
|
|
||||||
|
def decode(self, code):
|
||||||
|
return unpack('>%dH' % (len(code)/2), code)
|
||||||
|
|
||||||
|
|
||||||
|
## UnicodeMap
|
||||||
|
##
|
||||||
|
class UnicodeMap(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
|
def __init__(self, cid2unicode=None):
|
||||||
|
self.cid2unicode = cid2unicode or {}
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_unicode(self, cid):
|
||||||
|
if self.debug:
|
||||||
|
print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
|
||||||
|
return self.cid2unicode.get(cid)
|
||||||
|
|
||||||
|
|
||||||
|
## FileCMap
|
||||||
|
##
|
||||||
|
class FileCMap(CMap):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.code2cid = {}
|
CMap.__init__(self)
|
||||||
self.cid2code = {}
|
|
||||||
self.attrs = {}
|
self.attrs = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<CMap: %s>' % self.attrs.get('CMapName')
|
return '<CMap: %s>' % self.attrs.get('CMapName')
|
||||||
|
|
||||||
def update(self, code2cid=None, cid2code=None):
|
|
||||||
if code2cid:
|
|
||||||
self.code2cid.update(code2cid)
|
|
||||||
if cid2code:
|
|
||||||
self.cid2code.update(cid2code)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def copycmap(self, cmap):
|
|
||||||
self.code2cid.update(cmap.getall_code2cid())
|
|
||||||
self.cid2code.update(cmap.getall_cid2code())
|
|
||||||
return self
|
|
||||||
|
|
||||||
def register_code2cid(self, code, cid):
|
|
||||||
if isinstance(code, str) and isinstance(cid, int):
|
|
||||||
self.code2cid[code] = cid
|
|
||||||
return self
|
|
||||||
|
|
||||||
def register_cid2code(self, cid, code):
|
|
||||||
if isinstance(cid, int):
|
|
||||||
if isinstance(code, PSLiteral):
|
|
||||||
self.cid2code[cid] = pack('>H', name2unicode(code.name))
|
|
||||||
elif isinstance(code, str):
|
|
||||||
self.cid2code[cid] = code
|
|
||||||
return self
|
|
||||||
|
|
||||||
def decode(self, bytes):
|
|
||||||
if self.debug:
|
|
||||||
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
|
|
||||||
x = ''
|
|
||||||
for c in bytes:
|
|
||||||
if x:
|
|
||||||
if x+c in self.code2cid:
|
|
||||||
yield self.code2cid[x+c]
|
|
||||||
x = ''
|
|
||||||
elif c in self.code2cid:
|
|
||||||
yield self.code2cid[c]
|
|
||||||
else:
|
|
||||||
x = c
|
|
||||||
return
|
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return self.attrs.get('WMode', 0)
|
return self.attrs.get('WMode', 0)
|
||||||
|
|
||||||
def tocid(self, code):
|
def set_attr(self, k, v):
|
||||||
return self.code2cid.get(code)
|
self.attrs[k] = v
|
||||||
def tocode(self, cid):
|
return
|
||||||
return self.cid2code.get(cid)
|
|
||||||
|
|
||||||
def getall_attrs(self):
|
def add_code2cid(self, code, cid):
|
||||||
return self.attrs.iteritems()
|
assert isinstance(code, str) and isinstance(cid, int)
|
||||||
def getall_code2cid(self):
|
d = self.code2cid
|
||||||
return self.code2cid.iteritems()
|
for c in code[:-1]:
|
||||||
def getall_cid2code(self):
|
c = ord(c)
|
||||||
return self.cid2code.iteritems()
|
if c in d:
|
||||||
|
d = d[c]
|
||||||
|
else:
|
||||||
|
t = {}
|
||||||
|
d[c] = t
|
||||||
|
d =t
|
||||||
|
c = ord(code[-1])
|
||||||
|
d[c] = cid
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## CDBCMap
|
## FileUnicodeMap
|
||||||
##
|
##
|
||||||
class CDBCMap(CMap):
|
class FileUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def __init__(self, cdbname):
|
def __init__(self):
|
||||||
CMap.__init__(self)
|
UnicodeMap.__init__(self)
|
||||||
self.cdbname = cdbname
|
self.attrs = {}
|
||||||
self.db = cdb.init(cdbname)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
|
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
||||||
|
|
||||||
def tocid(self, code):
|
def set_attr(self, k, v):
|
||||||
k = 'c'+code
|
self.attrs[k] = v
|
||||||
if not self.db.has_key(k):
|
return
|
||||||
return None
|
|
||||||
return unpack('>L', self.db[k])
|
def add_cid2unicode(self, cid, code):
|
||||||
def tocode(self, cid):
|
assert isinstance(cid, int)
|
||||||
k = 'i'+pack('>L', cid)
|
if isinstance(code, PSLiteral):
|
||||||
if not self.db.has_key(k):
|
# Interpret as an Adobe glyph name.
|
||||||
return None
|
self.cid2unicode[cid] = name2unicode(code.name)
|
||||||
return self.db[k]
|
elif isinstance(code, str):
|
||||||
|
# Interpret as UTF-16BE.
|
||||||
|
self.cid2unicode[cid] = unpack('>H', code)[0]
|
||||||
|
elif isinstance(code, int):
|
||||||
|
self.cid2unicode[cid] = code
|
||||||
|
else:
|
||||||
|
raise TypeError(code)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## PyCMap
|
||||||
|
##
|
||||||
|
class PyCMap(CMap):
|
||||||
|
|
||||||
|
def __init__(self, name, module):
|
||||||
|
CMap.__init__(self, module.CODE2CID)
|
||||||
|
self.name = name
|
||||||
|
self._is_vertical = module.IS_VERTICAL
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PyCMap: %s>' % (self.name)
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return (self.db.has_key('/WMode') and
|
return self._is_vertical
|
||||||
self.db['/WMode'] == '1')
|
|
||||||
|
|
||||||
def getall(self, c):
|
|
||||||
while 1:
|
## PyUnicodeMap
|
||||||
x = self.db.each()
|
##
|
||||||
if not x: break
|
class PyUnicodeMap(UnicodeMap):
|
||||||
(k,v) = x
|
|
||||||
if k.startswith(c):
|
def __init__(self, name, module, vertical):
|
||||||
yield (k[1:], unpack('>L', v)[0])
|
if vertical:
|
||||||
|
cid2unicode = module.CID2UNICODE_V
|
||||||
|
else:
|
||||||
|
cid2unicode = module.CID2UNICODE_H
|
||||||
|
UnicodeMap.__init__(self, cid2unicode)
|
||||||
|
self.name = name
|
||||||
return
|
return
|
||||||
|
|
||||||
def getall_attrs(self):
|
def __repr__(self):
|
||||||
while 1:
|
return '<PyUnicodeMap: %s>' % (self.name)
|
||||||
x = self.db.each()
|
|
||||||
if not x: break
|
|
||||||
(k,v) = x
|
|
||||||
if k.startswith('/'):
|
|
||||||
yield (k[1:], eval(v)[0])
|
|
||||||
return
|
|
||||||
|
|
||||||
def getall_cid2code(self):
|
|
||||||
return self.getall('i')
|
|
||||||
def getall_code2cid(self):
|
|
||||||
return self.getall('c')
|
|
||||||
|
|
||||||
def decode(self, bytes):
|
|
||||||
if self.debug:
|
|
||||||
print >>sys.stderr, 'decode: %r, %r' % (self, bytes)
|
|
||||||
x = ''
|
|
||||||
for c in bytes:
|
|
||||||
if x:
|
|
||||||
if x+c in self.code2cid:
|
|
||||||
yield self.code2cid[x+c]
|
|
||||||
elif self.db.has_key('c'+x+c):
|
|
||||||
(dest,) = unpack('>L', self.db['c'+x+c])
|
|
||||||
self.code2cid[x+c] = dest
|
|
||||||
yield dest
|
|
||||||
x = ''
|
|
||||||
elif c in self.code2cid:
|
|
||||||
yield self.code2cid[c]
|
|
||||||
elif self.db.has_key('c'+c):
|
|
||||||
(dest,) = unpack('>L', self.db['c'+c])
|
|
||||||
self.code2cid[c] = dest
|
|
||||||
yield dest
|
|
||||||
else:
|
|
||||||
x = c
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## CMapDB
|
## CMapDB
|
||||||
##
|
##
|
||||||
class CMapDB(object):
|
class CMapDB(object):
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
|
||||||
|
|
||||||
CMAP_ALIAS = { }
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, dirname=None, cdbdirname=None):
|
class CMapNotFound(CMapError): pass
|
||||||
if not dirname:
|
|
||||||
dirname = find_cmap_path()
|
|
||||||
self.dirname = dirname
|
|
||||||
self.cdbdirname = cdbdirname or dirname
|
|
||||||
self.cmapdb = {}
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_cmap(self, cmapname, strict=True):
|
@classmethod
|
||||||
cmapname = self.CMAP_ALIAS.get(cmapname, cmapname)
|
def get_cmap(klass, name):
|
||||||
if cmapname in self.cmapdb:
|
if name == 'Identity-H':
|
||||||
cmap = self.cmapdb[cmapname]
|
return IdentityCMap(False)
|
||||||
else:
|
elif name == 'Identity-V':
|
||||||
fname = os.path.join(self.dirname, cmapname)
|
return IdentityCMap(True)
|
||||||
cdbname = os.path.join(self.cdbdirname, cmapname+'.cmap.cdb')
|
modname = 'pdfminer.cmap.%s' % name.replace('-','_')
|
||||||
if os.path.exists(cdbname):
|
if klass.debug:
|
||||||
if 1 <= self.debug:
|
print >>sys.stderr, 'loading:', modname
|
||||||
print >>sys.stderr, 'Opening: CDBCMap %r...' % cdbname
|
try:
|
||||||
cmap = CDBCMap(cdbname)
|
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
||||||
elif os.path.exists(fname):
|
except ImportError:
|
||||||
if 1 <= self.debug:
|
raise CMapDB.CMapNotFound(name)
|
||||||
print >>sys.stderr, 'Reading: CMap %r...' % fname
|
return PyCMap(name, module)
|
||||||
cmap = CMap()
|
|
||||||
fp = file(fname, 'rb')
|
@classmethod
|
||||||
CMapParser(self, cmap, fp).run()
|
def get_unicode_map(klass, name, vertical=False):
|
||||||
fp.close()
|
modname = 'pdfminer.cmap.TO_UNICODE_%s' % name.replace('-','_')
|
||||||
elif not strict:
|
if klass.debug:
|
||||||
cmap = CMap() # just create empty cmap
|
print >>sys.stderr, 'loading:', modname, vertical
|
||||||
else:
|
try:
|
||||||
raise CMapDB.CMapNotFound(cmapname)
|
module = __import__(modname, fromlist=['pdfminer.cmap'])
|
||||||
self.cmapdb[cmapname] = cmap
|
except ImportError:
|
||||||
return cmap
|
raise CMapDB.CMapNotFound(name)
|
||||||
|
return PyUnicodeMap(name, module, vertical)
|
||||||
|
|
||||||
|
|
||||||
## CMapParser
|
## CMapParser
|
||||||
##
|
##
|
||||||
class CMapParser(PSStackParser):
|
class CMapParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, cmapdb, cmap, fp):
|
def __init__(self, cmap, fp):
|
||||||
PSStackParser.__init__(self, fp)
|
PSStackParser.__init__(self, fp)
|
||||||
self.cmapdb = cmapdb
|
|
||||||
self.cmap = cmap
|
self.cmap = cmap
|
||||||
self.in_cmap = False
|
self._in_cmap = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
|
@ -266,29 +255,30 @@ class CMapParser(PSStackParser):
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
name = token.name
|
name = token.name
|
||||||
if name == 'begincmap':
|
if name == 'begincmap':
|
||||||
self.in_cmap = True
|
self._in_cmap = True
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
elif name == 'endcmap':
|
elif name == 'endcmap':
|
||||||
self.in_cmap = False
|
self._in_cmap = False
|
||||||
return
|
return
|
||||||
if not self.in_cmap: return
|
if not self._in_cmap: return
|
||||||
#
|
#
|
||||||
if name == 'def':
|
if name == 'def':
|
||||||
try:
|
try:
|
||||||
((_,k),(_,v)) = self.pop(2)
|
((_,k),(_,v)) = self.pop(2)
|
||||||
self.cmap.attrs[literal_name(k)] = v
|
self.cmap.set_attr(literal_name(k), v)
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'usecmap':
|
if name == 'usecmap':
|
||||||
if self.cmapdb:
|
try:
|
||||||
try:
|
((_,cmapname),) = self.pop(1)
|
||||||
((_,cmapname),) = self.pop(1)
|
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||||
self.cmap.copycmap(self.cmapdb.get_cmap(literal_name(cmapname)))
|
except PSSyntaxError:
|
||||||
except PSSyntaxError:
|
pass
|
||||||
pass
|
except CMapDB.CMapNotFound:
|
||||||
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'begincodespacerange':
|
if name == 'begincodespacerange':
|
||||||
|
@ -317,7 +307,7 @@ class CMapParser(PSStackParser):
|
||||||
#assert s1 <= e1
|
#assert s1 <= e1
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = sprefix+pack('>L',s1+i)[-vlen:]
|
x = sprefix+pack('>L',s1+i)[-vlen:]
|
||||||
self.cmap.register_code2cid(x, cid+i)
|
self.cmap.add_code2cid(x, cid+i)
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'begincidchar':
|
if name == 'begincidchar':
|
||||||
|
@ -327,7 +317,7 @@ class CMapParser(PSStackParser):
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
for (cid,code) in choplist(2, objs):
|
for (cid,code) in choplist(2, objs):
|
||||||
if isinstance(code, str) and isinstance(cid, str):
|
if isinstance(code, str) and isinstance(cid, str):
|
||||||
self.cmap.register_code2cid(code, nunpack(cid))
|
self.cmap.add_code2cid(code, nunpack(cid))
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginbfrange':
|
if name == 'beginbfrange':
|
||||||
|
@ -343,7 +333,7 @@ class CMapParser(PSStackParser):
|
||||||
#assert s1 <= e1
|
#assert s1 <= e1
|
||||||
if isinstance(code, list):
|
if isinstance(code, list):
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
self.cmap.register_cid2code(s1+i, code[i])
|
self.cmap.add_cid2unicode(s1+i, code[i])
|
||||||
else:
|
else:
|
||||||
var = code[-4:]
|
var = code[-4:]
|
||||||
base = nunpack(var)
|
base = nunpack(var)
|
||||||
|
@ -351,7 +341,7 @@ class CMapParser(PSStackParser):
|
||||||
vlen = len(var)
|
vlen = len(var)
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = prefix+pack('>L',base+i)[-vlen:]
|
x = prefix+pack('>L',base+i)[-vlen:]
|
||||||
self.cmap.register_cid2code(s1+i, x)
|
self.cmap.add_cid2unicode(s1+i, x)
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginbfchar':
|
if name == 'beginbfchar':
|
||||||
|
@ -361,7 +351,7 @@ class CMapParser(PSStackParser):
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
for (cid,code) in choplist(2, objs):
|
for (cid,code) in choplist(2, objs):
|
||||||
if isinstance(cid, str) and isinstance(code, str):
|
if isinstance(cid, str) and isinstance(code, str):
|
||||||
self.cmap.register_cid2code(nunpack(cid), code)
|
self.cmap.add_cid2unicode(nunpack(cid), code)
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginnotdefrange':
|
if name == 'beginnotdefrange':
|
||||||
|
@ -373,123 +363,3 @@ class CMapParser(PSStackParser):
|
||||||
|
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## FontMetricsDB
|
|
||||||
##
|
|
||||||
class FontMetricsDB(object):
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_metrics(klass, fontname):
|
|
||||||
return FONT_METRICS[fontname]
|
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
|
||||||
##
|
|
||||||
class EncodingDB(object):
|
|
||||||
|
|
||||||
std2unicode = {}
|
|
||||||
mac2unicode = {}
|
|
||||||
win2unicode = {}
|
|
||||||
pdf2unicode = {}
|
|
||||||
for (name,std,mac,win,pdf) in ENCODING:
|
|
||||||
c = unichr(name2unicode(name))
|
|
||||||
if std: std2unicode[std] = c
|
|
||||||
if mac: mac2unicode[mac] = c
|
|
||||||
if win: win2unicode[win] = c
|
|
||||||
if pdf: pdf2unicode[pdf] = c
|
|
||||||
|
|
||||||
encodings = {
|
|
||||||
'StandardEncoding': std2unicode,
|
|
||||||
'MacRomanEncoding': mac2unicode,
|
|
||||||
'WinAnsiEncoding': win2unicode,
|
|
||||||
'PDFDocEncoding': pdf2unicode,
|
|
||||||
}
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_encoding(klass, name, diff=None):
|
|
||||||
cid2unicode = klass.encodings.get(name, klass.std2unicode)
|
|
||||||
if diff:
|
|
||||||
cid2unicode = cid2unicode.copy()
|
|
||||||
cid = 0
|
|
||||||
for x in diff:
|
|
||||||
if isinstance(x, int):
|
|
||||||
cid = x
|
|
||||||
elif isinstance(x, PSLiteral):
|
|
||||||
try:
|
|
||||||
cid2unicode[cid] = unichr(name2unicode(x.name))
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
cid += 1
|
|
||||||
return cid2unicode
|
|
||||||
|
|
||||||
|
|
||||||
## CMap -> CMapCDB conversion
|
|
||||||
##
|
|
||||||
def dump_cdb(cmap, cdbfile, verbose=1):
|
|
||||||
"""Writes a CMap object into a cdb file."""
|
|
||||||
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
|
|
||||||
if verbose:
|
|
||||||
print >>sys.stderr, 'Writing: %r...' % cdbfile
|
|
||||||
for (k,v) in cmap.getall_attrs():
|
|
||||||
m.add('/'+k, repr(v))
|
|
||||||
for (code,cid) in cmap.getall_code2cid():
|
|
||||||
m.add('c'+code, pack('>L',cid))
|
|
||||||
for (cid,code) in cmap.getall_cid2code():
|
|
||||||
m.add('i'+pack('>L',cid), code)
|
|
||||||
m.finish()
|
|
||||||
return
|
|
||||||
|
|
||||||
def convert_cmap(cmapdir, outputdir, force=False):
|
|
||||||
"""Convert all CMap source files in a directory into cdb files."""
|
|
||||||
cmapdb = CMapDB(cmapdir)
|
|
||||||
for fname in os.listdir(cmapdir):
|
|
||||||
if '.' in fname: continue
|
|
||||||
cmapname = os.path.basename(fname)
|
|
||||||
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
|
|
||||||
if not force and os.path.exists(cdbname):
|
|
||||||
print >>sys.stderr, 'Skipping: %r' % cmapname
|
|
||||||
continue
|
|
||||||
print >>sys.stderr, 'Reading: %r...' % cmapname
|
|
||||||
cmap = cmapdb.get_cmap(cmapname)
|
|
||||||
dump_cdb(cmap, cdbname)
|
|
||||||
return
|
|
||||||
|
|
||||||
def main(argv):
|
|
||||||
"""Converts CMap files into cdb files.
|
|
||||||
|
|
||||||
usage: python -m pdfminer.cmap [-f] [cmap_dir [output_dir]]
|
|
||||||
"""
|
|
||||||
|
|
||||||
import getopt
|
|
||||||
def usage():
|
|
||||||
print 'usage: %s [-f] [cmap_dir [output_dir]]' % argv[0]
|
|
||||||
return 100
|
|
||||||
try:
|
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'f')
|
|
||||||
except getopt.GetoptError:
|
|
||||||
return usage()
|
|
||||||
if args:
|
|
||||||
cmapdir = args.pop(0)
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
cmapdir = find_cmap_path()
|
|
||||||
except IOError:
|
|
||||||
print >>sys.stderr, 'cannot find CMap directory'
|
|
||||||
return 1
|
|
||||||
if args:
|
|
||||||
outputdir = args.pop(0)
|
|
||||||
else:
|
|
||||||
outputdir = cmapdir
|
|
||||||
force = False
|
|
||||||
for (k, v) in opts:
|
|
||||||
if k == '-f': force = True
|
|
||||||
if not os.path.isdir(cmapdir):
|
|
||||||
print >>sys.stderr, 'directory does not exist: %r' % cmapdir
|
|
||||||
return 1
|
|
||||||
if not os.path.isdir(outputdir):
|
|
||||||
print >>sys.stderr, 'directory does not exist: %r' % outputdir
|
|
||||||
return 1
|
|
||||||
return convert_cmap(cmapdir, outputdir, force=force)
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ class TagExtractor(PDFDevice):
|
||||||
chars = font.decode(obj)
|
chars = font.decode(obj)
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unichr(cid)
|
||||||
text += char
|
text += char
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
|
from psparser import PSLiteral
|
||||||
|
from glyphlist import charname2unicode
|
||||||
|
from latin_enc import ENCODING
|
||||||
|
|
||||||
|
|
||||||
|
## name2unicode
|
||||||
|
##
|
||||||
|
STRIP_NAME = re.compile(r'[0-9]+')
|
||||||
|
def name2unicode(name):
|
||||||
|
"""Converts Adobe glyph names to Unicode numbers."""
|
||||||
|
if name in charname2unicode:
|
||||||
|
return charname2unicode[name]
|
||||||
|
m = STRIP_NAME.search(name)
|
||||||
|
if not m: raise KeyError(name)
|
||||||
|
return int(m.group(0))
|
||||||
|
|
||||||
|
|
||||||
|
## EncodingDB
|
||||||
|
##
|
||||||
|
class EncodingDB(object):
|
||||||
|
|
||||||
|
std2unicode = {}
|
||||||
|
mac2unicode = {}
|
||||||
|
win2unicode = {}
|
||||||
|
pdf2unicode = {}
|
||||||
|
for (name,std,mac,win,pdf) in ENCODING:
|
||||||
|
c = unichr(name2unicode(name))
|
||||||
|
if std: std2unicode[std] = c
|
||||||
|
if mac: mac2unicode[mac] = c
|
||||||
|
if win: win2unicode[win] = c
|
||||||
|
if pdf: pdf2unicode[pdf] = c
|
||||||
|
|
||||||
|
encodings = {
|
||||||
|
'StandardEncoding': std2unicode,
|
||||||
|
'MacRomanEncoding': mac2unicode,
|
||||||
|
'WinAnsiEncoding': win2unicode,
|
||||||
|
'PDFDocEncoding': pdf2unicode,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_encoding(klass, name, diff=None):
|
||||||
|
cid2unicode = klass.encodings.get(name, klass.std2unicode)
|
||||||
|
if diff:
|
||||||
|
cid2unicode = cid2unicode.copy()
|
||||||
|
cid = 0
|
||||||
|
for x in diff:
|
||||||
|
if isinstance(x, int):
|
||||||
|
cid = x
|
||||||
|
elif isinstance(x, PSLiteral):
|
||||||
|
try:
|
||||||
|
cid2unicode[cid] = unichr(name2unicode(x.name))
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
cid += 1
|
||||||
|
return cid2unicode
|
|
@ -89,7 +89,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
else:
|
else:
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unichr(cid)
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
|
|
|
@ -4,17 +4,27 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from cmapdb import CMap, CMapDB, CMapParser
|
from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
|
||||||
from cmapdb import FontMetricsDB, EncodingDB
|
from encodingdb import EncodingDB
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from psparser import LIT, STRICT
|
from psparser import LIT, STRICT
|
||||||
from psparser import PSLiteral, literal_name
|
from psparser import PSLiteral, literal_name
|
||||||
from pdftypes import PDFException, resolve1
|
from pdftypes import PDFException, resolve1
|
||||||
from pdftypes import int_value, float_value, num_value
|
from pdftypes import int_value, float_value, num_value
|
||||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
|
from fontmetrics import FONT_METRICS
|
||||||
from utils import apply_matrix_norm, nunpack
|
from utils import apply_matrix_norm, nunpack
|
||||||
|
|
||||||
|
|
||||||
|
## FontMetricsDB
|
||||||
|
##
|
||||||
|
class FontMetricsDB(object):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_metrics(klass, fontname):
|
||||||
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
## CFFFont
|
## CFFFont
|
||||||
## (Format specified in Adobe Technical Note: #5176
|
## (Format specified in Adobe Technical Note: #5176
|
||||||
## "The Compact Font Format Specification")
|
## "The Compact Font Format Specification")
|
||||||
|
@ -249,7 +259,7 @@ class TrueTypeFont(object):
|
||||||
self.tables[name] = (offset, length)
|
self.tables[name] = (offset, length)
|
||||||
return
|
return
|
||||||
|
|
||||||
def create_cmap(self):
|
def create_unicode_map(self):
|
||||||
if 'cmap' not in self.tables:
|
if 'cmap' not in self.tables:
|
||||||
raise TrueTypeFont.CMapNotFound
|
raise TrueTypeFont.CMapNotFound
|
||||||
(base_offset, length) = self.tables['cmap']
|
(base_offset, length) = self.tables['cmap']
|
||||||
|
@ -302,9 +312,11 @@ class TrueTypeFont(object):
|
||||||
else:
|
else:
|
||||||
for c in xrange(sc, ec+1):
|
for c in xrange(sc, ec+1):
|
||||||
char2gid[c] = (c + idd) & 0xffff
|
char2gid[c] = (c + idd) & 0xffff
|
||||||
gid2char = dict( (gid, pack('>H', char))
|
# create unicode map
|
||||||
for (char,gid) in char2gid.iteritems() )
|
unicode_map = FileUnicodeMap()
|
||||||
return CMap().update(char2gid, gid2char)
|
for (char,gid) in char2gid.iteritems():
|
||||||
|
unicode_map.add_cid2code(gid, char)
|
||||||
|
return unicode_map
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
|
@ -383,20 +395,19 @@ class PDFSimpleFont(PDFFont):
|
||||||
self.encoding = EncodingDB.get_encoding(name, diff)
|
self.encoding = EncodingDB.get_encoding(name, diff)
|
||||||
else:
|
else:
|
||||||
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
||||||
self.ucs2_cmap = None
|
self.unicode_map = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.unicode_map = FileUnicodeMap()
|
||||||
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
|
CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
|
||||||
PDFFont.__init__(self, descriptor, widths)
|
PDFFont.__init__(self, descriptor, widths)
|
||||||
return
|
return
|
||||||
|
|
||||||
def to_unicode(self, cid):
|
def to_unichr(self, cid):
|
||||||
if self.ucs2_cmap:
|
if self.unicode_map:
|
||||||
code = self.ucs2_cmap.tocode(cid)
|
code = self.unicode_map.get_unicode(cid)
|
||||||
if code:
|
if code is not None:
|
||||||
chars = unpack('>%dH' % (len(code)/2), code)
|
return unichr(code)
|
||||||
return ''.join( unichr(c) for c in chars )
|
|
||||||
try:
|
try:
|
||||||
return self.encoding[cid]
|
return self.encoding[cid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -476,9 +487,11 @@ class PDFCIDFont(PDFFont):
|
||||||
raise PDFFontError('Encoding is unspecified')
|
raise PDFFontError('Encoding is unspecified')
|
||||||
name = 'unknown'
|
name = 'unknown'
|
||||||
try:
|
try:
|
||||||
self.cmap = rsrc.get_cmap(name, strict=STRICT)
|
self.cmap = CMapDB.get_cmap(name)
|
||||||
except CMapDB.CMapNotFound, e:
|
except CMapDB.CMapNotFound, e:
|
||||||
raise PDFFontError(e)
|
if STRICT:
|
||||||
|
raise PDFFontError(e)
|
||||||
|
self.cmap = CMap()
|
||||||
try:
|
try:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -490,21 +503,20 @@ class PDFCIDFont(PDFFont):
|
||||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||||
ttf = TrueTypeFont(self.basefont,
|
ttf = TrueTypeFont(self.basefont,
|
||||||
StringIO(self.fontfile.get_data()))
|
StringIO(self.fontfile.get_data()))
|
||||||
self.ucs2_cmap = None
|
self.unicode_map = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.unicode_map = FileUnicodeMap()
|
||||||
CMapParser(None, self.ucs2_cmap, StringIO(strm.get_data())).run()
|
CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
|
||||||
elif self.cidcoding == 'Adobe-Identity':
|
elif self.cidcoding == 'Adobe-Identity':
|
||||||
if ttf:
|
if ttf:
|
||||||
try:
|
try:
|
||||||
self.ucs2_cmap = ttf.create_cmap()
|
self.unicode_map = ttf.create_unicode_map()
|
||||||
except TrueTypeFont.CMapNotFound:
|
except TrueTypeFont.CMapNotFound:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
|
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
|
||||||
strict=STRICT)
|
|
||||||
except CMapDB.CMapNotFound, e:
|
except CMapDB.CMapNotFound, e:
|
||||||
raise PDFFontError(e)
|
raise PDFFontError(e)
|
||||||
|
|
||||||
|
@ -558,14 +570,13 @@ class PDFCIDFont(PDFFont):
|
||||||
def char_disp(self, cid):
|
def char_disp(self, cid):
|
||||||
return self.disps.get(cid, self.default_disp)
|
return self.disps.get(cid, self.default_disp)
|
||||||
|
|
||||||
def to_unicode(self, cid):
|
def to_unichr(self, cid):
|
||||||
if not self.ucs2_cmap:
|
if not self.unicode_map:
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
code = self.ucs2_cmap.tocode(cid)
|
code = self.unicode_map.get_unicode(cid)
|
||||||
if not code:
|
if code is not None:
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
return unichr(code)
|
||||||
chars = unpack('>%dH' % (len(code)/2), code)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
return ''.join( unichr(c) for c in chars )
|
|
||||||
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
|
|
|
@ -6,7 +6,7 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from cmapdb import CMapDB
|
from cmapdb import CMapDB, CMap
|
||||||
from psparser import PSException, PSTypeError, PSEOF
|
from psparser import PSException, PSTypeError, PSEOF
|
||||||
from psparser import PSKeyword, literal_name, keyword_name
|
from psparser import PSKeyword, literal_name, keyword_name
|
||||||
from psparser import PSStackParser
|
from psparser import PSStackParser
|
||||||
|
@ -106,9 +106,8 @@ class PDFResourceManager(object):
|
||||||
'''
|
'''
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, cmapdb):
|
def __init__(self):
|
||||||
self.fonts = {}
|
self.fonts = {}
|
||||||
self.cmapdb = cmapdb
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_procset(self, procs):
|
def get_procset(self, procs):
|
||||||
|
@ -123,7 +122,11 @@ class PDFResourceManager(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap(self, cmapname, strict=False):
|
def get_cmap(self, cmapname, strict=False):
|
||||||
return self.cmapdb.get_cmap(cmapname, strict=strict)
|
try:
|
||||||
|
return CMapDB.get_cmap(cmapname)
|
||||||
|
except CMapDB.CMapNotFound:
|
||||||
|
if strict: raise
|
||||||
|
return CMapDB.CMap()
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -19,7 +19,8 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
author_email='yusuke at cs dot nyu dot edu',
|
author_email='yusuke at cs dot nyu dot edu',
|
||||||
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
||||||
packages=[
|
packages=[
|
||||||
'pdfminer'
|
'pdfminer',
|
||||||
|
'pdfminer.cmap'
|
||||||
],
|
],
|
||||||
scripts=[
|
scripts=[
|
||||||
'tools/pdf2txt.py',
|
'tools/pdf2txt.py',
|
||||||
|
|
|
@ -0,0 +1,155 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
import os.path
|
||||||
|
|
||||||
|
def process_cid2code(fp, check_codecs=[]):
|
||||||
|
|
||||||
|
def get_canonicals(name):
|
||||||
|
if name.endswith('-H'):
|
||||||
|
return (name, None)
|
||||||
|
elif name == 'H':
|
||||||
|
return ('H', 'V')
|
||||||
|
else:
|
||||||
|
return (name+'-H', name+'-V')
|
||||||
|
|
||||||
|
def get_unicode(codes):
|
||||||
|
# determine the "most popular" candidate.
|
||||||
|
d = {}
|
||||||
|
for code in codes:
|
||||||
|
char = unicode(code, 'utf-8')
|
||||||
|
if char not in d:
|
||||||
|
d[char] = 0
|
||||||
|
for codec in check_codecs:
|
||||||
|
try:
|
||||||
|
char.encode(codec, 'strict')
|
||||||
|
d[char] += 1
|
||||||
|
except UnicodeError:
|
||||||
|
pass
|
||||||
|
chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
|
||||||
|
return ord(chars[0])
|
||||||
|
|
||||||
|
def put(dmap, code, cid, force=False):
|
||||||
|
for b in code[:-1]:
|
||||||
|
b = ord(b)
|
||||||
|
if b in dmap:
|
||||||
|
dmap = dmap[b]
|
||||||
|
else:
|
||||||
|
d = {}
|
||||||
|
dmap[b] = d
|
||||||
|
dmap = d
|
||||||
|
b = ord(code[-1])
|
||||||
|
if force or ((b not in dmap) or dmap[b] == cid):
|
||||||
|
dmap[b] = cid
|
||||||
|
return
|
||||||
|
|
||||||
|
names = []
|
||||||
|
code2cid = {} # {'cmapname': ...}
|
||||||
|
is_vertical = {}
|
||||||
|
cid2unicode_h = {} # {cid: unicode}
|
||||||
|
cid2unicode_v = {} # {cid: unicode}
|
||||||
|
|
||||||
|
for line in fp:
|
||||||
|
line = line.strip()
|
||||||
|
if line.startswith('#'): continue
|
||||||
|
if line.startswith('CID'):
|
||||||
|
names = line.split('\t')[1:]
|
||||||
|
continue
|
||||||
|
f = line.split('\t')
|
||||||
|
if not f: continue
|
||||||
|
cid = int(f[0])
|
||||||
|
for (x,name) in zip(f[1:], names):
|
||||||
|
if x == '*': continue
|
||||||
|
(hmapname, vmapname) = get_canonicals(name)
|
||||||
|
if hmapname in code2cid:
|
||||||
|
hmap = code2cid[hmapname]
|
||||||
|
else:
|
||||||
|
hmap = {}
|
||||||
|
code2cid[hmapname] = hmap
|
||||||
|
vmap = None
|
||||||
|
if vmapname:
|
||||||
|
is_vertical[vmapname] = True
|
||||||
|
if vmapname in code2cid:
|
||||||
|
vmap = code2cid[vmapname]
|
||||||
|
else:
|
||||||
|
vmap = {}
|
||||||
|
code2cid[vmapname] = vmap
|
||||||
|
hcodes = []
|
||||||
|
vcodes = []
|
||||||
|
for code in x.split(','):
|
||||||
|
vertical = code.endswith('v')
|
||||||
|
if vertical:
|
||||||
|
code = code[:-1]
|
||||||
|
try:
|
||||||
|
code = code.decode('hex')
|
||||||
|
except:
|
||||||
|
code = chr(int(code, 16))
|
||||||
|
if vertical:
|
||||||
|
vcodes.append(code)
|
||||||
|
else:
|
||||||
|
hcodes.append(code)
|
||||||
|
if vcodes:
|
||||||
|
assert vmap is not None
|
||||||
|
for code in vcodes:
|
||||||
|
put(vmap, code, cid, True)
|
||||||
|
for code in hcodes:
|
||||||
|
put(hmap, code, cid, True)
|
||||||
|
if name.endswith('-UTF8'):
|
||||||
|
if hcodes:
|
||||||
|
cid2unicode_h[cid] = get_unicode(hcodes)
|
||||||
|
if vcodes:
|
||||||
|
cid2unicode_v[cid] = get_unicode(vcodes)
|
||||||
|
else:
|
||||||
|
for code in hcodes:
|
||||||
|
put(hmap, code, cid)
|
||||||
|
put(vmap, code, cid)
|
||||||
|
if name.endswith('-UTF8') and hcodes:
|
||||||
|
code = get_unicode(hcodes)
|
||||||
|
if cid not in cid2unicode_h:
|
||||||
|
cid2unicode_h[cid] = code
|
||||||
|
if cid not in cid2unicode_v:
|
||||||
|
cid2unicode_v[cid] = code
|
||||||
|
|
||||||
|
return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
|
||||||
|
|
||||||
|
# main
|
||||||
|
def main(argv):
|
||||||
|
|
||||||
|
def usage():
|
||||||
|
print 'usage: %s output_dir regname cid2code.txt codecs ...' % argv[0]
|
||||||
|
return 100
|
||||||
|
|
||||||
|
def pyname(name):
|
||||||
|
return name.replace('-','_')+'.py'
|
||||||
|
|
||||||
|
args = argv[1:]
|
||||||
|
if len(args) < 3: return usage()
|
||||||
|
(outdir, regname, src) = args[:3]
|
||||||
|
check_codecs = args[3:]
|
||||||
|
|
||||||
|
print >>sys.stderr, 'reading %r...' % src
|
||||||
|
fp = file(src)
|
||||||
|
(code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
for (name, cmap) in code2cid.iteritems():
|
||||||
|
fname = pyname(name)
|
||||||
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
|
fp = file(os.path.join(outdir, fname), 'w')
|
||||||
|
print >>fp, '#!/usr/bin/env python'
|
||||||
|
print >>fp, '#', fname
|
||||||
|
print >>fp, 'IS_VERTICAL = %r' % is_vertical.get(name, False)
|
||||||
|
print >>fp, 'CODE2CID = %r' % cmap
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
fname = 'TO_UNICODE_'+pyname(regname)
|
||||||
|
print >>sys.stderr, 'writing %r...' % fname
|
||||||
|
fp = file(os.path.join(outdir, fname), 'w')
|
||||||
|
print >>fp, '#!/usr/bin/env python'
|
||||||
|
print >>fp, '#', fname
|
||||||
|
print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
|
||||||
|
print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
|
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||||
from pdfminer.cmapdb import CMapDB, find_cmap_path
|
from pdfminer.cmapdb import CMapDB
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
# main
|
# main
|
||||||
|
@ -22,8 +22,6 @@ def main(argv):
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
# debug option
|
# debug option
|
||||||
debug = 0
|
debug = 0
|
||||||
# path option
|
|
||||||
cmapdir = find_cmap_path()
|
|
||||||
# input option
|
# input option
|
||||||
password = ''
|
password = ''
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
|
@ -38,7 +36,6 @@ def main(argv):
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-C': cmapdir = v
|
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-m': maxpages = int(v)
|
elif k == '-m': maxpages = int(v)
|
||||||
|
@ -59,8 +56,7 @@ def main(argv):
|
||||||
PDFPageInterpreter.debug = debug
|
PDFPageInterpreter.debug = debug
|
||||||
PDFDevice.debug = debug
|
PDFDevice.debug = debug
|
||||||
#
|
#
|
||||||
cmapdb = CMapDB(cmapdir)
|
rsrc = PDFResourceManager()
|
||||||
rsrc = PDFResourceManager(cmapdb)
|
|
||||||
if not outtype:
|
if not outtype:
|
||||||
outtype = 'text'
|
outtype = 'text'
|
||||||
if outfile:
|
if outfile:
|
||||||
|
|
Loading…
Reference in New Issue