440 lines
13 KiB
Python
440 lines
13 KiB
Python
""" Adobe character mapping (CMap) support.
|
|
|
|
CMaps provide the mapping between character codes and Unicode
|
|
code-points to character ids (CIDs).
|
|
|
|
More information is available on the Adobe website:
|
|
|
|
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
|
|
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import os.path
|
|
import gzip
|
|
import pickle as pickle
|
|
import struct
|
|
import logging
|
|
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List,
|
|
MutableMapping, Optional, TextIO, Tuple, Union, cast)
|
|
from .psparser import PSStackParser
|
|
from .psparser import PSSyntaxError
|
|
from .psparser import PSEOF
|
|
from .psparser import PSKeyword
|
|
from .psparser import PSLiteral
|
|
from .psparser import literal_name
|
|
from .psparser import KWD
|
|
from .encodingdb import name2unicode
|
|
from .utils import choplist
|
|
from .utils import nunpack
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
class CMapError(Exception):
|
|
pass
|
|
|
|
|
|
class CMapBase:
|
|
|
|
debug = 0
|
|
|
|
def __init__(self, **kwargs: object) -> None:
|
|
self.attrs: MutableMapping[str, object] = kwargs.copy()
|
|
return
|
|
|
|
def is_vertical(self) -> bool:
|
|
return self.attrs.get('WMode', 0) != 0
|
|
|
|
def set_attr(self, k: str, v: object) -> None:
|
|
self.attrs[k] = v
|
|
return
|
|
|
|
def add_code2cid(self, code: str, cid: int) -> None:
|
|
return
|
|
|
|
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
|
) -> None:
|
|
return
|
|
|
|
def use_cmap(self, cmap: "CMapBase") -> None:
|
|
return
|
|
|
|
def decode(self, code: bytes) -> Iterable[int]:
|
|
raise NotImplementedError
|
|
|
|
|
|
class CMap(CMapBase):
|
|
|
|
def __init__(self, **kwargs: Union[str, int]) -> None:
|
|
CMapBase.__init__(self, **kwargs)
|
|
self.code2cid: Dict[int, object] = {}
|
|
return
|
|
|
|
def __repr__(self) -> str:
|
|
return '<CMap: %s>' % self.attrs.get('CMapName')
|
|
|
|
def use_cmap(self, cmap: CMapBase) -> None:
|
|
assert isinstance(cmap, CMap), str(type(cmap))
|
|
|
|
def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
|
|
for (k, v) in src.items():
|
|
if isinstance(v, dict):
|
|
d: Dict[int, object] = {}
|
|
dst[k] = d
|
|
copy(d, v)
|
|
else:
|
|
dst[k] = v
|
|
copy(self.code2cid, cmap.code2cid)
|
|
return
|
|
|
|
def decode(self, code: bytes) -> Iterator[int]:
|
|
log.debug('decode: %r, %r', self, code)
|
|
d = self.code2cid
|
|
for i in iter(code):
|
|
if i in d:
|
|
x = d[i]
|
|
if isinstance(x, int):
|
|
yield x
|
|
d = self.code2cid
|
|
else:
|
|
d = cast(Dict[int, object], x)
|
|
else:
|
|
d = self.code2cid
|
|
return
|
|
|
|
def dump(self, out: TextIO = sys.stdout,
|
|
code2cid: Optional[Dict[int, object]] = None,
|
|
code: Tuple[int, ...] = ()) -> None:
|
|
if code2cid is None:
|
|
code2cid = self.code2cid
|
|
code = ()
|
|
for (k, v) in sorted(code2cid.items()):
|
|
c = code+(k,)
|
|
if isinstance(v, int):
|
|
out.write('code %r = cid %d\n' % (c, v))
|
|
else:
|
|
self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
|
|
return
|
|
|
|
|
|
class IdentityCMap(CMapBase):
|
|
|
|
def decode(self, code: bytes) -> Tuple[int, ...]:
|
|
n = len(code)//2
|
|
if n:
|
|
return struct.unpack('>%dH' % n, code)
|
|
else:
|
|
return ()
|
|
|
|
|
|
class IdentityCMapByte(IdentityCMap):
|
|
|
|
def decode(self, code: bytes) -> Tuple[int, ...]:
|
|
n = len(code)
|
|
if n:
|
|
return struct.unpack('>%dB' % n, code)
|
|
else:
|
|
return ()
|
|
|
|
|
|
class UnicodeMap(CMapBase):
|
|
|
|
def __init__(self, **kwargs: Union[str, int]) -> None:
|
|
CMapBase.__init__(self, **kwargs)
|
|
self.cid2unichr: Dict[int, str] = {}
|
|
return
|
|
|
|
def __repr__(self) -> str:
|
|
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
|
|
|
def get_unichr(self, cid: int) -> str:
|
|
log.debug('get_unichr: %r, %r', self, cid)
|
|
return self.cid2unichr[cid]
|
|
|
|
def dump(self, out: TextIO = sys.stdout) -> None:
|
|
for (k, v) in sorted(self.cid2unichr.items()):
|
|
out.write('cid %d = unicode %r\n' % (k, v))
|
|
return
|
|
|
|
|
|
class FileCMap(CMap):
|
|
|
|
def add_code2cid(self, code: str, cid: int) -> None:
|
|
assert isinstance(code, str) and isinstance(cid, int),\
|
|
str((type(code), type(cid)))
|
|
d = self.code2cid
|
|
for c in code[:-1]:
|
|
ci = ord(c)
|
|
if ci in d:
|
|
d = cast(Dict[int, object], d[ci])
|
|
else:
|
|
t: Dict[int, object] = {}
|
|
d[ci] = t
|
|
d = t
|
|
ci = ord(code[-1])
|
|
d[ci] = cid
|
|
return
|
|
|
|
|
|
class FileUnicodeMap(UnicodeMap):
|
|
|
|
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
|
) -> None:
|
|
assert isinstance(cid, int), str(type(cid))
|
|
if isinstance(code, PSLiteral):
|
|
# Interpret as an Adobe glyph name.
|
|
assert isinstance(code.name, str)
|
|
self.cid2unichr[cid] = name2unicode(code.name)
|
|
elif isinstance(code, bytes):
|
|
# Interpret as UTF-16BE.
|
|
self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
|
|
elif isinstance(code, int):
|
|
self.cid2unichr[cid] = chr(code)
|
|
else:
|
|
raise TypeError(code)
|
|
return
|
|
|
|
|
|
class PyCMap(CMap):
|
|
|
|
def __init__(self, name: str, module: Any) -> None:
|
|
super().__init__(CMapName=name)
|
|
self.code2cid = module.CODE2CID
|
|
if module.IS_VERTICAL:
|
|
self.attrs['WMode'] = 1
|
|
return
|
|
|
|
|
|
class PyUnicodeMap(UnicodeMap):
|
|
|
|
def __init__(self, name: str, module: Any, vertical: bool) -> None:
|
|
super().__init__(CMapName=name)
|
|
if vertical:
|
|
self.cid2unichr = module.CID2UNICHR_V
|
|
self.attrs['WMode'] = 1
|
|
else:
|
|
self.cid2unichr = module.CID2UNICHR_H
|
|
return
|
|
|
|
|
|
class CMapDB:
|
|
|
|
_cmap_cache: Dict[str, PyCMap] = {}
|
|
_umap_cache: Dict[str, List[PyUnicodeMap]] = {}
|
|
|
|
class CMapNotFound(CMapError):
|
|
pass
|
|
|
|
@classmethod
|
|
def _load_data(cls, name: str) -> Any:
|
|
name = name.replace("\0", "")
|
|
filename = '%s.pickle.gz' % name
|
|
log.info('loading: %r', name)
|
|
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
|
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
|
for directory in cmap_paths:
|
|
path = os.path.join(directory, filename)
|
|
if os.path.exists(path):
|
|
gzfile = gzip.open(path)
|
|
try:
|
|
return type(str(name), (), pickle.loads(gzfile.read()))
|
|
finally:
|
|
gzfile.close()
|
|
else:
|
|
raise CMapDB.CMapNotFound(name)
|
|
|
|
@classmethod
|
|
def get_cmap(cls, name: str) -> CMapBase:
|
|
if name == 'Identity-H':
|
|
return IdentityCMap(WMode=0)
|
|
elif name == 'Identity-V':
|
|
return IdentityCMap(WMode=1)
|
|
elif name == 'OneByteIdentityH':
|
|
return IdentityCMapByte(WMode=0)
|
|
elif name == 'OneByteIdentityV':
|
|
return IdentityCMapByte(WMode=1)
|
|
try:
|
|
return cls._cmap_cache[name]
|
|
except KeyError:
|
|
pass
|
|
data = cls._load_data(name)
|
|
cls._cmap_cache[name] = cmap = PyCMap(name, data)
|
|
return cmap
|
|
|
|
@classmethod
|
|
def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
|
|
try:
|
|
return cls._umap_cache[name][vertical]
|
|
except KeyError:
|
|
pass
|
|
data = cls._load_data('to-unicode-%s' % name)
|
|
cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
|
|
for v in (False, True)]
|
|
return cls._umap_cache[name][vertical]
|
|
|
|
|
|
class CMapParser(PSStackParser[PSKeyword]):
|
|
|
|
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
|
|
PSStackParser.__init__(self, fp)
|
|
self.cmap = cmap
|
|
# some ToUnicode maps don't have "begincmap" keyword.
|
|
self._in_cmap = True
|
|
return
|
|
|
|
def run(self) -> None:
|
|
try:
|
|
self.nextobject()
|
|
except PSEOF:
|
|
pass
|
|
return
|
|
|
|
KEYWORD_BEGINCMAP = KWD(b'begincmap')
|
|
KEYWORD_ENDCMAP = KWD(b'endcmap')
|
|
KEYWORD_USECMAP = KWD(b'usecmap')
|
|
KEYWORD_DEF = KWD(b'def')
|
|
KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange')
|
|
KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange')
|
|
KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange')
|
|
KEYWORD_ENDCIDRANGE = KWD(b'endcidrange')
|
|
KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar')
|
|
KEYWORD_ENDCIDCHAR = KWD(b'endcidchar')
|
|
KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange')
|
|
KEYWORD_ENDBFRANGE = KWD(b'endbfrange')
|
|
KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar')
|
|
KEYWORD_ENDBFCHAR = KWD(b'endbfchar')
|
|
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
|
|
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
|
|
|
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
|
if token is self.KEYWORD_BEGINCMAP:
|
|
self._in_cmap = True
|
|
self.popall()
|
|
return
|
|
elif token is self.KEYWORD_ENDCMAP:
|
|
self._in_cmap = False
|
|
return
|
|
if not self._in_cmap:
|
|
return
|
|
#
|
|
if token is self.KEYWORD_DEF:
|
|
try:
|
|
((_, k), (_, v)) = self.pop(2)
|
|
self.cmap.set_attr(literal_name(k), v)
|
|
except PSSyntaxError:
|
|
pass
|
|
return
|
|
|
|
if token is self.KEYWORD_USECMAP:
|
|
try:
|
|
((_, cmapname),) = self.pop(1)
|
|
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
|
|
except PSSyntaxError:
|
|
pass
|
|
except CMapDB.CMapNotFound:
|
|
pass
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINCODESPACERANGE:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDCODESPACERANGE:
|
|
self.popall()
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINCIDRANGE:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDCIDRANGE:
|
|
objs = [obj for (__, obj) in self.popall()]
|
|
for (s, e, cid) in choplist(3, objs):
|
|
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
|
|
not isinstance(cid, int) or len(s) != len(e)):
|
|
continue
|
|
sprefix = s[:-4]
|
|
eprefix = e[:-4]
|
|
if sprefix != eprefix:
|
|
continue
|
|
svar = s[-4:]
|
|
evar = e[-4:]
|
|
s1 = nunpack(svar)
|
|
e1 = nunpack(evar)
|
|
vlen = len(svar)
|
|
for i in range(e1-s1+1):
|
|
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
|
self.cmap.add_cid2unichr(cid+i, x)
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINCIDCHAR:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDCIDCHAR:
|
|
objs = [obj for (__, obj) in self.popall()]
|
|
for (cid, code) in choplist(2, objs):
|
|
if isinstance(code, bytes) and isinstance(cid, int):
|
|
self.cmap.add_cid2unichr(cid, code)
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINBFRANGE:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDBFRANGE:
|
|
objs = [obj for (__, obj) in self.popall()]
|
|
for (s, e, code) in choplist(3, objs):
|
|
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
|
|
len(s) != len(e)):
|
|
continue
|
|
s1 = nunpack(s)
|
|
e1 = nunpack(e)
|
|
if isinstance(code, list):
|
|
for i in range(e1-s1+1):
|
|
self.cmap.add_cid2unichr(s1+i, code[i])
|
|
else:
|
|
assert isinstance(code, bytes)
|
|
var = code[-4:]
|
|
base = nunpack(var)
|
|
prefix = code[:-4]
|
|
vlen = len(var)
|
|
for i in range(e1-s1+1):
|
|
x = prefix+struct.pack('>L', base+i)[-vlen:]
|
|
self.cmap.add_cid2unichr(s1+i, x)
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINBFCHAR:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDBFCHAR:
|
|
objs = [obj for (__, obj) in self.popall()]
|
|
for (cid, code) in choplist(2, objs):
|
|
if isinstance(cid, bytes) and isinstance(code, bytes):
|
|
self.cmap.add_cid2unichr(nunpack(cid), code)
|
|
return
|
|
|
|
if token is self.KEYWORD_BEGINNOTDEFRANGE:
|
|
self.popall()
|
|
return
|
|
if token is self.KEYWORD_ENDNOTDEFRANGE:
|
|
self.popall()
|
|
return
|
|
|
|
self.push((pos, token))
|
|
return
|
|
|
|
|
|
def main(argv: List[str]) -> None:
|
|
args = argv[1:]
|
|
for fname in args:
|
|
fp = open(fname, 'rb')
|
|
cmap = FileUnicodeMap()
|
|
CMapParser(cmap, fp).run()
|
|
fp.close()
|
|
cmap.dump()
|
|
return
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main(sys.argv)
|