""" Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode code-points to character ids (CIDs). More information is available on the Adobe website: http://opensource.adobe.com/wiki/display/cmap/CMap+Resources """ import gzip import logging import os import os.path import pickle as pickle import struct import sys from typing import ( Any, BinaryIO, Dict, Iterable, Iterator, List, MutableMapping, Optional, TextIO, Tuple, Union, cast, Set, ) from .encodingdb import name2unicode from .psparser import KWD from .psparser import PSEOF from .psparser import PSKeyword from .psparser import PSLiteral from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import literal_name from .utils import choplist from .utils import nunpack log = logging.getLogger(__name__) class CMapError(Exception): pass class CMapBase: debug = 0 def __init__(self, **kwargs: object) -> None: self.attrs: MutableMapping[str, object] = kwargs.copy() def is_vertical(self) -> bool: return self.attrs.get("WMode", 0) != 0 def set_attr(self, k: str, v: object) -> None: self.attrs[k] = v def add_code2cid(self, code: str, cid: int) -> None: pass def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: pass def use_cmap(self, cmap: "CMapBase") -> None: pass def decode(self, code: bytes) -> Iterable[int]: raise NotImplementedError class CMap(CMapBase): def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) self.code2cid: Dict[int, object] = {} def __repr__(self) -> str: return "" % self.attrs.get("CMapName") def use_cmap(self, cmap: CMapBase) -> None: assert isinstance(cmap, CMap), str(type(cmap)) def copy(dst: Dict[int, object], src: Dict[int, object]) -> None: for (k, v) in src.items(): if isinstance(v, dict): d: Dict[int, object] = {} dst[k] = d copy(d, v) else: dst[k] = v copy(self.code2cid, cmap.code2cid) def decode(self, code: bytes) -> Iterator[int]: log.debug("decode: %r, %r", self, code) d = self.code2cid for i in iter(code): if i in d: x = d[i] if isinstance(x, int): yield x d = self.code2cid else: d = cast(Dict[int, object], x) else: d = self.code2cid def dump( self, out: TextIO = sys.stdout, code2cid: Optional[Dict[int, object]] = None, code: Tuple[int, ...] = (), ) -> None: if code2cid is None: code2cid = self.code2cid code = () for (k, v) in sorted(code2cid.items()): c = code + (k,) if isinstance(v, int): out.write("code %r = cid %d\n" % (c, v)) else: self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c) class IdentityCMap(CMapBase): def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) // 2 if n: return struct.unpack(">%dH" % n, code) else: return () class IdentityCMapByte(IdentityCMap): def decode(self, code: bytes) -> Tuple[int, ...]: n = len(code) if n: return struct.unpack(">%dB" % n, code) else: return () class UnicodeMap(CMapBase): def __init__(self, **kwargs: Union[str, int]) -> None: CMapBase.__init__(self, **kwargs) self.cid2unichr: Dict[int, str] = {} def __repr__(self) -> str: return "" % self.attrs.get("CMapName") def get_unichr(self, cid: int) -> str: log.debug("get_unichr: %r, %r", self, cid) return self.cid2unichr[cid] def dump(self, out: TextIO = sys.stdout) -> None: for (k, v) in sorted(self.cid2unichr.items()): out.write("cid %d = unicode %r\n" % (k, v)) class IdentityUnicodeMap(UnicodeMap): def get_unichr(self, cid: int) -> str: """Interpret character id as unicode codepoint""" log.debug("get_unichr: %r, %r", self, cid) return chr(cid) class FileCMap(CMap): def add_code2cid(self, code: str, cid: int) -> None: assert isinstance(code, str) and isinstance(cid, int), str( (type(code), type(cid)) ) d = self.code2cid for c in code[:-1]: ci = ord(c) if ci in d: d = cast(Dict[int, object], d[ci]) else: t: Dict[int, object] = {} d[ci] = t d = t ci = ord(code[-1]) d[ci] = cid class FileUnicodeMap(UnicodeMap): def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: assert isinstance(cid, int), str(type(cid)) if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. assert isinstance(code.name, str) self.cid2unichr[cid] = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore") elif isinstance(code, int): self.cid2unichr[cid] = chr(code) else: raise TypeError(code) class PyCMap(CMap): def __init__(self, name: str, module: Any) -> None: super().__init__(CMapName=name) self.code2cid = module.CODE2CID if module.IS_VERTICAL: self.attrs["WMode"] = 1 class PyUnicodeMap(UnicodeMap): def __init__(self, name: str, module: Any, vertical: bool) -> None: super().__init__(CMapName=name) if vertical: self.cid2unichr = module.CID2UNICHR_V self.attrs["WMode"] = 1 else: self.cid2unichr = module.CID2UNICHR_H class CMapDB: _cmap_cache: Dict[str, PyCMap] = {} _umap_cache: Dict[str, List[PyUnicodeMap]] = {} class CMapNotFound(CMapError): pass @classmethod def _load_data(cls, name: str) -> Any: name = name.replace("\0", "") filename = "%s.pickle.gz" % name log.debug("loading: %r", name) cmap_paths = ( os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"), os.path.join(os.path.dirname(__file__), "cmap"), ) for directory in cmap_paths: path = os.path.join(directory, filename) if os.path.exists(path): gzfile = gzip.open(path) try: return type(str(name), (), pickle.loads(gzfile.read())) finally: gzfile.close() else: raise CMapDB.CMapNotFound(name) @classmethod def get_cmap(cls, name: str) -> CMapBase: if name == "Identity-H": return IdentityCMap(WMode=0) elif name == "Identity-V": return IdentityCMap(WMode=1) elif name == "OneByteIdentityH": return IdentityCMapByte(WMode=0) elif name == "OneByteIdentityV": return IdentityCMapByte(WMode=1) try: return cls._cmap_cache[name] except KeyError: pass data = cls._load_data(name) cls._cmap_cache[name] = cmap = PyCMap(name, data) return cmap @classmethod def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: try: return cls._umap_cache[name][vertical] except KeyError: pass data = cls._load_data("to-unicode-%s" % name) cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)] return cls._umap_cache[name][vertical] class CMapParser(PSStackParser[PSKeyword]): def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. self._in_cmap = True self._warnings: Set[str] = set() return def run(self) -> None: try: self.nextobject() except PSEOF: pass return KEYWORD_BEGINCMAP = KWD(b"begincmap") KEYWORD_ENDCMAP = KWD(b"endcmap") KEYWORD_USECMAP = KWD(b"usecmap") KEYWORD_DEF = KWD(b"def") KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange") KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange") KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange") KEYWORD_ENDCIDRANGE = KWD(b"endcidrange") KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar") KEYWORD_ENDCIDCHAR = KWD(b"endcidchar") KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange") KEYWORD_ENDBFRANGE = KWD(b"endbfrange") KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar") KEYWORD_ENDBFCHAR = KWD(b"endbfchar") KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange") KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange") def do_keyword(self, pos: int, token: PSKeyword) -> None: """ToUnicode CMaps See Section 5.9.2 - ToUnicode CMaps of the PDF Reference. """ if token is self.KEYWORD_BEGINCMAP: self._in_cmap = True self.popall() return elif token is self.KEYWORD_ENDCMAP: self._in_cmap = False return if not self._in_cmap: return if token is self.KEYWORD_DEF: try: ((_, k), (_, v)) = self.pop(2) self.cmap.set_attr(literal_name(k), v) except PSSyntaxError: pass return if token is self.KEYWORD_USECMAP: try: ((_, cmapname),) = self.pop(1) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) except PSSyntaxError: pass except CMapDB.CMapNotFound: pass return if token is self.KEYWORD_BEGINCODESPACERANGE: self.popall() return if token is self.KEYWORD_ENDCODESPACERANGE: self.popall() return if token is self.KEYWORD_BEGINCIDRANGE: self.popall() return if token is self.KEYWORD_ENDCIDRANGE: objs = [obj for (__, obj) in self.popall()] for (start_byte, end_byte, cid) in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object of begincidrange is not a byte.") continue if not isinstance(end_byte, bytes): self._warn_once("The end object of begincidrange is not a byte.") continue if not isinstance(cid, int): self._warn_once("The cid object of begincidrange is not a byte.") continue if len(start_byte) != len(end_byte): self._warn_once( "The start and end byte of begincidrange have " "different lengths." ) continue start_prefix = start_byte[:-4] end_prefix = end_byte[:-4] if start_prefix != end_prefix: self._warn_once( "The prefix of the start and end byte of " "begincidrange are not the same." ) continue svar = start_byte[-4:] evar = end_byte[-4:] start = nunpack(svar) end = nunpack(evar) vlen = len(svar) for i in range(end - start + 1): x = start_prefix + struct.pack(">L", start + i)[-vlen:] self.cmap.add_cid2unichr(cid + i, x) return if token is self.KEYWORD_BEGINCIDCHAR: self.popall() return if token is self.KEYWORD_ENDCIDCHAR: objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): if isinstance(code, bytes) and isinstance(cid, int): self.cmap.add_cid2unichr(cid, code) return if token is self.KEYWORD_BEGINBFRANGE: self.popall() return if token is self.KEYWORD_ENDBFRANGE: objs = [obj for (__, obj) in self.popall()] for (start_byte, end_byte, code) in choplist(3, objs): if not isinstance(start_byte, bytes): self._warn_once("The start object is not a byte.") continue if not isinstance(end_byte, bytes): self._warn_once("The end object is not a byte.") continue if len(start_byte) != len(end_byte): self._warn_once("The start and end byte have different lengths.") continue start = nunpack(start_byte) end = nunpack(end_byte) if isinstance(code, list): if len(code) != end - start + 1: self._warn_once( "The difference between the start and end " "offsets does not match the code length." ) for cid, unicode_value in zip(range(start, end + 1), code): self.cmap.add_cid2unichr(cid, unicode_value) else: assert isinstance(code, bytes) var = code[-4:] base = nunpack(var) prefix = code[:-4] vlen = len(var) for i in range(end - start + 1): x = prefix + struct.pack(">L", base + i)[-vlen:] self.cmap.add_cid2unichr(start + i, x) return if token is self.KEYWORD_BEGINBFCHAR: self.popall() return if token is self.KEYWORD_ENDBFCHAR: objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): if isinstance(cid, bytes) and isinstance(code, bytes): self.cmap.add_cid2unichr(nunpack(cid), code) return if token is self.KEYWORD_BEGINNOTDEFRANGE: self.popall() return if token is self.KEYWORD_ENDNOTDEFRANGE: self.popall() return self.push((pos, token)) def _warn_once(self, msg: str) -> None: """Warn once for each unique message""" if msg not in self._warnings: self._warnings.add(msg) base_msg = ( "Ignoring (part of) ToUnicode map because the PDF data " "does not conform to the format. This could result in " "(cid) values in the output. " ) log.warning(base_msg + msg) def main(argv: List[str]) -> None: from warnings import warn warn( "The function main() from cmapdb.py will be removed in 2023. It was probably " "introduced for testing purposes a long time ago, and no longer relevant. " "Feel free to create a GitHub issue if you disagree.", DeprecationWarning, ) args = argv[1:] for fname in args: fp = open(fname, "rb") cmap = FileUnicodeMap() CMapParser(cmap, fp).run() fp.close() cmap.dump() return if __name__ == "__main__": main(sys.argv)