Add support for identity Unicode cmap (#626)

Fixes #625 * add support for Identity-H/V cmap fonts * format code to pass flake8 check * Remove indent * Remove indent * Use isinstance instead of type check * Use or instead of any * Use str in variable, instead of str.find() * Fix mypy error: add typing annotations to get_unichr() * Fix type of PDFCIDFont. Can be any type of CMapBase. This is a quick fix; the entire cmap structure does not have proper inheritance. * Added line to CHANGELOG.md * Add separate class for IdentityUnicodeMap * Remove ABC from CMapBase * Remove ABC from CMapBase * Remove blank line Co-authored-by: huan_cheng <huan_cheng@bestsign.cn> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2021-10-14 03:52:00 +08:00 · 2021-10-14 03:52:00 +08:00 · c883f5e13f
parent da5b96828e
commit c883f5e13f
5 changed files with 31 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [Unreleased]
+
+### Added
+- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
+
 ## [20211012]

 ### Added
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -160,6 +160,13 @@ class UnicodeMap(CMapBase):
        return


+class IdentityUnicodeMap(UnicodeMap):
+    def get_unichr(self, cid: int) -> str:
+        """Return the character whose Unicode code point is the given cid."""
+        log.debug('get_unichr: %r, %r', self, cid)
+        return chr(cid)
+
+
 class FileCMap(CMap):

    def add_code2cid(self, code: str, cid: int) -> None:
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,

 from . import settings
 from .cmapdb import CMap
+from .cmapdb import IdentityUnicodeMap
 from .cmapdb import CMapBase
 from .cmapdb import CMapDB
 from .cmapdb import CMapParser
@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont):
                               BytesIO(self.fontfile.get_data()))
        self.unicode_map: Optional[UnicodeMap] = None
        if 'ToUnicode' in spec:
+            if isinstance(spec['ToUnicode'], PDFStream):
                strm = stream_value(spec['ToUnicode'])
                self.unicode_map = FileUnicodeMap()
                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
+            else:
+                cmap_name = literal_name(spec['ToUnicode'])
+                encoding = literal_name(spec['Encoding'])
+                if 'Identity' in cid_ordering \
+                        or 'Identity' in cmap_name \
+                        or 'Identity' in encoding:
+                    self.unicode_map = IdentityUnicodeMap()
        elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
            if ttf:
                try:
--- a/samples/contrib/issue-625-identity-cmap.pdf
+++ b/samples/contrib/issue-625-identity-cmap.pdf
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -33,6 +33,7 @@ test_strings = {
    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
+    "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
 }


@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase):
        s = run_with_file(test_file)
        self.assertEqual(s.strip(), test_strings[test_file])

+    def test_issue_625_identity_cmap(self):
+        test_file = "contrib/issue-625-identity-cmap.pdf"
+        lines = run_with_file(test_file).splitlines()
+        # Only line index 6 of the extracted text is compared.
+        self.assertEqual(lines[6], test_strings[test_file])
+

 class TestExtractPages(unittest.TestCase):
    def _get_test_file_path(self):