diff --git a/CHANGELOG.md b/CHANGELOG.md index f0b8c6a..a580bca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [Unreleased] + +### Added +- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626)) + ## [20211012] ### Added diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 853d877..6974c1c 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -160,6 +160,13 @@ class UnicodeMap(CMapBase): return +class IdentityUnicodeMap(UnicodeMap): + def get_unichr(self, cid: int) -> str: + """Interpret character id as unicode codepoint""" + log.debug('get_unichr: %r, %r', self, cid) + return chr(cid) + + class FileCMap(CMap): def add_code2cid(self, code: str, cid: int) -> None: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index df0813d..00e325e 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping, from . 
import settings from .cmapdb import CMap +from .cmapdb import IdentityUnicodeMap from .cmapdb import CMapBase from .cmapdb import CMapDB from .cmapdb import CMapParser @@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont): BytesIO(self.fontfile.get_data())) self.unicode_map: Optional[UnicodeMap] = None if 'ToUnicode' in spec: - strm = stream_value(spec['ToUnicode']) - self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + if isinstance(spec['ToUnicode'], PDFStream): + strm = stream_value(spec['ToUnicode']) + self.unicode_map = FileUnicodeMap() + CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + else: + cmap_name = literal_name(spec['ToUnicode']) + encoding = literal_name(spec['Encoding']) + if 'Identity' in cid_ordering \ + or 'Identity' in cmap_name \ + or 'Identity' in encoding: + self.unicode_map = IdentityUnicodeMap() elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'): if ttf: try: diff --git a/samples/contrib/issue-625-identity-cmap.pdf b/samples/contrib/issue-625-identity-cmap.pdf new file mode 100644 index 0000000..eb0980a Binary files /dev/null and b/samples/contrib/issue-625-identity-cmap.pdf differ diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 90c9764..3c9f991 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -33,6 +33,7 @@ test_strings = { "simple4.pdf": "Text1\nText2\nText3\n\n\f", "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", + "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03", } @@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase): s = run_with_file(test_file) self.assertEqual(s.strip(), test_strings[test_file]) + def test_issue_625_identity_cmap(self): + test_file = "contrib/issue-625-identity-cmap.pdf" + lines = run_with_file(test_file).splitlines() + + self.assertEqual(lines[6], test_strings[test_file]) + 
class TestExtractPages(unittest.TestCase): def _get_test_file_path(self):