Add support for identity unicode cmap (#626)

Fixes #625 

* add support for Identity-H/V cmap fonts

* format code to pass flake8 check

* Remove indent

* Remove indent

* Use isinstance instead of type check

* Use or instead of any

* Use str in variable, instead of str.find()

* Fix mypy error: add typing annotations to get_unichr()

* Fix type of PDFCIDFont. Can be any type of CMapBase.

This is a quick fix; the entire cmap structure does not have proper inheritance.

* Added line to CHANGELOG.md

* Add separate class for IdentityUnicodeMap

* Remove ABC from CmapBase

* Remove ABC from CmapBase

* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/700/head^2
wind_chh 2021-10-14 03:52:00 +08:00 committed by GitHub
parent da5b96828e
commit c883f5e13f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 3 deletions

View File

@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Added
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
## [20211012] ## [20211012]
### Added ### Added

View File

@ -160,6 +160,13 @@ class UnicodeMap(CMapBase):
return return
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats a character id as a Unicode code point."""

    def get_unichr(self, cid: int) -> str:
        """Return the character whose Unicode code point is ``cid``.

        The call is logged at debug level before the conversion happens.
        """
        log.debug('get_unichr: %r, %r', self, cid)
        unichr = chr(cid)
        return unichr
class FileCMap(CMap): class FileCMap(CMap):
def add_code2cid(self, code: str, cid: int) -> None: def add_code2cid(self, code: str, cid: int) -> None:

View File

@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,
from . import settings from . import settings
from .cmapdb import CMap from .cmapdb import CMap
from .cmapdb import IdentityUnicodeMap
from .cmapdb import CMapBase from .cmapdb import CMapBase
from .cmapdb import CMapDB from .cmapdb import CMapDB
from .cmapdb import CMapParser from .cmapdb import CMapParser
@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont):
BytesIO(self.fontfile.get_data())) BytesIO(self.fontfile.get_data()))
self.unicode_map: Optional[UnicodeMap] = None self.unicode_map: Optional[UnicodeMap] = None
if 'ToUnicode' in spec: if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode']) if isinstance(spec['ToUnicode'], PDFStream):
self.unicode_map = FileUnicodeMap() strm = stream_value(spec['ToUnicode'])
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
else:
cmap_name = literal_name(spec['ToUnicode'])
encoding = literal_name(spec['Encoding'])
if 'Identity' in cid_ordering \
or 'Identity' in cmap_name \
or 'Identity' in encoding:
self.unicode_map = IdentityUnicodeMap()
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'): elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
if ttf: if ttf:
try: try:

Binary file not shown.

View File

@ -33,6 +33,7 @@ test_strings = {
"simple4.pdf": "Text1\nText2\nText3\n\n\f", "simple4.pdf": "Text1\nText2\nText3\n\n\f",
"contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣", "contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
} }
@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase):
s = run_with_file(test_file) s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file]) self.assertEqual(s.strip(), test_strings[test_file])
def test_issue_625_identity_cmap(self):
    # Regression check for issue 625: extract the text of the sample PDF
    # and compare one specific output line with the expected string.
    pdf_name = "contrib/issue-625-identity-cmap.pdf"
    extracted_text = run_with_file(pdf_name)
    # Only the seventh line of the output (index 6) is compared.
    seventh_line = extracted_text.splitlines()[6]
    self.assertEqual(seventh_line, test_strings[pdf_name])
class TestExtractPages(unittest.TestCase): class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self): def _get_test_file_path(self):