Add support for identity unicode cmap (#626)

Fixes #625 

* add support for Identity-H/V cmap fonts

* format code to pass flake8 check

* Remove indent

* Remove indent

* Use isinstance instead of type check

* Use or instead of any

* Use str in variable, instead of str.find()

* Fix mypy error: add typing annotations to get_unichr()

* Fix type of PDFCIDFont. Can be any type of CMapBase.

This is a quick fix; the entire cmap structure does not have proper inheritance.

* Added line to CHANGELOG.md

* Add separate class for IdentityUnicodeMap

* Remove ABC from CmapBase

* Remove ABC from CmapBase

* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/700/head^2
wind_chh 2021-10-14 03:52:00 +08:00 committed by GitHub
parent da5b96828e
commit c883f5e13f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 3 deletions

View File

@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Added
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
## [20211012]
### Added

View File

@ -160,6 +160,13 @@ class UnicodeMap(CMapBase):
return
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats each character id as the code point itself."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint.

        :param cid: character id, used directly as the Unicode code point.
        :return: the single-character string ``chr(cid)``.
        """
        log.debug('get_unichr: %r, %r', self, cid)
        # No lookup table is consulted: the id maps onto itself.
        unichr = chr(cid)
        return unichr
class FileCMap(CMap):
def add_code2cid(self, code: str, cid: int) -> None:

View File

@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,
from . import settings
from .cmapdb import CMap
from .cmapdb import IdentityUnicodeMap
from .cmapdb import CMapBase
from .cmapdb import CMapDB
from .cmapdb import CMapParser
@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont):
BytesIO(self.fontfile.get_data()))
self.unicode_map: Optional[UnicodeMap] = None
if 'ToUnicode' in spec:
if isinstance(spec['ToUnicode'], PDFStream):
strm = stream_value(spec['ToUnicode'])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
else:
cmap_name = literal_name(spec['ToUnicode'])
encoding = literal_name(spec['Encoding'])
if 'Identity' in cid_ordering \
or 'Identity' in cmap_name \
or 'Identity' in encoding:
self.unicode_map = IdentityUnicodeMap()
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
if ttf:
try:

Binary file not shown.

View File

@ -33,6 +33,7 @@ test_strings = {
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
"contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
}
@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase):
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])
def test_issue_625_identity_cmap(self):
    """Check the seventh extracted line of the issue #625 sample PDF."""
    test_file = "contrib/issue-625-identity-cmap.pdf"
    extracted = run_with_file(test_file)
    # Only one line of the document is compared against the expected text.
    seventh_line = extracted.splitlines()[6]
    self.assertEqual(seventh_line, test_strings[test_file])
class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self):