Add support for identity unicode cmap (#626)
Fixes #625

* add support for Identity-H/V cmap fonts
* format code to pass flake8 check
* Remove indent
* Remove indent
* Use isinstance instead of type check
* Use or instead of any
* Use str in variable, instead of str.find()
* Fix mypy error: add typing annotations to get_unichr()
* Fix type of PDFCIDFont. Can be any type of CMapBase. This is a quick fix, the entire cmap structure does not have proper inheritance.
* Added line to CHANGELOG.md
* Add separate class for IdentityUnicodeMap
* Remove ABC from CmapBase
* Remove ABC from CmapBase
* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>

pull/700/head^2
parent
da5b96828e
commit
c883f5e13f
|
@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.
|
|||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
||||
|
||||
## [20211012]
|
||||
|
||||
### Added
|
||||
|
|
|
@ -160,6 +160,13 @@ class UnicodeMap(CMapBase):
|
|||
return
|
||||
|
||||
|
||||
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats a character id as its own codepoint."""

    def get_unichr(self, cid: int) -> str:
        """Return the character whose Unicode codepoint equals ``cid``.

        :param cid: character id, used directly as the codepoint.
        :return: the single-character string ``chr(cid)``.
        """
        log.debug('get_unichr: %r, %r', self, cid)
        unichr = chr(cid)
        return unichr
|
||||
|
||||
|
||||
class FileCMap(CMap):
|
||||
|
||||
def add_code2cid(self, code: str, cid: int) -> None:
|
||||
|
|
|
@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,
|
|||
|
||||
from . import settings
|
||||
from .cmapdb import CMap
|
||||
from .cmapdb import IdentityUnicodeMap
|
||||
from .cmapdb import CMapBase
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMapParser
|
||||
|
@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont):
|
|||
BytesIO(self.fontfile.get_data()))
|
||||
self.unicode_map: Optional[UnicodeMap] = None
|
||||
if 'ToUnicode' in spec:
|
||||
if isinstance(spec['ToUnicode'], PDFStream):
|
||||
strm = stream_value(spec['ToUnicode'])
|
||||
self.unicode_map = FileUnicodeMap()
|
||||
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
|
||||
else:
|
||||
cmap_name = literal_name(spec['ToUnicode'])
|
||||
encoding = literal_name(spec['Encoding'])
|
||||
if 'Identity' in cid_ordering \
|
||||
or 'Identity' in cmap_name \
|
||||
or 'Identity' in encoding:
|
||||
self.unicode_map = IdentityUnicodeMap()
|
||||
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
|
||||
if ttf:
|
||||
try:
|
||||
|
|
Binary file not shown.
|
@ -33,6 +33,7 @@ test_strings = {
|
|||
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
||||
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
||||
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
||||
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
|
||||
}
|
||||
|
||||
|
||||
|
@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase):
|
|||
s = run_with_file(test_file)
|
||||
self.assertEqual(s.strip(), test_strings[test_file])
|
||||
|
||||
def test_issue_625_identity_cmap(self):
    """Check one extracted line of the issue #625 identity-cmap sample."""
    test_file = "contrib/issue-625-identity-cmap.pdf"
    extracted = run_with_file(test_file)
    lines = extracted.splitlines()

    # Only the line at index 6 is compared with the expected string.
    expected = test_strings[test_file]
    self.assertEqual(lines[6], expected)
|
||||
|
||||
|
||||
class TestExtractPages(unittest.TestCase):
|
||||
def _get_test_file_path(self):
|
||||
|
|
Loading…
Reference in New Issue