Add support for identity unicode cmap (#626)
Fixes #625

* Add support for Identity-H/V cmap fonts
* Format code to pass flake8 check
* Remove indent
* Remove indent
* Use isinstance instead of type check
* Use `or` instead of `any`
* Use `str in variable` instead of `str.find()`
* Fix mypy error: add typing annotations to get_unichr()
* Fix type of PDFCIDFont: it can be any type of CMapBase. This is a quick fix; the entire cmap structure does not have proper inheritance.
* Add line to CHANGELOG.md
* Add separate class for IdentityUnicodeMap
* Remove ABC from CmapBase
* Remove ABC from CmapBase
* Remove blank line

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/700/head^2
parent
da5b96828e
commit
c883f5e13f
|
@ -3,6 +3,11 @@ All notable changes in pdfminer.six will be documented in this file.
|
||||||
|
|
||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Support for identity CMaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
||||||
|
|
||||||
## [20211012]
|
## [20211012]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
@ -160,6 +160,13 @@ class UnicodeMap(CMapBase):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityUnicodeMap(UnicodeMap):
    """Unicode map that treats every character id as its own code point."""

    def get_unichr(self, cid: int) -> str:
        """Return the character whose Unicode code point equals ``cid``.

        The conversion is delegated to the built-in ``chr``, so a ``cid``
        outside the valid code point range raises ``ValueError``.
        """
        log.debug('get_unichr: %r, %r', self, cid)
        unichr = chr(cid)
        return unichr
|
||||||
|
|
||||||
|
|
||||||
class FileCMap(CMap):
|
class FileCMap(CMap):
|
||||||
|
|
||||||
def add_code2cid(self, code: str, cid: int) -> None:
|
def add_code2cid(self, code: str, cid: int) -> None:
|
||||||
|
|
|
@ -7,6 +7,7 @@ from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .cmapdb import CMap
|
from .cmapdb import CMap
|
||||||
|
from .cmapdb import IdentityUnicodeMap
|
||||||
from .cmapdb import CMapBase
|
from .cmapdb import CMapBase
|
||||||
from .cmapdb import CMapDB
|
from .cmapdb import CMapDB
|
||||||
from .cmapdb import CMapParser
|
from .cmapdb import CMapParser
|
||||||
|
@ -763,9 +764,17 @@ class PDFCIDFont(PDFFont):
|
||||||
BytesIO(self.fontfile.get_data()))
|
BytesIO(self.fontfile.get_data()))
|
||||||
self.unicode_map: Optional[UnicodeMap] = None
|
self.unicode_map: Optional[UnicodeMap] = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
if isinstance(spec['ToUnicode'], PDFStream):
|
||||||
self.unicode_map = FileUnicodeMap()
|
strm = stream_value(spec['ToUnicode'])
|
||||||
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
|
self.unicode_map = FileUnicodeMap()
|
||||||
|
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
|
||||||
|
else:
|
||||||
|
cmap_name = literal_name(spec['ToUnicode'])
|
||||||
|
encoding = literal_name(spec['Encoding'])
|
||||||
|
if 'Identity' in cid_ordering \
|
||||||
|
or 'Identity' in cmap_name \
|
||||||
|
or 'Identity' in encoding:
|
||||||
|
self.unicode_map = IdentityUnicodeMap()
|
||||||
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
|
elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
|
||||||
if ttf:
|
if ttf:
|
||||||
try:
|
try:
|
||||||
|
|
Binary file not shown.
|
@ -33,6 +33,7 @@ test_strings = {
|
||||||
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
||||||
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
||||||
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
||||||
|
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -92,6 +93,12 @@ class TestExtractText(unittest.TestCase):
|
||||||
s = run_with_file(test_file)
|
s = run_with_file(test_file)
|
||||||
self.assertEqual(s.strip(), test_strings[test_file])
|
self.assertEqual(s.strip(), test_strings[test_file])
|
||||||
|
|
||||||
|
def test_issue_625_identity_cmap(self):
    """Compare the seventh extracted line of the issue #625 sample PDF
    with the expected string registered in ``test_strings``."""
    sample = "contrib/issue-625-identity-cmap.pdf"
    extracted = run_with_file(sample)
    # Only the line at index 6 of the extracted text is compared.
    seventh_line = extracted.splitlines()[6]
    self.assertEqual(seventh_line, test_strings[sample])
|
||||||
|
|
||||||
|
|
||||||
class TestExtractPages(unittest.TestCase):
|
class TestExtractPages(unittest.TestCase):
|
||||||
def _get_test_file_path(self):
|
def _get_test_file_path(self):
|
||||||
|
|
Loading…
Reference in New Issue