Fix extraction of some CJK characters (#593)

Fixes #566 

* try to fix the issue where some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/595/head^2
wind_chh 2021-08-27 03:05:03 +08:00 committed by GitHub
parent d821fed340
commit 234c466372
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 23 additions and 8 deletions

4
.gitignore vendored
View File

@ -21,5 +21,7 @@ tests/*.txt
# python venv management tools # python venv management tools
Pipfile Pipfile
Pipfile.lock Pipfile.lock
.noseids
.vscode/
pyproject.toml pyproject.toml
poetry.lock poetry.lock

View File

@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574)) - Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529)) - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469)) - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535)) - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))

View File

@ -338,7 +338,7 @@ class CMapParser(PSStackParser):
if token is self.KEYWORD_ENDCIDRANGE: if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs): for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, bytes) or not isinstance(e, bytes) or
not isinstance(cid, int) or len(s) != len(e)): not isinstance(cid, int) or len(s) != len(e)):
continue continue
sprefix = s[:-4] sprefix = s[:-4]
@ -352,7 +352,7 @@ class CMapParser(PSStackParser):
vlen = len(svar) vlen = len(svar)
for i in range(e1-s1+1): for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:] x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i) self.cmap.add_cid2unichr(cid+i, x)
return return
if token is self.KEYWORD_BEGINCIDCHAR: if token is self.KEYWORD_BEGINCIDCHAR:
@ -361,8 +361,8 @@ class CMapParser(PSStackParser):
if token is self.KEYWORD_ENDCIDCHAR: if token is self.KEYWORD_ENDCIDCHAR:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str): if isinstance(code, bytes) and isinstance(cid, int):
self.cmap.add_code2cid(code, nunpack(cid)) self.cmap.add_cid2unichr(cid, code)
return return
if token is self.KEYWORD_BEGINBFRANGE: if token is self.KEYWORD_BEGINBFRANGE:

View File

@ -403,9 +403,9 @@ class TrueTypeFont:
return return
def create_unicode_map(self): def create_unicode_map(self):
if 'cmap' not in self.tables: if b'cmap' not in self.tables:
raise TrueTypeFont.CMapNotFound raise TrueTypeFont.CMapNotFound
(base_offset, length) = self.tables['cmap'] (base_offset, length) = self.tables[b'cmap']
fp = self.fp fp = self.fp
fp.seek(base_offset) fp.seek(base_offset)
(version, nsubtables) = struct.unpack('>HH', fp.read(4)) (version, nsubtables) = struct.unpack('>HH', fp.read(4))

Binary file not shown.

Binary file not shown.

View File

@ -30,7 +30,9 @@ test_strings = {
"simple2.pdf": "\f", "simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n" "simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f", "World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f" "simple4.pdf": "Text1\nText2\nText3\n\n\f",
"contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
} }
@ -80,6 +82,16 @@ class TestExtractText(unittest.TestCase):
s = run_with_file(test_file) s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file]) self.assertEqual(s, test_strings[test_file])
def test_issue_566_cmap_bytes(self):
test_file = "contrib/issue_566_test_1.pdf"
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])
def test_issue_566_cid_range(self):
test_file = "contrib/issue_566_test_2.pdf"
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])
class TestExtractPages(unittest.TestCase): class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self): def _get_test_file_path(self):