Fix extraction of some CJK characters (#593)

Fixes #566 * Try to fix the issue that some Chinese characters cannot be extracted correctly (#566). * Format code to pass the flake8 check. * Fix typo and refer to #593. Co-authored-by: huan_cheng <huan_cheng@bestsign.cn> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2021-08-27 03:05:03 +08:00 · 2021-08-27 03:05:03 +08:00 · 234c466372
parent d821fed340
commit 234c466372
7 changed files with 23 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,5 +21,7 @@ tests/*.txt
 # python venv management tools
 Pipfile
 Pipfile.lock
+.noseids
+.vscode/
 pyproject.toml
-poetry.lock
+poetry.lock
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
 - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
 - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
+- Fix issue where some Chinese characters could not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
 - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
 - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))

--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -338,7 +338,7 @@ class CMapParser(PSStackParser):
        if token is self.KEYWORD_ENDCIDRANGE:
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, cid) in choplist(3, objs):
-                if (not isinstance(s, str) or not isinstance(e, str) or
+                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                   not isinstance(cid, int) or len(s) != len(e)):
                    continue
                sprefix = s[:-4]
@@ -352,7 +352,7 @@ class CMapParser(PSStackParser):
                vlen = len(svar)
                for i in range(e1-s1+1):
                    x = sprefix+struct.pack('>L', s1+i)[-vlen:]
-                    self.cmap.add_code2cid(x, cid+i)
+                    self.cmap.add_cid2unichr(cid+i, x)
            return

        if token is self.KEYWORD_BEGINCIDCHAR:
@@ -361,8 +361,8 @@ class CMapParser(PSStackParser):
        if token is self.KEYWORD_ENDCIDCHAR:
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
-                if isinstance(code, str) and isinstance(cid, str):
-                    self.cmap.add_code2cid(code, nunpack(cid))
+                if isinstance(code, bytes) and isinstance(cid, int):
+                    self.cmap.add_cid2unichr(cid, code)
            return

        if token is self.KEYWORD_BEGINBFRANGE:
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -403,9 +403,9 @@ class TrueTypeFont:
        return

    def create_unicode_map(self):
-        if 'cmap' not in self.tables:
+        if b'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
-        (base_offset, length) = self.tables['cmap']
+        (base_offset, length) = self.tables[b'cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = struct.unpack('>HH', fp.read(4))
--- a/samples/contrib/issue_566_test_1.pdf
+++ b/samples/contrib/issue_566_test_1.pdf
--- a/samples/contrib/issue_566_test_2.pdf
+++ b/samples/contrib/issue_566_test_2.pdf
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@@ -30,7 +30,9 @@ test_strings = {
    "simple2.pdf": "\f",
    "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
                   "World\n\nWorld\n\n\f",
-    "simple4.pdf": "Text1\nText2\nText3\n\n\f"
+    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
+    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
+    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
 }


@@ -80,6 +82,16 @@ class TestExtractText(unittest.TestCase):
        s = run_with_file(test_file)
        self.assertEqual(s, test_strings[test_file])

+    def test_issue_566_cmap_bytes(self):
+        test_file = "contrib/issue_566_test_1.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
+    def test_issue_566_cid_range(self):
+        test_file = "contrib/issue_566_test_2.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+

 class TestExtractPages(unittest.TestCase):
    def _get_test_file_path(self):