diff --git a/.gitignore b/.gitignore index a39febb..56f96b3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,5 +21,7 @@ tests/*.txt # python venv management tools Pipfile Pipfile.lock +.noseids +.vscode/ pyproject.toml -poetry.lock \ No newline at end of file +poetry.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 89b0fe7..1ed10eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Fix issue of ValueError and KeyError rasied in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574)) - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529)) - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469)) +- Fix issue where some Chinese characters could not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593)) - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535)) - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 7c0b772..35ced14 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -338,7 +338,7 @@ class CMapParser(PSStackParser): if token is self.KEYWORD_ENDCIDRANGE: objs = [obj for (__, obj) in self.popall()] for (s, e, cid) in choplist(3, objs): - if (not isinstance(s, str) or not isinstance(e, str) or + if (not isinstance(s, bytes) or not isinstance(e, bytes) or not isinstance(cid, int) or len(s) != len(e)): continue sprefix = s[:-4] @@ -352,7 +352,7 @@ class CMapParser(PSStackParser): vlen = len(svar) for i in range(e1-s1+1): x = sprefix+struct.pack('>L', 
s1+i)[-vlen:] - self.cmap.add_code2cid(x, cid+i) + self.cmap.add_cid2unichr(cid+i, x) return if token is self.KEYWORD_BEGINCIDCHAR: @@ -361,8 +361,8 @@ class CMapParser(PSStackParser): if token is self.KEYWORD_ENDCIDCHAR: objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): - if isinstance(code, str) and isinstance(cid, str): - self.cmap.add_code2cid(code, nunpack(cid)) + if isinstance(code, bytes) and isinstance(cid, int): + self.cmap.add_cid2unichr(cid, code) return if token is self.KEYWORD_BEGINBFRANGE: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index c0da1b6..74ad6a6 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -403,9 +403,9 @@ class TrueTypeFont: return def create_unicode_map(self): - if 'cmap' not in self.tables: + if b'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound - (base_offset, length) = self.tables['cmap'] + (base_offset, length) = self.tables[b'cmap'] fp = self.fp fp.seek(base_offset) (version, nsubtables) = struct.unpack('>HH', fp.read(4)) diff --git a/samples/contrib/issue_566_test_1.pdf b/samples/contrib/issue_566_test_1.pdf new file mode 100644 index 0000000..2967098 Binary files /dev/null and b/samples/contrib/issue_566_test_1.pdf differ diff --git a/samples/contrib/issue_566_test_2.pdf b/samples/contrib/issue_566_test_2.pdf new file mode 100644 index 0000000..8045325 Binary files /dev/null and b/samples/contrib/issue_566_test_2.pdf differ diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 47496e6..90c9764 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -30,7 +30,9 @@ test_strings = { "simple2.pdf": "\f", "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n" "World\n\nWorld\n\n\f", - "simple4.pdf": "Text1\nText2\nText3\n\n\f" + "simple4.pdf": "Text1\nText2\nText3\n\n\f", + "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", + "contrib/issue_566_test_2.pdf": 
"甲方:中国饮料有限公司(盖章)", } @@ -80,6 +82,16 @@ class TestExtractText(unittest.TestCase): s = run_with_file(test_file) self.assertEqual(s, test_strings[test_file]) + def test_issue_566_cmap_bytes(self): + test_file = "contrib/issue_566_test_1.pdf" + s = run_with_file(test_file) + self.assertEqual(s.strip(), test_strings[test_file]) + + def test_issue_566_cid_range(self): + test_file = "contrib/issue_566_test_2.pdf" + s = run_with_file(test_file) + self.assertEqual(s.strip(), test_strings[test_file]) + class TestExtractPages(unittest.TestCase): def _get_test_file_path(self):