Fix extraction of some CJK characters (#593)
Fixes #566 * Try to fix the issue where some Chinese characters cannot be extracted correctly (#566). * Format code to pass the flake8 check. * Fix typo and refer to issue 593. Co-authored-by: huan_cheng <huan_cheng@bestsign.cn> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com> pull/595/head^2
parent
d821fed340
commit
234c466372
|
@ -21,5 +21,7 @@ tests/*.txt
|
|||
# python venv management tools
|
||||
Pipfile
|
||||
Pipfile.lock
|
||||
.noseids
|
||||
.vscode/
|
||||
pyproject.toml
|
||||
poetry.lock
|
||||
poetry.lock
|
||||
|
|
|
@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
|
||||
- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
|
||||
- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
|
||||
- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
|
||||
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
|
||||
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
|
||||
|
||||
|
|
|
@ -338,7 +338,7 @@ class CMapParser(PSStackParser):
|
|||
if token is self.KEYWORD_ENDCIDRANGE:
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, cid) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
|
||||
not isinstance(cid, int) or len(s) != len(e)):
|
||||
continue
|
||||
sprefix = s[:-4]
|
||||
|
@ -352,7 +352,7 @@ class CMapParser(PSStackParser):
|
|||
vlen = len(svar)
|
||||
for i in range(e1-s1+1):
|
||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||
self.cmap.add_code2cid(x, cid+i)
|
||||
self.cmap.add_cid2unichr(cid+i, x)
|
||||
return
|
||||
|
||||
if token is self.KEYWORD_BEGINCIDCHAR:
|
||||
|
@ -361,8 +361,8 @@ class CMapParser(PSStackParser):
|
|||
if token is self.KEYWORD_ENDCIDCHAR:
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (cid, code) in choplist(2, objs):
|
||||
if isinstance(code, str) and isinstance(cid, str):
|
||||
self.cmap.add_code2cid(code, nunpack(cid))
|
||||
if isinstance(code, bytes) and isinstance(cid, int):
|
||||
self.cmap.add_cid2unichr(cid, code)
|
||||
return
|
||||
|
||||
if token is self.KEYWORD_BEGINBFRANGE:
|
||||
|
|
|
@ -403,9 +403,9 @@ class TrueTypeFont:
|
|||
return
|
||||
|
||||
def create_unicode_map(self):
|
||||
if 'cmap' not in self.tables:
|
||||
if b'cmap' not in self.tables:
|
||||
raise TrueTypeFont.CMapNotFound
|
||||
(base_offset, length) = self.tables['cmap']
|
||||
(base_offset, length) = self.tables[b'cmap']
|
||||
fp = self.fp
|
||||
fp.seek(base_offset)
|
||||
(version, nsubtables) = struct.unpack('>HH', fp.read(4))
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -30,7 +30,9 @@ test_strings = {
|
|||
"simple2.pdf": "\f",
|
||||
"simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
|
||||
"World\n\nWorld\n\n\f",
|
||||
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
|
||||
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
|
||||
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
|
||||
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
|
||||
}
|
||||
|
||||
|
||||
|
@ -80,6 +82,16 @@ class TestExtractText(unittest.TestCase):
|
|||
s = run_with_file(test_file)
|
||||
self.assertEqual(s, test_strings[test_file])
|
||||
|
||||
def test_issue_566_cmap_bytes(self):
|
||||
test_file = "contrib/issue_566_test_1.pdf"
|
||||
s = run_with_file(test_file)
|
||||
self.assertEqual(s.strip(), test_strings[test_file])
|
||||
|
||||
def test_issue_566_cid_range(self):
|
||||
test_file = "contrib/issue_566_test_2.pdf"
|
||||
s = run_with_file(test_file)
|
||||
self.assertEqual(s.strip(), test_strings[test_file])
|
||||
|
||||
|
||||
class TestExtractPages(unittest.TestCase):
|
||||
def _get_test_file_path(self):
|
||||
|
|
Loading…
Reference in New Issue