From c597e95a9f828b6d6f18566a44d8706bdbc6744b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:37:15 +0200 Subject: [PATCH] Use KeyError to signal that the name does not resemble any unicode, this pattern is also used in the rest of pdfminer.six --- pdfminer/encodingdb.py | 13 ++++++++----- tests/test_encodingdb.py | 12 +++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index aa00005..5dcd8f2 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -10,12 +10,15 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -def name2unicode(name: str): +def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. + In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is unknown. + This way the caller must explicitly define what to do when there is no match. + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping - :returns unicode character if name resembles something, empty string if not + :returns unicode character if name resembles something, otherwise raises a KeyError """ full_stop = u'\u002E' name = name.split(full_stop)[0] @@ -33,7 +36,7 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] if any([55295 < digit < 57344 for digit in unicode_digits]): - return '' + raise KeyError characters = map(six.unichr, unicode_digits) return ''.join(characters) @@ -42,10 +45,10 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) if 55295 < unicode_digit < 57344: - return '' + raise KeyError return six.unichr(unicode_digit) - return '' + raise KeyError class EncodingDB(object): diff --git a/tests/test_encodingdb.py 
b/tests/test_encodingdb.py index ac10d54..82c0282 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -4,6 +4,8 @@ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are added. """ +from nose.tools import assert_raises + from pdfminer.encodingdb import name2unicode @@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long(): expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C. """ - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_empty_string_long_lowercase(): @@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase(): Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is expressed as D801 DC0C in UTF-16, specifically U+1040C. 
This character can be correctly mapped by using the glyph name "u1040C.""" - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_pua(): @@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase(): def test_name2unicode_foo(): """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" - assert u'' == name2unicode('foo') + assert_raises(KeyError, name2unicode, 'foo') def test_name2unicode_notdef(): """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" - assert u'' == name2unicode('.notdef') + assert_raises(KeyError, name2unicode, '.notdef') def test_name2unicode_pua_ogoneksmall(): @@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_overflow_error(): - name2unicode('226215240241240240240240') + assert_raises(KeyError, name2unicode, '226215240241240240240240')