Use KeyError to signal that the name does not resemble any Unicode character; this pattern is also used in the rest of pdfminer.six.

pull/263/head
Pieter Marsman 2019-07-14 15:37:15 +02:00
parent fdb7e54862
commit c597e95a9f
2 changed files with 15 additions and 10 deletions

View File

@ -10,12 +10,15 @@ from .psparser import PSLiteral
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
def name2unicode(name: str): def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers. """Converts Adobe glyph names to Unicode numbers.
In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is unknown.
This way the caller must explicitly define what to do when there is not a match.
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
:returns unicode character if name resembles something, empty string if not :returns unicode character if name resembles something, otherwise a KeyError
""" """
full_stop = u'\u002E' full_stop = u'\u002E'
name = name.split(full_stop)[0] name = name.split(full_stop)[0]
@ -33,7 +36,7 @@ def name2unicode(name: str):
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
if any([55295 < digit < 57344 for digit in unicode_digits]): if any([55295 < digit < 57344 for digit in unicode_digits]):
return '' raise KeyError
characters = map(six.unichr, unicode_digits) characters = map(six.unichr, unicode_digits)
return ''.join(characters) return ''.join(characters)
@ -42,10 +45,10 @@ def name2unicode(name: str):
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
unicode_digit = int(name_without_u, base=16) unicode_digit = int(name_without_u, base=16)
if 55295 < unicode_digit < 57344: if 55295 < unicode_digit < 57344:
return '' raise KeyError
return six.unichr(unicode_digit) return six.unichr(unicode_digit)
return '' raise KeyError
class EncodingDB(object): class EncodingDB(object):

View File

@ -4,6 +4,8 @@ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type
While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
added. added.
""" """
from nose.tools import assert_raises
from pdfminer.encodingdb import name2unicode from pdfminer.encodingdb import name2unicode
@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long():
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
glyph name "u1040C. glyph name "u1040C.
""" """
assert u'' == name2unicode('uniD801DC0C') assert_raises(KeyError, name2unicode, 'uniD801DC0C')
def test_name2unicode_uni_empty_string_long_lowercase(): def test_name2unicode_uni_empty_string_long_lowercase():
@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase():
Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
glyph name "u1040C.""" glyph name "u1040C."""
assert u'' == name2unicode('uniD801DC0C') assert_raises(KeyError, name2unicode, 'uniD801DC0C')
def test_name2unicode_uni_pua(): def test_name2unicode_uni_pua():
@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase():
def test_name2unicode_foo(): def test_name2unicode_foo():
"""The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
assert u'' == name2unicode('foo') assert_raises(KeyError, name2unicode, 'foo')
def test_name2unicode_notdef(): def test_name2unicode_notdef():
"""The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
assert u'' == name2unicode('.notdef') assert_raises(KeyError, name2unicode, '.notdef')
def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_pua_ogoneksmall():
@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall():
def test_name2unicode_overflow_error(): def test_name2unicode_overflow_error():
name2unicode('226215240241240240240240') assert_raises(KeyError, name2unicode, '226215240241240240240240')