Use KeyError to signal that the name does not resemble any unicode; this pattern is also used in the rest of pdfminer.six
parent
fdb7e54862
commit
c597e95a9f
|
@ -10,12 +10,15 @@ from .psparser import PSLiteral
|
||||||
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||||
|
|
||||||
|
|
||||||
def name2unicode(name: str):
|
def name2unicode(name):
|
||||||
"""Converts Adobe glyph names to Unicode numbers.
|
"""Converts Adobe glyph names to Unicode numbers.
|
||||||
|
|
||||||
|
In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is unknown.
|
||||||
|
This way the caller must explicitly define what to do when there is not a match.
|
||||||
|
|
||||||
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
||||||
|
|
||||||
:returns unicode character if name resembles something, empty string if not
|
:returns unicode character if name resembles something, otherwise raises a KeyError
|
||||||
"""
|
"""
|
||||||
full_stop = u'\u002E'
|
full_stop = u'\u002E'
|
||||||
name = name.split(full_stop)[0]
|
name = name.split(full_stop)[0]
|
||||||
|
@ -33,7 +36,7 @@ def name2unicode(name: str):
|
||||||
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
|
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
|
||||||
unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
|
unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
|
||||||
if any([55295 < digit < 57344 for digit in unicode_digits]):
|
if any([55295 < digit < 57344 for digit in unicode_digits]):
|
||||||
return ''
|
raise KeyError
|
||||||
characters = map(six.unichr, unicode_digits)
|
characters = map(six.unichr, unicode_digits)
|
||||||
return ''.join(characters)
|
return ''.join(characters)
|
||||||
|
|
||||||
|
@ -42,10 +45,10 @@ def name2unicode(name: str):
|
||||||
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
|
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
|
||||||
unicode_digit = int(name_without_u, base=16)
|
unicode_digit = int(name_without_u, base=16)
|
||||||
if 55295 < unicode_digit < 57344:
|
if 55295 < unicode_digit < 57344:
|
||||||
return ''
|
raise KeyError
|
||||||
return six.unichr(unicode_digit)
|
return six.unichr(unicode_digit)
|
||||||
|
|
||||||
return ''
|
raise KeyError
|
||||||
|
|
||||||
|
|
||||||
class EncodingDB(object):
|
class EncodingDB(object):
|
||||||
|
|
|
@ -4,6 +4,8 @@ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type
|
||||||
While not in the specification, lowercase unicode often occurs in PDFs. Therefore lowercase unittest variants are
|
While not in the specification, lowercase unicode often occurs in PDFs. Therefore lowercase unittest variants are
|
||||||
added.
|
added.
|
||||||
"""
|
"""
|
||||||
|
from nose.tools import assert_raises
|
||||||
|
|
||||||
from pdfminer.encodingdb import name2unicode
|
from pdfminer.encodingdb import name2unicode
|
||||||
|
|
||||||
|
|
||||||
|
@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long():
|
||||||
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
||||||
glyph name "u1040C."
|
glyph name "u1040C."
|
||||||
"""
|
"""
|
||||||
assert u'' == name2unicode('uniD801DC0C')
|
assert_raises(KeyError, name2unicode, 'uniD801DC0C')
|
||||||
|
|
||||||
|
|
||||||
def test_name2unicode_uni_empty_string_long_lowercase():
|
def test_name2unicode_uni_empty_string_long_lowercase():
|
||||||
|
@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase():
|
||||||
Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
|
Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
|
||||||
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
||||||
glyph name "u1040C."""
|
glyph name "u1040C."""
|
||||||
assert u'' == name2unicode('uniD801DC0C')
|
assert_raises(KeyError, name2unicode, 'uniD801DC0C')
|
||||||
|
|
||||||
|
|
||||||
def test_name2unicode_uni_pua():
|
def test_name2unicode_uni_pua():
|
||||||
|
@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase():
|
||||||
|
|
||||||
def test_name2unicode_foo():
|
def test_name2unicode_foo():
|
||||||
"""The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
|
"""The name 'foo' raises a KeyError, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
|
||||||
assert u'' == name2unicode('foo')
|
assert_raises(KeyError, name2unicode, 'foo')
|
||||||
|
|
||||||
|
|
||||||
def test_name2unicode_notdef():
|
def test_name2unicode_notdef():
|
||||||
"""The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
|
"""The name ".notdef" is reduced to an empty string (step 1); instead of mapping it to an empty string (step 3), a KeyError is raised"""
|
||||||
assert u'' == name2unicode('.notdef')
|
assert_raises(KeyError, name2unicode, '.notdef')
|
||||||
|
|
||||||
|
|
||||||
def test_name2unicode_pua_ogoneksmall():
|
def test_name2unicode_pua_ogoneksmall():
|
||||||
|
@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall():
|
||||||
|
|
||||||
|
|
||||||
def test_name2unicode_overflow_error():
|
def test_name2unicode_overflow_error():
|
||||||
name2unicode('226215240241240240240240')
|
assert_raises(KeyError, name2unicode, '226215240241240240240240')
|
||||||
|
|
Loading…
Reference in New Issue