diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..c3f8bf0 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,69 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) +""" +from pdfminer.encodingdb import name2unicode + + +def test_name2unicode_name_in_agl(): + """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL""" + assert u'\u013B' == name2unicode('Lcommaaccent') + + +def test_name2unicode_uni(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013B') + + +def test_name2unicode_uni_with_sequence_of_digits(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20AC0308') + + +def test_name2unicode_uni_empty_string(): + """The name "uni20ac" has a single component, which is mapped to an empty string""" + assert u'' == name2unicode('uni20ac') + + +def test_name2unicode_uni_empty_string_long(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the + glyph name "u1040C. + """ + assert u'' == name2unicode('uniD801DC0C') + + +def test_name2unicode_uni_pua(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('uniF6FB') + + +def test_name2unicode_u_with_4_digits(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013B') + + +def test_name2unicode_u_with_5_digits(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040C') + + +def test_name2unicode_multiple_components(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_foo(): + """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" + assert u'' == name2unicode('foo') + + +def test_name2unicode_notdef(): + """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" + assert u'' == name2unicode('.notdef') + + +def test_name2unicode_pua_ogoneksmall(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('Ogoneksmall')