2019-10-26 16:42:33 +00:00
|
|
|
"""Tests based on the Adobe Glyph List Specification
|
|
|
|
See: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
2019-07-14 13:20:25 +00:00
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
While not in the specification, lowercase unicode often occurs in PDFs.
|
|
|
|
Therefore lowercase unittest variants are added.
|
2019-07-10 18:24:30 +00:00
|
|
|
"""
|
2022-02-02 21:24:32 +00:00
|
|
|
import pytest
|
2019-07-14 13:37:15 +00:00
|
|
|
|
2020-03-16 19:12:45 +00:00
|
|
|
from pdfminer.encodingdb import name2unicode, EncodingDB
|
|
|
|
from pdfminer.psparser import PSLiteral
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_name_in_agl():
    """A glyph name listed in the AGL resolves to its mapped string.

    The name "Lcommaaccent" has a single component, which is mapped to
    the string U+013B by AGL.
    """
    expected = "\u013B"
    assert name2unicode("Lcommaaccent") == expected
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_uni():
    """The "uni" prefix followed by four hex digits maps to that code point.

    The components "Lcommaaccent", "uni013B" and "u013B" all map to the
    string U+013B; this test covers the "uni013B" spelling.
    """
    mapped = name2unicode("uni013B")
    assert mapped == "\u013B"
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_uni_lowercase():
    """Lowercase hex digits after the "uni" prefix are accepted as well.

    The components "Lcommaaccent", "uni013B" and "u013B" all map to the
    string U+013B; this test checks the lowercase spelling "uni013b".
    """
    mapped = name2unicode("uni013b")
    assert mapped == "\u013B"
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_uni_with_sequence_of_digits():
    """A "uni" name may carry several four-digit groups.

    The name "uni20AC0308" has a single component, which is mapped to
    the string U+20AC U+0308.
    """
    glyph_name = "uni20AC0308"
    expected = "\u20AC\u0308"
    assert name2unicode(glyph_name) == expected
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Several four-digit groups are also accepted in lowercase hex.

    The name "uni20ac0308" has a single component, which is mapped to
    the string U+20AC U+0308.
    """
    glyph_name = "uni20ac0308"
    expected = "\u20AC\u0308"
    assert name2unicode(glyph_name) == expected
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_uni_empty_string():
    """The lowercase name "uni20ac" is mapped to a euro-sign.

    The name has a single component. According to the specification this
    should be mapped to an empty string, but we also want to support
    lowercase hexadecimals, so the euro-sign is the expected result here.
    """
    euro_sign = "\u20ac"
    assert name2unicode("uni20ac") == euro_sign
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" is rejected with a KeyError.

    The name has a single component, which the specification maps to an
    empty string; this test expects name2unicode to raise KeyError for it.

    Neither D801 nor DC0C are in the appropriate set.
    This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C.
    This character can be correctly mapped by using the
    glyph name "u1040C".
    """
    surrogate_pair_name = "uniD801DC0C"
    with pytest.raises(KeyError):
        name2unicode(surrogate_pair_name)
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_uni_empty_string_long_lowercase():
    """Lowercase variant: the name "unid801dc0c" is rejected with a KeyError.

    The name has a single component, which the specification maps to an
    empty string; this test expects name2unicode to raise KeyError for it.

    Neither d801 nor dc0c are in the appropriate set.
    This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C.
    This character can be correctly mapped by using the
    glyph name "u1040C".
    """
    # Lowercase hex digits on purpose: the uppercase spelling is already
    # covered by test_name2unicode_uni_empty_string_long, and this test
    # previously repeated it verbatim instead of exercising lowercase.
    with pytest.raises(KeyError):
        name2unicode("unid801dc0c")
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_uni_pua():
    """The name "uniF6FB" maps to the string that corresponds to U+F6FB.

    "Ogoneksmall" and "uniF6FB" both map to that string; this test
    covers the "uniF6FB" spelling.
    """
    mapped = name2unicode("uniF6FB")
    assert mapped == "\uF6FB"
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_uni_pua_lowercase():
    """The lowercase name "unif6fb" maps to the string for U+F6FB.

    "Ogoneksmall" and "uniF6FB" both map to that string; this test
    checks that the lowercase hex spelling gives the same result.
    """
    mapped = name2unicode("unif6fb")
    assert mapped == "\uF6FB"
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_u_with_4_digits():
    """The "u" prefix followed by four hex digits maps to that code point.

    The components "Lcommaaccent", "uni013B" and "u013B" all map to the
    string U+013B; this test covers the "u013B" spelling.
    """
    expected = "\u013B"
    assert name2unicode("u013B") == expected
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_u_with_4_digits_lowercase():
    """The "u" prefix followed by four lowercase hex digits is accepted.

    The components "Lcommaaccent", "uni013B" and "u013B" all map to the
    string U+013B; this test checks the lowercase spelling "u013b".
    """
    expected = "\u013B"
    assert name2unicode("u013b") == expected
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_u_with_5_digits():
    """The "u" prefix followed by five hex digits maps to that code point.

    The name "u1040C" has a single component, which is mapped to the
    string U+1040C.
    """
    mapped = name2unicode("u1040C")
    assert mapped == "\U0001040C"
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
2019-07-14 13:20:25 +00:00
|
|
|
def test_name2unicode_u_with_5_digits_lowercase():
    """The "u" prefix followed by five lowercase hex digits is accepted.

    The name "u1040c" has a single component, which is mapped to the
    string U+1040C.
    """
    mapped = name2unicode("u1040c")
    assert mapped == "\U0001040C"
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
2019-07-10 18:24:30 +00:00
|
|
|
def test_name2unicode_multiple_components():
    """A name with several components maps to the joined component strings.

    The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
    string U+013B U+20AC U+0308 U+1040C; the ".alternate" suffix
    contributes nothing to the expected result.
    """
    glyph_name = "Lcommaaccent_uni20AC0308_u1040C.alternate"
    expected = "\u013B\u20AC\u0308\U0001040C"
    assert name2unicode(glyph_name) == expected
|
2019-07-14 13:20:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_multiple_components_lowercase():
    """Several components with lowercase hex digits map to the same string.

    The name "Lcommaaccent_uni20ac0308_u1040c.alternate" is mapped to the
    string U+013B U+20AC U+0308 U+1040C; the ".alternate" suffix
    contributes nothing to the expected result.
    """
    glyph_name = "Lcommaaccent_uni20ac0308_u1040c.alternate"
    expected = "\u013B\u20AC\u0308\U0001040C"
    assert name2unicode(glyph_name) == expected
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_foo():
    """The name 'foo' is rejected with a KeyError.

    Per the specification 'foo' maps to an empty string, because 'foo' is
    not in AGL and because it does not start with a 'u'. This test
    expects name2unicode to raise KeyError for such a name.
    """
    unknown_name = "foo"
    with pytest.raises(KeyError):
        name2unicode(unknown_name)
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_notdef():
    """The name ".notdef" is rejected with a KeyError.

    Per the specification ".notdef" is reduced to an empty string (step 1)
    and mapped to an empty string (step 3). This test expects
    name2unicode to raise KeyError for it.
    """
    notdef_name = ".notdef"
    with pytest.raises(KeyError):
        name2unicode(notdef_name)
|
2019-07-10 18:24:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_pua_ogoneksmall():
    """The name "Ogoneksmall" maps to the string that corresponds to U+F6FB.

    "Ogoneksmall" and "uniF6FB" both map to that string; this test
    covers the "Ogoneksmall" spelling.
    """
    expected = "\uF6FB"
    assert name2unicode("Ogoneksmall") == expected
|
2019-07-10 18:44:23 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_name2unicode_overflow_error():
    """A long all-digit glyph name is rejected with a KeyError.

    NOTE(review): the test name suggests this input once triggered an
    OverflowError instead of a KeyError -- confirm against the history.
    """
    long_digit_name = "226215240241240240240240"
    with pytest.raises(KeyError):
        name2unicode(long_digit_name)
|
2020-03-16 19:12:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_get_encoding_with_invalid_differences():
    """Invalid differences should be silently ignored.

    The test passes as long as building the encoding does not raise.

    Regression test for https://github.com/pdfminer/pdfminer.six/issues/385
    """
    bad_differences = [PSLiteral(name) for name in ("ubuntu", "1234")]
    EncodingDB.get_encoding("StandardEncoding", bad_differences)
|