Merge pull request #263 from pietermarsman/261-glyph-list-specification

name2unicode() should follow the Adobe Glyph List Specification
2019-07-26 22:13:34 +05:30 · 2019-07-26 22:13:34 +05:30 · 42e2c8143b
parent 17364aa88e 6f362f53fe
commit 42e2c8143b
3 changed files with 212 additions and 39 deletions
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@ -1,28 +1,67 @@
-
+import logging
 import re
-from .psparser import PSLiteral
+
 import six  # Python 2+3 compatibility
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING
 from .psparser import PSLiteral
-import six # Python 2+3 compatibility
+HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
-STRIP_NAME = re.compile(r'[0-9]+')
+log = logging.getLogger(__name__)
 ##  name2unicode
 ##
 def name2unicode(name):
-    """Converts Adobe glyph names to Unicode numbers."""
+    """Converts Adobe glyph names to Unicode numbers.
-    if name in glyphname2unicode:
+
-        return glyphname2unicode[name]
+    In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown.
-    m = STRIP_NAME.search(name)
+    This way the caller must explicitly define what to do when there is not a match.
-    if not m:
+
-        raise KeyError(name)
+    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
-    return six.unichr(int(m.group(0)))
+
    :returns unicode character if name resembles something, otherwise a KeyError
    """
    name = name.split('.')[0]
    components = name.split('_')
    if len(components) > 1:
        return ''.join(map(name2unicode, components))
    else:
        if name in glyphname2unicode:
            return glyphname2unicode.get(name)
        elif name.startswith('uni'):
            name_without_uni = name.strip('uni')
            if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
                unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
                for digit in unicode_digits:
                    raise_key_error_for_invalid_unicode(digit)
                characters = map(six.unichr, unicode_digits)
                return ''.join(characters)
        elif name.startswith('u'):
            name_without_u = name.strip('u')
            if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
                unicode_digit = int(name_without_u, base=16)
                raise_key_error_for_invalid_unicode(unicode_digit)
                return six.unichr(unicode_digit)
    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
 def raise_key_error_for_invalid_unicode(unicode_digit):
    """Reject values in the range D800 through DFFF, which UTF-16 reserves for surrogate pairs

    :param unicode_digit: integer value to validate
    :raises KeyError if unicode digit is invalid
    """
    # 0xD7FF == 55295 and 0xE000 == 57344: the exclusive bounds around the surrogate range
    is_surrogate = 0xD7FF < unicode_digit < 0xE000
    if not is_surrogate:
        return
    raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
 ##  EncodingDB
 ##
 class EncodingDB(object):
    std2unicode = {}
@ -59,7 +98,7 @@ class EncodingDB(object):
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(x.name)
-                    except KeyError:
+                    except KeyError as e:
-                        pass
+                        log.debug(str(e))
                    cid += 1
        return cid2unicode
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -1,34 +1,37 @@
-
+import logging
 import sys
 import struct
 import sys
 from io import BytesIO
 import six  # Python 2+3 compatibility
 from . import settings
 from .cmapdb import CMap
 from .cmapdb import CMapDB
 from .cmapdb import CMapParser
 from .cmapdb import FileUnicodeMap
 from .cmapdb import CMap
 from .encodingdb import EncodingDB
 from .encodingdb import name2unicode
 from .psparser import PSStackParser
 from .psparser import PSEOF
 from .psparser import LIT
 from .psparser import KWD
 from . import settings
 from .psparser import PSLiteral
 from .psparser import literal_name
 from .pdftypes import PDFException
 from .pdftypes import resolve1
 from .pdftypes import int_value
 from .pdftypes import num_value
 from .pdftypes import list_value
 from .pdftypes import dict_value
 from .pdftypes import stream_value
 from .fontmetrics import FONT_METRICS
 from .pdftypes import PDFException
 from .pdftypes import dict_value
 from .pdftypes import int_value
 from .pdftypes import list_value
 from .pdftypes import num_value
 from .pdftypes import resolve1
 from .pdftypes import stream_value
 from .psparser import KWD
 from .psparser import LIT
 from .psparser import PSEOF
 from .psparser import PSLiteral
 from .psparser import PSStackParser
 from .psparser import literal_name
 from .utils import apply_matrix_norm
 from .utils import nunpack
 from .utils import choplist
 from .utils import isnumber
 from .utils import nunpack
-import six #Python 2+3 compatibility
+log = logging.getLogger(__name__)
 def get_widths(seq):
@ -98,7 +101,6 @@ class Type1FontHeaderParser(PSStackParser):
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')
    KEYWORD_FOR = KWD(b'for')
    def __init__(self, data):
        PSStackParser.__init__(self, data)
@ -106,6 +108,17 @@ class Type1FontHeaderParser(PSStackParser):
        return
    def get_encoding(self):
        """Parse the font encoding
        The Type1 font encoding maps character codes to character names. These character names could either be standard
        Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
        sequence of operations that describe how the character should be drawn.
        Currently, this function returns '' (empty string) for character names that are associated with a CharString.
        References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
        :returns mapping of character identifiers (cid's) to unicode characters
        """
        while 1:
            try:
                (cid, name) = self.nextobject()
@ -113,8 +126,8 @@ class Type1FontHeaderParser(PSStackParser):
                break
            try:
                self._cid2unicode[cid] = name2unicode(name)
-            except KeyError:
+            except KeyError as e:
-                pass
+                log.debug(str(e))
        return self._cid2unicode
    def do_keyword(self, pos, token):
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@ -0,0 +1,121 @@
 """
 Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
 While not in the specification, lowercase hexadecimal digits often occur in PDFs. Therefore lowercase variants of
 the unit tests are added.
 """
 from nose.tools import assert_raises
 from pdfminer.encodingdb import name2unicode
 def test_name2unicode_name_in_agl():
    """A single-component name that is listed in the AGL ("Lcommaaccent") maps to the string U+013B"""
    expected = u'\u013B'
    actual = name2unicode('Lcommaaccent')
    assert expected == actual
 def test_name2unicode_uni():
    """The "uni" prefix followed by four uppercase hexadecimal digits ("uni013B") maps to the string U+013B"""
    expected = u'\u013B'
    actual = name2unicode('uni013B')
    assert expected == actual
 def test_name2unicode_uni_lowercase():
    """Lowercase variant: "uni013b" maps to the string U+013B, just like "uni013B" does"""
    expected = u'\u013B'
    actual = name2unicode('uni013b')
    assert expected == actual
 def test_name2unicode_uni_with_sequence_of_digits():
    """The single component "uni20AC0308" holds two groups of four digits and maps to the string U+20AC U+0308"""
    expected = u'\u20AC\u0308'
    actual = name2unicode('uni20AC0308')
    assert expected == actual
 def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Lowercase variant: the single component "uni20ac0308" maps to the string U+20AC U+0308"""
    expected = u'\u20AC\u0308'
    actual = name2unicode('uni20ac0308')
    assert expected == actual
 def test_name2unicode_uni_empty_string():
    """The single component "uni20ac" is mapped to a euro-sign.

    The specification maps this name to an empty string, but lowercase hexadecimals are deliberately supported
    here as well.
    """
    euro_sign = u'\u20ac'
    actual = name2unicode('uni20ac')
    assert euro_sign == actual
 def test_name2unicode_uni_empty_string_long():
    """The single component "uniD801DC0C" has no mapping (an empty string in the specification, a KeyError here)

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C".
    """
    with assert_raises(KeyError):
        name2unicode('uniD801DC0C')
 def test_name2unicode_uni_empty_string_long_lowercase():
    """Lowercase variant: the single component "unid801dc0c" has no mapping and must raise a KeyError

    Neither d801 nor dc0c are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C".
    """
    # The input must really be lowercase; with 'uniD801DC0C' this test would only repeat the uppercase variant.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')
 def test_name2unicode_uni_pua():
    """The name "uniF6FB" maps to the string that corresponds to U+F6FB, the same string as for "Ogoneksmall" """
    expected = u'\uF6FB'
    actual = name2unicode('uniF6FB')
    assert expected == actual
 def test_name2unicode_uni_pua_lowercase():
    """Lowercase variant: the name "unif6fb" maps to the string that corresponds to U+F6FB"""
    expected = u'\uF6FB'
    actual = name2unicode('unif6fb')
    assert expected == actual
 def test_name2unicode_u_with_4_digits():
    """The "u" prefix followed by four uppercase hexadecimal digits ("u013B") maps to the string U+013B"""
    expected = u'\u013B'
    actual = name2unicode('u013B')
    assert expected == actual
 def test_name2unicode_u_with_4_digits_lowercase():
    """Lowercase variant: the component "u013b" maps to the string U+013B"""
    expected = u'\u013B'
    actual = name2unicode('u013b')
    assert expected == actual
 def test_name2unicode_u_with_5_digits():
    """The single component "u1040C" (five hexadecimal digits) maps to the string U+1040C"""
    expected = u'\U0001040C'
    actual = name2unicode('u1040C')
    assert expected == actual
 def test_name2unicode_u_with_5_digits_lowercase():
    """Lowercase variant: the single component "u1040c" maps to the string U+1040C"""
    expected = u'\U0001040C'
    actual = name2unicode('u1040c')
    assert expected == actual
 def test_name2unicode_multiple_components():
    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" maps to the string U+013B U+20AC U+0308 U+1040C"""
    expected = u'\u013B\u20AC\u0308\U0001040C'
    actual = name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
    assert expected == actual
 def test_name2unicode_multiple_components_lowercase():
    """Lowercase variant: "Lcommaaccent_uni20ac0308_u1040c.alternate" maps to U+013B U+20AC U+0308 U+1040C"""
    expected = u'\u013B\u20AC\u0308\U0001040C'
    actual = name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
    assert expected == actual
 def test_name2unicode_foo():
    """The name 'foo' has no mapping, because it is not in AGL and does not start with a 'u'; expect a KeyError"""
    with assert_raises(KeyError):
        name2unicode('foo')
 def test_name2unicode_notdef():
    """The name ".notdef" is reduced to an empty string (step 1), which has no mapping; expect a KeyError"""
    with assert_raises(KeyError):
        name2unicode('.notdef')
 def test_name2unicode_pua_ogoneksmall():
    """The name "Ogoneksmall" maps to the string that corresponds to U+F6FB, the same string as for "uniF6FB" """
    expected = u'\uF6FB'
    actual = name2unicode('Ogoneksmall')
    assert expected == actual
 def test_name2unicode_overflow_error():
    """A long name that consists only of decimal digits is not a glyph name and must raise a KeyError"""
    with assert_raises(KeyError):
        name2unicode('226215240241240240240240')