From ec5218a05f4d0e75e88079a19da75982573b5426 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Wed, 10 Jul 2019 20:24:30 +0200
Subject: [PATCH 01/10] Add some (failing) unittests for name2unicode based on
 the examples in the Adobe Glyph List Specification

---
 tests/test_encodingdb.py | 69 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 tests/test_encodingdb.py

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
new file mode 100644
index 0000000..c3f8bf0
--- /dev/null
+++ b/tests/test_encodingdb.py
@@ -0,0 +1,69 @@
+"""
+Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
+"""
+from pdfminer.encodingdb import name2unicode
+
+
+def test_name2unicode_name_in_agl():
+    """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL"""
+    assert u'\u013B' == name2unicode('Lcommaaccent')
+
+
+def test_name2unicode_uni():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('uni013B')
+
+
+def test_name2unicode_uni_with_sequence_of_digits():
+    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
+    assert u'\u20AC\u0308' == name2unicode('uni20AC0308')
+
+
+def test_name2unicode_uni_empty_string():
+    """The name "uni20ac" has a single component, which is mapped to an empty string"""
+    assert u'' == name2unicode('uni20ac')
+
+
+def test_name2unicode_uni_empty_string_long():
+    """The name "uniD801DC0C" has a single component, which is mapped to an empty string
+
+    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
+    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
+    glyph name "u1040C.
+    """
+    assert u'' == name2unicode('uniD801DC0C')
+
+
+def test_name2unicode_uni_pua():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('uniF6FB')
+
+
+def test_name2unicode_u_with_4_digits():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('u013B')
+
+
+def test_name2unicode_u_with_5_digits():
+    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
+    assert u'\U0001040C' == name2unicode('u1040C')
+
+
+def test_name2unicode_multiple_components():
+    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
+    assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
+
+
+def test_name2unicode_foo():
+    """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
+    assert u'' == name2unicode('foo')
+
+
+def test_name2unicode_notdef():
+    """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
+    assert u'' == name2unicode('.notdef')
+
+
+def test_name2unicode_pua_ogoneksmall():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('Ogoneksmall')

From 5d7ac7e88a0df5a445318bf6d7b2d924041b204b Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Wed, 10 Jul 2019 20:44:23 +0200
Subject: [PATCH 02/10] Added test for overflow error reported by @jtlz2:
 https://github.com/pdfminer/pdfminer.six/issues/177#issuecomment-510173228

---
 tests/test_encodingdb.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
index c3f8bf0..2fac375 100644
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@@ -67,3 +67,7 @@ def test_name2unicode_notdef():
 def test_name2unicode_pua_ogoneksmall():
     """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
     assert u'\uF6FB' == name2unicode('Ogoneksmall')
+
+
+def test_name2unicode_overflow_error():
+    name2unicode('226215240241240240240240')

From f0392f804971e1d1f1de8cf66f70dfb09a373241 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:16:42 +0200
Subject: [PATCH 03/10] Change implementation of name2unicode such that it
 follows the Adobe Glyph specs (while also allowing lowercase hexadecimals)

---
 pdfminer/encodingdb.py | 57 ++++++++++++++++++++++++++++++------------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index 870bd28..aa00005 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -1,28 +1,53 @@
 
 import re
-from .psparser import PSLiteral
+
+import six  # Python 2+3 compatibility
+
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING
+from .psparser import PSLiteral
 
-import six # Python 2+3 compatibility
-
-STRIP_NAME = re.compile(r'[0-9]+')
+HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
 
 
-##  name2unicode
-##
-def name2unicode(name):
-    """Converts Adobe glyph names to Unicode numbers."""
-    if name in glyphname2unicode:
-        return glyphname2unicode[name]
-    m = STRIP_NAME.search(name)
-    if not m:
-        raise KeyError(name)
-    return six.unichr(int(m.group(0)))
+def name2unicode(name: str):
+    """Converts Adobe glyph names to Unicode numbers.
+
+    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
+
+    :returns unicode character if name resembles something, empty string if not
+    """
+    full_stop = u'\u002E'
+    name = name.split(full_stop)[0]
+    components = name.split('_')
+
+    if len(components) > 1:
+        return ''.join(map(name2unicode, components))
+
+    else:
+        if name in glyphname2unicode:
+            return glyphname2unicode.get(name)
+
+        elif name.startswith('uni'):
+            name_without_uni = name.strip('uni')
+            if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
+                unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
+                if any([55295 < digit < 57344 for digit in unicode_digits]):
+                    return ''
+                characters = map(six.unichr, unicode_digits)
+                return ''.join(characters)
+
+        elif name.startswith('u'):
+            name_without_u = name.strip('u')
+            if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
+                unicode_digit = int(name_without_u, base=16)
+                if 55295 < unicode_digit < 57344:
+                    return ''
+                return six.unichr(unicode_digit)
+
+    return ''
 
 
-##  EncodingDB
-##
 class EncodingDB(object):
 
     std2unicode = {}

From 33cc9861ae06d44ef2d7173a6781197749bff26c Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:19:17 +0200
Subject: [PATCH 04/10] Add docstring to Type1FontHeaderParser.get_encoding()
 that describes that the custom CharStrings of the font are mapped to ''

---
 pdfminer/pdffont.py | 51 +++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index feb8557..5d7eaf1 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -1,34 +1,35 @@
 
-import sys
 import struct
+import sys
 from io import BytesIO
+
+import six  # Python 2+3 compatibility
+
+from . import settings
+from .cmapdb import CMap
 from .cmapdb import CMapDB
 from .cmapdb import CMapParser
 from .cmapdb import FileUnicodeMap
-from .cmapdb import CMap
 from .encodingdb import EncodingDB
 from .encodingdb import name2unicode
-from .psparser import PSStackParser
-from .psparser import PSEOF
-from .psparser import LIT
-from .psparser import KWD
-from . import settings
-from .psparser import PSLiteral
-from .psparser import literal_name
-from .pdftypes import PDFException
-from .pdftypes import resolve1
-from .pdftypes import int_value
-from .pdftypes import num_value
-from .pdftypes import list_value
-from .pdftypes import dict_value
-from .pdftypes import stream_value
 from .fontmetrics import FONT_METRICS
+from .pdftypes import PDFException
+from .pdftypes import dict_value
+from .pdftypes import int_value
+from .pdftypes import list_value
+from .pdftypes import num_value
+from .pdftypes import resolve1
+from .pdftypes import stream_value
+from .psparser import KWD
+from .psparser import LIT
+from .psparser import PSEOF
+from .psparser import PSLiteral
+from .psparser import PSStackParser
+from .psparser import literal_name
 from .utils import apply_matrix_norm
-from .utils import nunpack
 from .utils import choplist
 from .utils import isnumber
-
-import six #Python 2+3 compatibility
+from .utils import nunpack
 
 
 def get_widths(seq):
@@ -98,7 +99,6 @@ class Type1FontHeaderParser(PSStackParser):
     KEYWORD_ARRAY = KWD(b'array')
     KEYWORD_READONLY = KWD(b'readonly')
     KEYWORD_FOR = KWD(b'for')
-    KEYWORD_FOR = KWD(b'for')
 
     def __init__(self, data):
         PSStackParser.__init__(self, data)
@@ -106,6 +106,17 @@ class Type1FontHeaderParser(PSStackParser):
         return
 
     def get_encoding(self):
+        """Parse the font encoding
+
+        The Type1 font encoding maps character codes to character names. These character names could either be standard
+        Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
+        sequence of operations that describe how the character should be drawn.
+        Currently, this function returns '' (empty string) for character names that are associated with a CharString.
+
+        References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
+
+        :returns mapping of character identifiers (cid's) to unicode characters
+        """
         while 1:
             try:
                 (cid, name) = self.nextobject()

From fdb7e5486287e008cb2e71d0d16ef21863954b68 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:20:25 +0200
Subject: [PATCH 05/10] Add lowercase adobe glyph name tests

---
 tests/test_encodingdb.py | 52 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
index 2fac375..ac10d54 100644
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@@ -1,5 +1,8 @@
 """
 Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
+
+While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
+added.
 """
 from pdfminer.encodingdb import name2unicode
 
@@ -14,14 +17,28 @@ def test_name2unicode_uni():
     assert u'\u013B' == name2unicode('uni013B')
 
 
+def test_name2unicode_uni_lowercase():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('uni013b')
+
+
 def test_name2unicode_uni_with_sequence_of_digits():
     """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
     assert u'\u20AC\u0308' == name2unicode('uni20AC0308')
 
 
+def test_name2unicode_uni_with_sequence_of_digits_lowercase():
+    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
+    assert u'\u20AC\u0308' == name2unicode('uni20ac0308')
+
+
 def test_name2unicode_uni_empty_string():
-    """The name "uni20ac" has a single component, which is mapped to an empty string"""
-    assert u'' == name2unicode('uni20ac')
+    """The name "uni20ac" has a single component, which is mapped to a €.
+
+    According to the specification this should be mapped to an empty string, but we also want to support lowercase
+    hexadecimals
+    """
+    assert u'€' == name2unicode('uni20ac')
 
 
 def test_name2unicode_uni_empty_string_long():
@@ -34,24 +51,53 @@ def test_name2unicode_uni_empty_string_long():
     assert u'' == name2unicode('uniD801DC0C')
 
 
+def test_name2unicode_uni_empty_string_long_lowercase():
+    """The name "uniD801DC0C" has a single component, which is mapped to an empty string
+
+    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
+    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
+    glyph name "u1040C."""
+    assert u'' == name2unicode('uniD801DC0C')
+
+
 def test_name2unicode_uni_pua():
     """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
     assert u'\uF6FB' == name2unicode('uniF6FB')
 
 
+def test_name2unicode_uni_pua_lowercase():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('unif6fb')
+
+
 def test_name2unicode_u_with_4_digits():
     """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
     assert u'\u013B' == name2unicode('u013B')
 
 
+def test_name2unicode_u_with_4_digits_lowercase():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('u013b')
+
+
 def test_name2unicode_u_with_5_digits():
     """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
     assert u'\U0001040C' == name2unicode('u1040C')
 
 
+def test_name2unicode_u_with_5_digits_lowercase():
+    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
+    assert u'\U0001040C' == name2unicode('u1040c')
+
+
 def test_name2unicode_multiple_components():
     """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
-    assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
+    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
+
+
+def test_name2unicode_multiple_components_lowercase():
+    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
+    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
 
 
 def test_name2unicode_foo():

From c597e95a9f828b6d6f18566a44d8706bdbc6744b Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:37:15 +0200
Subject: [PATCH 06/10] Use KeyError to signal that the name does not resemble
 any unicode, this pattern is also used in the rest of pdfminer.six

---
 pdfminer/encodingdb.py   | 13 ++++++++-----
 tests/test_encodingdb.py | 12 +++++++-----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index aa00005..5dcd8f2 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -10,12 +10,15 @@ from .psparser import PSLiteral
 HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
 
 
-def name2unicode(name: str):
+def name2unicode(name):
     """Converts Adobe glyph names to Unicode numbers.
 
+    Unlike the specification, this raises a KeyError instead of returning an empty string when the key is unknown.
+    This way the caller must explicitly define what to do when there is not a match.
+
     Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
 
-    :returns unicode character if name resembles something, empty string if not
+    :returns unicode character if name resembles something, otherwise a KeyError
     """
     full_stop = u'\u002E'
     name = name.split(full_stop)[0]
@@ -33,7 +36,7 @@ def name2unicode(name: str):
             if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
                 unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
                 if any([55295 < digit < 57344 for digit in unicode_digits]):
-                    return ''
+                    raise KeyError
                 characters = map(six.unichr, unicode_digits)
                 return ''.join(characters)
 
@@ -42,10 +45,10 @@ def name2unicode(name: str):
             if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
                 unicode_digit = int(name_without_u, base=16)
                 if 55295 < unicode_digit < 57344:
-                    return ''
+                    raise KeyError
                 return six.unichr(unicode_digit)
 
-    return ''
+    raise KeyError
 
 
 class EncodingDB(object):
diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
index ac10d54..82c0282 100644
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@@ -4,6 +4,8 @@ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type
 While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
 added.
 """
+from nose.tools import assert_raises
+
 from pdfminer.encodingdb import name2unicode
 
 
@@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long():
     expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
     glyph name "u1040C.
     """
-    assert u'' == name2unicode('uniD801DC0C')
+    assert_raises(KeyError, name2unicode, 'uniD801DC0C')
 
 
 def test_name2unicode_uni_empty_string_long_lowercase():
@@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase():
     Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
     expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
     glyph name "u1040C."""
-    assert u'' == name2unicode('uniD801DC0C')
+    assert_raises(KeyError, name2unicode, 'unid801dc0c')
 
 
 def test_name2unicode_uni_pua():
@@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase():
 
 def test_name2unicode_foo():
     """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
-    assert u'' == name2unicode('foo')
+    assert_raises(KeyError, name2unicode, 'foo')
 
 
 def test_name2unicode_notdef():
     """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
-    assert u'' == name2unicode('.notdef')
+    assert_raises(KeyError, name2unicode, '.notdef')
 
 
 def test_name2unicode_pua_ogoneksmall():
@@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall():
 
 
 def test_name2unicode_overflow_error():
-    name2unicode('226215240241240240240240')
+    assert_raises(KeyError, name2unicode, '226215240241240240240240')

From 1e24bfa0bd1ef332e30ffd57b2328ecacc0ff6c4 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:40:22 +0200
Subject: [PATCH 07/10] Fix error, python2 cannot handle non-ASCII characters
 in a .py file without an encoding declaration (docstring)

---
 tests/test_encodingdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
index 82c0282..bfd2a87 100644
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@@ -35,7 +35,7 @@ def test_name2unicode_uni_with_sequence_of_digits_lowercase():
 
 
 def test_name2unicode_uni_empty_string():
-    """The name "uni20ac" has a single component, which is mapped to a €.
+    """The name "uni20ac" has a single component, which is mapped to a euro-sign.
 
     According to the specification this should be mapped to an empty string, but we also want to support lowercase
     hexadecimals

From 2bb850cdaee9135fcf50770211b6817904950b5b Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Sun, 14 Jul 2019 15:43:07 +0200
Subject: [PATCH 08/10] Fix error, python2 cannot handle non-ASCII characters
 in a .py file without an encoding declaration (assertion)

---
 tests/test_encodingdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
index bfd2a87..36e4b11 100644
--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@@ -40,7 +40,7 @@ def test_name2unicode_uni_empty_string():
     According to the specification this should be mapped to an empty string, but we also want to support lowercase
     hexadecimals
     """
-    assert u'€' == name2unicode('uni20ac')
+    assert u'\u20ac' == name2unicode('uni20ac')
 
 
 def test_name2unicode_uni_empty_string_long():

From 0fb83366b61af6c9cf5ff32164075d9d355cbbe8 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Tue, 16 Jul 2019 08:49:57 +0200
Subject: [PATCH 09/10] Remove intermediate variable `full_stop` because it is
 just a dot

---
 pdfminer/encodingdb.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index 5dcd8f2..dea23a1 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -1,4 +1,4 @@
-
+import logging
 import re
 
 import six  # Python 2+3 compatibility
@@ -20,8 +20,7 @@ def name2unicode(name):
 
     :returns unicode character if name resembles something, otherwise a KeyError
     """
-    full_stop = u'\u002E'
-    name = name.split(full_stop)[0]
+    name = name.split('.')[0]
     components = name.split('_')
 
     if len(components) > 1:

From 6f362f53feefc81224d740a011fac69ea9707180 Mon Sep 17 00:00:00 2001
From: Pieter Marsman <pietermarsman@gmail.com>
Date: Tue, 16 Jul 2019 08:52:24 +0200
Subject: [PATCH 10/10] Raise a `KeyError` with a useful message if
 `name2unicode()` does not match any glyph name. Use this message to log debug
 statements.

---
 pdfminer/encodingdb.py | 32 ++++++++++++++++++++++----------
 pdfminer/pdffont.py    |  8 +++++---
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index dea23a1..7100235 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -9,6 +9,8 @@ from .psparser import PSLiteral
 
-HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
+HEXADECIMAL = re.compile(r'[0-9a-fA-F]+\Z')  # \Z: the whole string must be hexadecimal, not only a prefix
 
+log = logging.getLogger(__name__)
+
 
 def name2unicode(name):
     """Converts Adobe glyph names to Unicode numbers.
@@ -32,22 +34,32 @@ def name2unicode(name):
 
         elif name.startswith('uni'):
-            name_without_uni = name.strip('uni')
+            # Slice off the prefix; str.strip('uni') also removes leading/trailing 'u', 'n' and 'i' characters
+            name_without_uni = name[len('uni'):]
             if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
                 unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
-                if any([55295 < digit < 57344 for digit in unicode_digits]):
-                    raise KeyError
+                for digit in unicode_digits:
+                    raise_key_error_for_invalid_unicode(digit)
                 characters = map(six.unichr, unicode_digits)
                 return ''.join(characters)
 
         elif name.startswith('u'):
-            name_without_u = name.strip('u')
+            # Slice off the prefix; str.strip('u') would also remove repeated leading and trailing 'u' characters
+            name_without_u = name[len('u'):]
             if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
                 unicode_digit = int(name_without_u, base=16)
-                if 55295 < unicode_digit < 57344:
-                    raise KeyError
+                raise_key_error_for_invalid_unicode(unicode_digit)
                 return six.unichr(unicode_digit)
 
-    raise KeyError
+    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
+
+
+def raise_key_error_for_invalid_unicode(unicode_digit):
+    """Unicode values must not be UTF-16 surrogates (D800 through DFFF) nor exceed 10FFFF, the last code point
+
+    :raises KeyError if unicode digit is invalid
+    """
+    if 0xD800 <= unicode_digit <= 0xDFFF or unicode_digit > 0x10FFFF:
+        raise KeyError('Unicode digit %d is not a valid code point (surrogate or above 10FFFF)' % unicode_digit)
 
 
 class EncodingDB(object):
@@ -86,7 +98,7 @@ class EncodingDB(object):
                 elif isinstance(x, PSLiteral):
                     try:
                         cid2unicode[cid] = name2unicode(x.name)
-                    except KeyError:
-                        pass
+                    except KeyError as e:
+                        log.debug(str(e))
                     cid += 1
         return cid2unicode
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index 5d7eaf1..1a7603d 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -1,4 +1,4 @@
-
+import logging
 import struct
 import sys
 from io import BytesIO
@@ -31,6 +31,8 @@ from .utils import choplist
 from .utils import isnumber
 from .utils import nunpack
 
+log = logging.getLogger(__name__)
+
 
 def get_widths(seq):
     widths = {}
@@ -124,8 +126,8 @@ class Type1FontHeaderParser(PSStackParser):
                 break
             try:
                 self._cid2unicode[cid] = name2unicode(name)
-            except KeyError:
-                pass
+            except KeyError as e:
+                log.debug(str(e))
         return self._cid2unicode
 
     def do_keyword(self, pos, token):