Merge pull request #230 from 0xabu/unicode_glyph_bug

name2unicode(): handle hexadecimal literals for Unicode glyphs in text extraction. Unicode literals are hex, not decimal (see https://github.com/adobe-type-tools/agl-specification). We are still far from conformance with the full spec, but this handles more PDFs seen in the wild.
2019-07-09 12:35:39 -07:00 · 2019-07-09 12:35:39 -07:00 · 6b312edd6a
parent b6a5848208 c4c0a36e4f
commit 6b312edd6a
1 changed files with 2 additions and 2 deletions
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -6,7 +6,7 @@ from .latin_enc import ENCODING
 import six # Python 2+3 compatibility
-STRIP_NAME = re.compile(r'[0-9]+')
+STRIP_NAME = re.compile(r'[0-9A-Fa-f]+')
 ##  name2unicode
@@ -18,7 +18,7 @@ def name2unicode(name):
    m = STRIP_NAME.search(name)
    if not m:
        raise KeyError(name)
-    return six.unichr(int(m.group(0)))
+    return six.unichr(int(m.group(0), base=16))
 ##  EncodingDB