Merge pull request #230 from 0xabu/unicode_glyph_bug
name2unicode(): handle hexadecimal literals for Unicode glyphs in text extraction. Unicode literals are hex, not decimal (refer to: https://github.com/adobe-type-tools/agl-specification). We are still far from conformance with the full spec, but this handles more PDFs seen in the wild. (pull/262/head)
commit
6b312edd6a
|
@@ -6,7 +6,7 @@ from .latin_enc import ENCODING
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
STRIP_NAME = re.compile(r'[0-9A-Fa-f]+')
|
||||||
|
|
||||||
|
|
||||||
## name2unicode
|
## name2unicode
|
||||||
|
@@ -18,7 +18,7 @@ def name2unicode(name):
|
||||||
m = STRIP_NAME.search(name)
|
m = STRIP_NAME.search(name)
|
||||||
if not m:
|
if not m:
|
||||||
raise KeyError(name)
|
raise KeyError(name)
|
||||||
return six.unichr(int(m.group(0)))
|
return six.unichr(int(m.group(0), base=16))
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
## EncodingDB
|
||||||
|
|
Loading…
Reference in New Issue