Merge pull request #263 from pietermarsman/261-glyph-list-specification
name2unicode() should follow the Adobe Glyph List Specificationpull/273/head^2
commit
42e2c8143b
|
@ -1,28 +1,67 @@
|
|||
|
||||
import logging
|
||||
import re
|
||||
from .psparser import PSLiteral
|
||||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
from .psparser import PSLiteral
|
||||
|
||||
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
## name2unicode
|
||||
##
|
||||
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is
    unknown. This way the caller must explicitly define what to do when there is not a match.

    Lowercase hexadecimal digits are accepted as well, although the specification only allows uppercase ones.

    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, optionally with a '.suffix' and with components separated by '_'
    :returns unicode character(s) if name resembles something
    :raises KeyError if the name (or one of its components) cannot be mapped
    """
    # Everything from the first period onwards is a suffix and is ignored.
    name = name.split('.')[0]
    # Each underscore-separated component is mapped separately and the results are concatenated.
    components = name.split('_')

    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Slice the prefix off. str.strip('uni') would remove any of the characters 'u', 'n' and 'i' from BOTH ends
        # of the name, so e.g. 'uni013Bu' would wrongly be accepted.
        hex_digits = name[3:]
        match = HEXADECIMAL.match(hex_digits)

        # match() only anchors at the start, so also require that the whole remainder was consumed. Otherwise
        # trailing non-hexadecimal characters would reach int() and raise a ValueError instead of a KeyError.
        if match and match.end() == len(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [int(hex_digits[i:i + 4], base=16) for i in range(0, len(hex_digits), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            characters = map(six.unichr, unicode_digits)
            return ''.join(characters)

    elif name.startswith('u'):
        hex_digits = name[1:]
        match = HEXADECIMAL.match(hex_digits)

        if match and match.end() == len(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            return six.unichr(unicode_digit)

    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
|
||||
|
||||
|
||||
def raise_key_error_for_invalid_unicode(unicode_digit):
    """Check that a number is a Unicode code point that can be converted to a character.

    Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16.
    Values below 0 or above 10FFFF are not Unicode code points at all; converting them to a character would fail
    with a ValueError, so they are rejected here with a KeyError as well.

    :param unicode_digit: integer code point to validate
    :raises KeyError if unicode digit is invalid
    """
    if 0xD800 <= unicode_digit <= 0xDFFF:
        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)

    if not 0 <= unicode_digit <= 0x10FFFF:
        raise KeyError('Unicode digit %d is invalid because it is outside the range 0 through 10FFFF' % unicode_digit)
|
||||
|
||||
|
||||
## EncodingDB
|
||||
##
|
||||
class EncodingDB(object):
|
||||
|
||||
std2unicode = {}
|
||||
|
@ -59,7 +98,7 @@ class EncodingDB(object):
|
|||
elif isinstance(x, PSLiteral):
|
||||
try:
|
||||
cid2unicode[cid] = name2unicode(x.name)
|
||||
except KeyError:
|
||||
pass
|
||||
except KeyError as e:
|
||||
log.debug(str(e))
|
||||
cid += 1
|
||||
return cid2unicode
|
||||
|
|
|
@ -1,34 +1,37 @@
|
|||
|
||||
import sys
|
||||
import logging
|
||||
import struct
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
from . import settings
|
||||
from .cmapdb import CMap
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMapParser
|
||||
from .cmapdb import FileUnicodeMap
|
||||
from .cmapdb import CMap
|
||||
from .encodingdb import EncodingDB
|
||||
from .encodingdb import name2unicode
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import PSEOF
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from . import settings
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import literal_name
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import num_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import stream_value
|
||||
from .fontmetrics import FONT_METRICS
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import num_value
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import stream_value
|
||||
from .psparser import KWD
|
||||
from .psparser import LIT
|
||||
from .psparser import PSEOF
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import literal_name
|
||||
from .utils import apply_matrix_norm
|
||||
from .utils import nunpack
|
||||
from .utils import choplist
|
||||
from .utils import isnumber
|
||||
from .utils import nunpack
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_widths(seq):
|
||||
|
@ -98,7 +101,6 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
KEYWORD_ARRAY = KWD(b'array')
|
||||
KEYWORD_READONLY = KWD(b'readonly')
|
||||
KEYWORD_FOR = KWD(b'for')
|
||||
KEYWORD_FOR = KWD(b'for')
|
||||
|
||||
def __init__(self, data):
|
||||
PSStackParser.__init__(self, data)
|
||||
|
@ -106,6 +108,17 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
return
|
||||
|
||||
def get_encoding(self):
|
||||
"""Parse the font encoding
|
||||
|
||||
The Type1 font encoding maps character codes to character names. These character names could either be standard
|
||||
Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
|
||||
sequence of operations that describe how the character should be drawn.
|
||||
Currently, this function returns '' (empty string) for character names that are associated with a CharString.
|
||||
|
||||
References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
|
||||
|
||||
:returns mapping of character identifiers (cid's) to unicode characters
|
||||
"""
|
||||
while 1:
|
||||
try:
|
||||
(cid, name) = self.nextobject()
|
||||
|
@ -113,8 +126,8 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
break
|
||||
try:
|
||||
self._cid2unicode[cid] = name2unicode(name)
|
||||
except KeyError:
|
||||
pass
|
||||
except KeyError as e:
|
||||
log.debug(str(e))
|
||||
return self._cid2unicode
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
"""
|
||||
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
|
||||
|
||||
While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
|
||||
added.
|
||||
"""
|
||||
from nose.tools import assert_raises
|
||||
|
||||
from pdfminer.encodingdb import name2unicode
|
||||
|
||||
|
||||
def test_name2unicode_name_in_agl():
    """A single-component name listed in the AGL ("Lcommaaccent") maps to its AGL value, U+013B."""
    converted = name2unicode('Lcommaaccent')
    assert converted == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni():
    """The "uni" form with four uppercase hexadecimal digits ("uni013B") maps to U+013B."""
    converted = name2unicode('uni013B')
    assert converted == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni_lowercase():
    """The "uni" form with lowercase hexadecimal digits ("uni013b") also maps to U+013B."""
    converted = name2unicode('uni013b')
    assert converted == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits():
    """The single component "uni20AC0308" holds two groups of four digits and maps to U+20AC U+0308."""
    converted = name2unicode('uni20AC0308')
    assert converted == u'\u20AC\u0308'
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """The lowercase variant "uni20ac0308" maps to the same string, U+20AC U+0308."""
    converted = name2unicode('uni20ac0308')
    assert converted == u'\u20AC\u0308'
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string():
    """The single component "uni20ac" is mapped to the euro sign.

    The specification maps this name to an empty string because the digits are lowercase, but lowercase
    hexadecimals are deliberately supported here.
    """
    converted = name2unicode('uni20ac')
    assert converted == u'\u20ac'
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" has a single component, which the specification maps to an empty string.

    Neither D801 nor DC0C is in the appropriate set. The "uni" form cannot be used for the character that UTF-16
    expresses as D801 DC0C, i.e. U+1040C; that character is mapped correctly by the glyph name "u1040C". Here
    the unmappable name is reported with a KeyError.
    """
    with assert_raises(KeyError):
        name2unicode('uniD801DC0C')
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long_lowercase():
    """The name "unid801dc0c" has a single component, which cannot be mapped to a character.

    Lowercase variant of the "uniD801DC0C" case: neither d801 nor dc0c is in the appropriate set, because both are
    UTF-16 surrogates. This form cannot be used to map to the character which is expressed as D801 DC0C in UTF-16,
    specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C".
    """
    # The name must really be lowercase; with the uppercase name this test only duplicates the uppercase one.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua():
    """The private-use name "uniF6FB" maps to U+F6FB, the same value as "Ogoneksmall"."""
    converted = name2unicode('uniF6FB')
    assert converted == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua_lowercase():
    """The lowercase private-use name "unif6fb" also maps to U+F6FB."""
    converted = name2unicode('unif6fb')
    assert converted == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits():
    """The "u" form with four uppercase hexadecimal digits ("u013B") maps to U+013B."""
    converted = name2unicode('u013B')
    assert converted == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits_lowercase():
    """The "u" form with four lowercase hexadecimal digits ("u013b") also maps to U+013B."""
    converted = name2unicode('u013b')
    assert converted == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits():
    """The single component "u1040C" uses five hexadecimal digits and maps to U+1040C."""
    converted = name2unicode('u1040C')
    assert converted == u'\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits_lowercase():
    """The lowercase variant "u1040c" also maps to U+1040C."""
    converted = name2unicode('u1040c')
    assert converted == u'\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components():
    """The suffix ".alternate" is dropped and each "_"-separated component is mapped in turn.

    "Lcommaaccent_uni20AC0308_u1040C.alternate" maps to the string U+013B U+20AC U+0308 U+1040C.
    """
    converted = name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
    assert converted == u'\u013B\u20AC\u0308\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components_lowercase():
    """The multi-component name with lowercase hexadecimal digits maps to the same string.

    "Lcommaaccent_uni20ac0308_u1040c.alternate" maps to the string U+013B U+20AC U+0308 U+1040C.
    """
    converted = name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
    assert converted == u'\u013B\u20AC\u0308\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_foo():
    """The name "foo" is not in the AGL and does not start with a "u", so it cannot be mapped.

    The specification maps it to an empty string; here a KeyError is expected instead.
    """
    with assert_raises(KeyError):
        name2unicode('foo')
|
||||
|
||||
|
||||
def test_name2unicode_notdef():
    """The name ".notdef" is reduced to an empty string once its suffix is dropped, which cannot be mapped.

    The specification maps it to an empty string; here a KeyError is expected instead.
    """
    with assert_raises(KeyError):
        name2unicode('.notdef')
|
||||
|
||||
|
||||
def test_name2unicode_pua_ogoneksmall():
    """The AGL name "Ogoneksmall" maps to the private-use value U+F6FB, the same value as "uniF6FB"."""
    converted = name2unicode('Ogoneksmall')
    assert converted == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_overflow_error():
    """A long all-digit name must be reported with a KeyError rather than any other error."""
    with assert_raises(KeyError):
        name2unicode('226215240241240240240240')
|
Loading…
Reference in New Issue