import logging
import re

import six  # Python 2+3 compatibility

from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
from .psparser import PSLiteral

HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')

log = logging.getLogger(__name__)


def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is
    unknown. This way the caller must explicitly define what to do when there is not a match.

    The name is cut at the first '.', then split on '_' into components; every component is converted on its own
    and the results are concatenated. A single component is looked up in the glyph list first, then interpreted as
    'uni' followed by groups of four hexadecimal digits, then as 'u' followed by four to six hexadecimal digits.
    Lowercase hexadecimal digits are accepted as well.

    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name to convert
    :returns unicode string that the glyph name stands for
    :raises KeyError if the name (or one of its components) cannot be converted
    """
    name = name.split('.')[0]
    components = name.split('_')

    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Slice the prefix off; str.strip('uni') would remove any of the characters u/n/i from both ends.
        hex_digits = name[len('uni'):]

        if _is_hexadecimal(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [int(hex_digits[i:i + 4], base=16) for i in range(0, len(hex_digits), 4)]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return ''.join(map(six.unichr, code_points))

    elif name.startswith('u'):
        hex_digits = name[len('u'):]

        if _is_hexadecimal(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            return six.unichr(code_point)

    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)


def _is_hexadecimal(text):
    """Tell whether the whole of text consists of one or more hexadecimal digits.

    HEXADECIMAL.match only anchors at the start, so the end of the match is compared with the length of the text
    to reject trailing non-hexadecimal characters.
    """
    match = HEXADECIMAL.match(text)
    return match is not None and match.end() == len(text)


def raise_key_error_for_invalid_unicode(unicode_digit):
    """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16

    Values above 10FFFF are rejected as well, because they cannot be converted to a character.

    :param unicode_digit: integer code point to validate
    :raises KeyError if unicode digit is invalid
    """
    if 0xD800 <= unicode_digit <= 0xDFFF:
        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
    if unicode_digit > 0x10FFFF:
        raise KeyError('Unicode digit %d is invalid because it is larger than 10FFFF' % unicode_digit)
- -import sys +import logging import struct +import sys from io import BytesIO + +import six # Python 2+3 compatibility + +from . import settings +from .cmapdb import CMap from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap -from .cmapdb import CMap from .encodingdb import EncodingDB from .encodingdb import name2unicode -from .psparser import PSStackParser -from .psparser import PSEOF -from .psparser import LIT -from .psparser import KWD -from . import settings -from .psparser import PSLiteral -from .psparser import literal_name -from .pdftypes import PDFException -from .pdftypes import resolve1, resolve_all -from .pdftypes import int_value -from .pdftypes import num_value -from .pdftypes import list_value -from .pdftypes import dict_value -from .pdftypes import stream_value from .fontmetrics import FONT_METRICS +from .pdftypes import PDFException +from .pdftypes import dict_value +from .pdftypes import int_value +from .pdftypes import list_value +from .pdftypes import num_value +from .pdftypes import resolve1, resolve_all +from .pdftypes import stream_value +from .psparser import KWD +from .psparser import LIT +from .psparser import PSEOF +from .psparser import PSLiteral +from .psparser import PSStackParser +from .psparser import literal_name from .utils import apply_matrix_norm -from .utils import nunpack from .utils import choplist from .utils import isnumber +from .utils import nunpack -import six #Python 2+3 compatibility +log = logging.getLogger(__name__) def get_widths(seq): @@ -98,7 +101,6 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_ARRAY = KWD(b'array') KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - KEYWORD_FOR = KWD(b'for') def __init__(self, data): PSStackParser.__init__(self, data) @@ -106,6 +108,17 @@ class Type1FontHeaderParser(PSStackParser): return def get_encoding(self): + """Parse the font encoding + + The Type1 font encoding maps character codes to character names. 
These character names could either be standard + Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a + sequence of operations that describe how the character should be drawn. + Currently, this function returns '' (empty string) for character names that are associated with a CharStrings. + + References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf + + :returns mapping of character identifiers (cid's) to unicode characters + """ while 1: try: (cid, name) = self.nextobject() @@ -113,8 +126,8 @@ class Type1FontHeaderParser(PSStackParser): break try: self._cid2unicode[cid] = name2unicode(name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos, token): diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..36e4b11 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,121 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) + +While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are +added. 
from nose.tools import assert_raises

from pdfminer.encodingdb import name2unicode


def test_name2unicode_name_in_agl():
    """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL"""
    assert u'\u013B' == name2unicode('Lcommaaccent')


def test_name2unicode_uni():
    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
    assert u'\u013B' == name2unicode('uni013B')


def test_name2unicode_uni_lowercase():
    """Lowercase variant: the component "uni013b" maps to the string U+013B"""
    assert u'\u013B' == name2unicode('uni013b')


def test_name2unicode_uni_with_sequence_of_digits():
    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
    assert u'\u20AC\u0308' == name2unicode('uni20AC0308')


def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Lowercase variant: the name "uni20ac0308" is mapped to the string U+20AC U+0308"""
    assert u'\u20AC\u0308' == name2unicode('uni20ac0308')


def test_name2unicode_uni_empty_string():
    """The name "uni20ac" has a single component, which is mapped to a euro-sign.

    According to the specification this should be mapped to an empty string, but we also want to support lowercase
    hexadecimals
    """
    assert u'\u20ac' == name2unicode('uni20ac')


def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" has a single component, which is mapped to an empty string

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C". The test expects a KeyError for this name.
    """
    assert_raises(KeyError, name2unicode, 'uniD801DC0C')


def test_name2unicode_uni_empty_string_long_lowercase():
    """Lowercase variant: the name "unid801dc0c" has a single component, which is mapped to an empty string

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C". The test expects a KeyError for this name.
    """
    assert_raises(KeyError, name2unicode, 'unid801dc0c')


def test_name2unicode_uni_pua():
    """ "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('uniF6FB')


def test_name2unicode_uni_pua_lowercase():
    """Lowercase variant: "unif6fb" maps to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('unif6fb')


def test_name2unicode_u_with_4_digits():
    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
    assert u'\u013B' == name2unicode('u013B')


def test_name2unicode_u_with_4_digits_lowercase():
    """Lowercase variant: the component "u013b" maps to the string U+013B"""
    assert u'\u013B' == name2unicode('u013b')


def test_name2unicode_u_with_5_digits():
    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
    assert u'\U0001040C' == name2unicode('u1040C')


def test_name2unicode_u_with_5_digits_lowercase():
    """Lowercase variant: the name "u1040c" is mapped to the string U+1040C"""
    assert u'\U0001040C' == name2unicode('u1040c')


def test_name2unicode_multiple_components():
    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')


def test_name2unicode_multiple_components_lowercase():
    """Lowercase variant: "Lcommaaccent_uni20ac0308_u1040c.alternate" is mapped to U+013B U+20AC U+0308 U+1040C"""
    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')


def test_name2unicode_foo():
    """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'

    The test expects a KeyError for this name.
    """
    assert_raises(KeyError, name2unicode, 'foo')


def test_name2unicode_notdef():
    """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)

    The test expects a KeyError for this name.
    """
    assert_raises(KeyError, name2unicode, '.notdef')


def test_name2unicode_pua_ogoneksmall():
    """ "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('Ogoneksmall')


def test_name2unicode_overflow_error():
    """A name that is only a long run of decimal digits must raise a KeyError, not any other error"""
    assert_raises(KeyError, name2unicode, '226215240241240240240240')