From ec5218a05f4d0e75e88079a19da75982573b5426 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:24:30 +0200 Subject: [PATCH 01/10] Add some (failing) unittests for name2unicode based on the examples in the Adobe Glyph List Specification --- tests/test_encodingdb.py | 69 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/test_encodingdb.py diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..c3f8bf0 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,69 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) +""" +from pdfminer.encodingdb import name2unicode + + +def test_name2unicode_name_in_agl(): + """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL""" + assert u'\u013B' == name2unicode('Lcommaaccent') + + +def test_name2unicode_uni(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013B') + + +def test_name2unicode_uni_with_sequence_of_digits(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20AC0308') + + +def test_name2unicode_uni_empty_string(): + """The name "uni20ac" has a single component, which is mapped to an empty string""" + assert u'' == name2unicode('uni20ac') + + +def test_name2unicode_uni_empty_string_long(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the + glyph name "u1040C. 
+ """ + assert u'' == name2unicode('uniD801DC0C') + + +def test_name2unicode_uni_pua(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('uniF6FB') + + +def test_name2unicode_u_with_4_digits(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013B') + + +def test_name2unicode_u_with_5_digits(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040C') + + +def test_name2unicode_multiple_components(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_foo(): + """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" + assert u'' == name2unicode('foo') + + +def test_name2unicode_notdef(): + """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" + assert u'' == name2unicode('.notdef') + + +def test_name2unicode_pua_ogoneksmall(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('Ogoneksmall') From 5d7ac7e88a0df5a445318bf6d7b2d924041b204b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:44:23 +0200 Subject: [PATCH 02/10] Added test for overflow error reported by @jtlz2: https://github.com/pdfminer/pdfminer.six/issues/177#issuecomment-510173228_ --- tests/test_encodingdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index c3f8bf0..2fac375 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -67,3 +67,7 @@ def test_name2unicode_notdef(): def test_name2unicode_pua_ogoneksmall(): 
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('Ogoneksmall') + + +def test_name2unicode_overflow_error(): + name2unicode('226215240241240240240240') From f0392f804971e1d1f1de8cf66f70dfb09a373241 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:16:42 +0200 Subject: [PATCH 03/10] Change implementation of name2unicode such that it follows the Adobe Glyph specs (with allowing lowercase) --- pdfminer/encodingdb.py | 57 ++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 870bd28..aa00005 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,28 +1,53 @@ import re -from .psparser import PSLiteral + +import six # Python 2+3 compatibility + from .glyphlist import glyphname2unicode from .latin_enc import ENCODING +from .psparser import PSLiteral -import six # Python 2+3 compatibility - -STRIP_NAME = re.compile(r'[0-9]+') +HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -## name2unicode -## -def name2unicode(name): - """Converts Adobe glyph names to Unicode numbers.""" - if name in glyphname2unicode: - return glyphname2unicode[name] - m = STRIP_NAME.search(name) - if not m: - raise KeyError(name) - return six.unichr(int(m.group(0))) +def name2unicode(name: str): + """Converts Adobe glyph names to Unicode numbers. 
+ + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping + + :returns unicode character if name resembles something, empty string if not + """ + full_stop = u'\u002E' + name = name.split(full_stop)[0] + components = name.split('_') + + if len(components) > 1: + return ''.join(map(name2unicode, components)) + + else: + if name in glyphname2unicode: + return glyphname2unicode.get(name) + + elif name.startswith('uni'): + name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: + unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] + if any([55295 < digit < 57344 for digit in unicode_digits]): + return '' + characters = map(six.unichr, unicode_digits) + return ''.join(characters) + + elif name.startswith('u'): + name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: + unicode_digit = int(name_without_u, base=16) + if 55295 < unicode_digit < 57344: + return '' + return six.unichr(unicode_digit) + + return '' -## EncodingDB -## class EncodingDB(object): std2unicode = {} From 33cc9861ae06d44ef2d7173a6781197749bff26c Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:19:17 +0200 Subject: [PATCH 04/10] Add docstring to Type1FontHeaderParser.get_encoding() that describes that the custom CharStrings of the font are mapped to '' --- pdfminer/pdffont.py | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..5d7eaf1 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,34 +1,35 @@ -import sys import struct +import sys from io import BytesIO + +import six # Python 2+3 compatibility + +from . 
import settings +from .cmapdb import CMap from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap -from .cmapdb import CMap from .encodingdb import EncodingDB from .encodingdb import name2unicode -from .psparser import PSStackParser -from .psparser import PSEOF -from .psparser import LIT -from .psparser import KWD -from . import settings -from .psparser import PSLiteral -from .psparser import literal_name -from .pdftypes import PDFException -from .pdftypes import resolve1 -from .pdftypes import int_value -from .pdftypes import num_value -from .pdftypes import list_value -from .pdftypes import dict_value -from .pdftypes import stream_value from .fontmetrics import FONT_METRICS +from .pdftypes import PDFException +from .pdftypes import dict_value +from .pdftypes import int_value +from .pdftypes import list_value +from .pdftypes import num_value +from .pdftypes import resolve1 +from .pdftypes import stream_value +from .psparser import KWD +from .psparser import LIT +from .psparser import PSEOF +from .psparser import PSLiteral +from .psparser import PSStackParser +from .psparser import literal_name from .utils import apply_matrix_norm -from .utils import nunpack from .utils import choplist from .utils import isnumber - -import six #Python 2+3 compatibility +from .utils import nunpack def get_widths(seq): @@ -98,7 +99,6 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_ARRAY = KWD(b'array') KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - KEYWORD_FOR = KWD(b'for') def __init__(self, data): PSStackParser.__init__(self, data) @@ -106,6 +106,17 @@ class Type1FontHeaderParser(PSStackParser): return def get_encoding(self): + """Parse the font encoding + + The Type1 font encoding maps character codes to character names. These character names could either be standard + Adobe glyph names, or character names associated with custom CharStrings for this font. 
A CharString is a + sequence of operations that describe how the character should be drawn. + Currently, this function returns '' (empty string) for character names that are associated with a CharStrings. + + References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf + + :returns mapping of character identifiers (cid's) to unicode characters + """ while 1: try: (cid, name) = self.nextobject() From fdb7e5486287e008cb2e71d0d16ef21863954b68 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:20:25 +0200 Subject: [PATCH 05/10] Add lowercase adobe glyph name tests --- tests/test_encodingdb.py | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 2fac375..ac10d54 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -1,5 +1,8 @@ """ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) + +While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are +added. 
""" from pdfminer.encodingdb import name2unicode @@ -14,14 +17,28 @@ def test_name2unicode_uni(): assert u'\u013B' == name2unicode('uni013B') +def test_name2unicode_uni_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013b') + + def test_name2unicode_uni_with_sequence_of_digits(): """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" assert u'\u20AC\u0308' == name2unicode('uni20AC0308') +def test_name2unicode_uni_with_sequence_of_digits_lowercase(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20ac0308') + + def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to an empty string""" - assert u'' == name2unicode('uni20ac') + """The name "uni20ac" has a single component, which is mapped to a €. + + According to the specification this should be mapped to an empty string, but we also want to support lowercase + hexadecimals + """ + assert u'€' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): @@ -34,24 +51,53 @@ def test_name2unicode_uni_empty_string_long(): assert u'' == name2unicode('uniD801DC0C') +def test_name2unicode_uni_empty_string_long_lowercase(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. 
This character can be correctly mapped by using the + glyph name "u1040C.""" + assert u'' == name2unicode('uniD801DC0C') + + def test_name2unicode_uni_pua(): """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('uniF6FB') +def test_name2unicode_uni_pua_lowercase(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('unif6fb') + + def test_name2unicode_u_with_4_digits(): """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" assert u'\u013B' == name2unicode('u013B') +def test_name2unicode_u_with_4_digits_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013b') + + def test_name2unicode_u_with_5_digits(): """The name "u1040C" has a single component, which is mapped to the string U+1040C""" assert u'\U0001040C' == name2unicode('u1040C') +def test_name2unicode_u_with_5_digits_lowercase(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040c') + + def test_name2unicode_multiple_components(): """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" - assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_multiple_components_lowercase(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate') def test_name2unicode_foo(): From c597e95a9f828b6d6f18566a44d8706bdbc6744b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:37:15 +0200 Subject: 
[PATCH 06/10] Use KeyError to signal that the name does not resemble any unicode, this pattern is also used in the rest of pdfminer.six --- pdfminer/encodingdb.py | 13 ++++++++----- tests/test_encodingdb.py | 12 +++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index aa00005..5dcd8f2 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -10,12 +10,15 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -def name2unicode(name: str): +def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. + In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown. + This way the caller must explicitly define what to do when there is not a match. + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping - :returns unicode character if name resembles something, empty string if not + :returns unicode character if name resembles something, otherwise a KeyError """ full_stop = u'\u002E' name = name.split(full_stop)[0] @@ -33,7 +36,7 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] if any([55295 < digit < 57344 for digit in unicode_digits]): - return '' + raise KeyError characters = map(six.unichr, unicode_digits) return ''.join(characters) @@ -42,10 +45,10 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) if 55295 < unicode_digit < 57344: - return '' + raise KeyError return six.unichr(unicode_digit) - return '' + raise KeyError class EncodingDB(object): diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index ac10d54..82c0282 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -4,6 +4,8 @@ 
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are added. """ +from nose.tools import assert_raises + from pdfminer.encodingdb import name2unicode @@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long(): expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C. """ - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_empty_string_long_lowercase(): @@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase(): Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C.""" - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_pua(): @@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase(): def test_name2unicode_foo(): """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" - assert u'' == name2unicode('foo') + assert_raises(KeyError, name2unicode, 'foo') def test_name2unicode_notdef(): """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" - assert u'' == name2unicode('.notdef') + assert_raises(KeyError, name2unicode, '.notdef') def test_name2unicode_pua_ogoneksmall(): @@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_overflow_error(): - name2unicode('226215240241240240240240') + assert_raises(KeyError, name2unicode, '226215240241240240240240') From 1e24bfa0bd1ef332e30ffd57b2328ecacc0ff6c4 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:40:22 
+0200 Subject: [PATCH 07/10] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 82c0282..bfd2a87 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -35,7 +35,7 @@ def test_name2unicode_uni_with_sequence_of_digits_lowercase(): def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to a €. + """The name "uni20ac" has a single component, which is mapped to a euro-sign. According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals From 2bb850cdaee9135fcf50770211b6817904950b5b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:43:07 +0200 Subject: [PATCH 08/10] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index bfd2a87..36e4b11 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -40,7 +40,7 @@ def test_name2unicode_uni_empty_string(): According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals """ - assert u'€' == name2unicode('uni20ac') + assert u'\u20ac' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): From 0fb83366b61af6c9cf5ff32164075d9d355cbbe8 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:49:57 +0200 Subject: [PATCH 09/10] Remove intermediate variable `full_stop` because it is just a dot --- pdfminer/encodingdb.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 5dcd8f2..dea23a1 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,4 +1,4 @@ - +import logging import re import six 
# Python 2+3 compatibility @@ -20,8 +20,7 @@ def name2unicode(name): :returns unicode character if name resembles something, otherwise a KeyError """ - full_stop = u'\u002E' - name = name.split(full_stop)[0] + name = name.split('.')[0] components = name.split('_') if len(components) > 1: From 6f362f53feefc81224d740a011fac69ea9707180 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:52:24 +0200 Subject: [PATCH 10/10] Raise a `KeyError` with a useful message if `name2unicode()` does not match any glyph name. Use this message to log debug statements. --- pdfminer/encodingdb.py | 26 +++++++++++++++++++------- pdfminer/pdffont.py | 8 +++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index dea23a1..7100235 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -9,6 +9,8 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') +log = logging.getLogger(__name__) + def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. 
@@ -32,22 +34,32 @@ def name2unicode(name): elif name.startswith('uni'): name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] - if any([55295 < digit < 57344 for digit in unicode_digits]): - raise KeyError + for digit in unicode_digits: + raise_key_error_for_invalid_unicode(digit) characters = map(six.unichr, unicode_digits) return ''.join(characters) elif name.startswith('u'): name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) - if 55295 < unicode_digit < 57344: - raise KeyError + raise_key_error_for_invalid_unicode(unicode_digit) return six.unichr(unicode_digit) - raise KeyError + raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name) + + +def raise_key_error_for_invalid_unicode(unicode_digit): + """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 + + :raises KeyError if unicode digit is invalid + """ + if 55295 < unicode_digit < 57344: + raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit) class EncodingDB(object): @@ -86,7 +98,7 @@ class EncodingDB(object): elif isinstance(x, PSLiteral): try: cid2unicode[cid] = name2unicode(x.name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) cid += 1 return cid2unicode diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 5d7eaf1..1a7603d 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,4 +1,4 @@ - +import logging import struct import sys from io import BytesIO @@ -31,6 +31,8 @@ from .utils import choplist from .utils import isnumber from .utils import nunpack +log = logging.getLogger(__name__) + def get_widths(seq): widths = {} @@ -124,8 +126,8 @@ 
class Type1FontHeaderParser(PSStackParser): break try: self._cid2unicode[cid] = name2unicode(name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos, token):