From 594321fcf6b6612dc466d6430dccd6791be46cb7 Mon Sep 17 00:00:00 2001 From: Gert de Pagter Date: Mon, 25 Feb 2019 16:38:46 +0100 Subject: [PATCH 01/15] Remove self refference on python3 This *is* the 'six' repo, so no need to mention that again in the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a96278..4dc6741 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Features How to Install -------------- - * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six) + * Install Python 2.7 or newer. * Install `pip install pdfminer.six` From a03566da21530159175f9856a1fcce3438a09b29 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2019 23:03:47 +0200 Subject: [PATCH 02/15] Add contribution guidelines --- CONTRIBUTING.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..15c82ae --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,30 @@ +# Contributing guidelines + +Any contribution is appreciated! You might want to: + +* Fix spelling errors +* Improve documentation +* Add tests for untested code +* Add new features +* Fix bugs + +## How can I contribute? + +* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features +* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) +* Help others giving your thoughts on open issues and pull requests. + +## General guidelines for creating issues and pull requests + +* Search previous issues, as yours might be a duplicate. +* When creating a new issue for a bug, include a minimal reproducible example. +* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This + will help others to see the importance of your feature request. +* Link pull request to a single issue. 
+* Pull requests should be merged to develop, not master. This ensures that master always equals the released version. +* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case + of features, this will show that your code works correctly. +* Code should work for Python 2.7 and Python 3.x (for now). +* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. +* New features should be well documented using docstrings. +* Check spelling and grammar. \ No newline at end of file From 2743f2b20a304bc56cd306effdd32676b373ca70 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2019 23:05:47 +0200 Subject: [PATCH 03/15] Add reference to contribution guidelines in README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 2a96278..4a863c5 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,12 @@ TODO * Performance improvements. +Contributing +------------ + +Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). + + Terms and Conditions -------------------- From 5acfdd8f9ba73ad8770dc0bfdb76f2f4c9dcb307 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 19:38:38 +0200 Subject: [PATCH 04/15] Add sentence about including pdf's in issues --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 15c82ae..b2680c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,8 @@ Any contribution is appreciated! You might want to: ## How can I contribute? * Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features + - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the + issue. 
* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) * Help others giving your thoughts on open issues and pull requests. From ec5218a05f4d0e75e88079a19da75982573b5426 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:24:30 +0200 Subject: [PATCH 05/15] Add some (failing) unittests for name2unicode based on the examples in the Adobe Glyph List Specification --- tests/test_encodingdb.py | 69 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/test_encodingdb.py diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..c3f8bf0 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,69 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) +""" +from pdfminer.encodingdb import name2unicode + + +def test_name2unicode_name_in_agl(): + """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL""" + assert u'\u013B' == name2unicode('Lcommaaccent') + + +def test_name2unicode_uni(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013B') + + +def test_name2unicode_uni_with_sequence_of_digits(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20AC0308') + + +def test_name2unicode_uni_empty_string(): + """The name "uni20ac" has a single component, which is mapped to an empty string""" + assert u'' == name2unicode('uni20ac') + + +def test_name2unicode_uni_empty_string_long(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. 
This character can be correctly mapped by using the + glyph name "u1040C. + """ + assert u'' == name2unicode('uniD801DC0C') + + +def test_name2unicode_uni_pua(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('uniF6FB') + + +def test_name2unicode_u_with_4_digits(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013B') + + +def test_name2unicode_u_with_5_digits(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040C') + + +def test_name2unicode_multiple_components(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_foo(): + """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" + assert u'' == name2unicode('foo') + + +def test_name2unicode_notdef(): + """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" + assert u'' == name2unicode('.notdef') + + +def test_name2unicode_pua_ogoneksmall(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('Ogoneksmall') From 5d7ac7e88a0df5a445318bf6d7b2d924041b204b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:44:23 +0200 Subject: [PATCH 06/15] Added test for overflow error reported by @jtlz2: https://github.com/pdfminer/pdfminer.six/issues/177#issuecomment-510173228_ --- tests/test_encodingdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index c3f8bf0..2fac375 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -67,3 +67,7 @@ 
def test_name2unicode_notdef(): def test_name2unicode_pua_ogoneksmall(): """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('Ogoneksmall') + + +def test_name2unicode_overflow_error(): + name2unicode('226215240241240240240240') From f0392f804971e1d1f1de8cf66f70dfb09a373241 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:16:42 +0200 Subject: [PATCH 07/15] Change implementation of name2unicode such that it follows the Adobe Glyph specs (with allowing lowercase) --- pdfminer/encodingdb.py | 57 ++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 870bd28..aa00005 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,28 +1,53 @@ import re -from .psparser import PSLiteral + +import six # Python 2+3 compatibility + from .glyphlist import glyphname2unicode from .latin_enc import ENCODING +from .psparser import PSLiteral -import six # Python 2+3 compatibility - -STRIP_NAME = re.compile(r'[0-9]+') +HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -## name2unicode -## -def name2unicode(name): - """Converts Adobe glyph names to Unicode numbers.""" - if name in glyphname2unicode: - return glyphname2unicode[name] - m = STRIP_NAME.search(name) - if not m: - raise KeyError(name) - return six.unichr(int(m.group(0))) +def name2unicode(name: str): + """Converts Adobe glyph names to Unicode numbers. 
+ + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping + + :returns unicode character if name resembles something, empty string if not + """ + full_stop = u'\u002E' + name = name.split(full_stop)[0] + components = name.split('_') + + if len(components) > 1: + return ''.join(map(name2unicode, components)) + + else: + if name in glyphname2unicode: + return glyphname2unicode.get(name) + + elif name.startswith('uni'): + name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: + unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] + if any([55295 < digit < 57344 for digit in unicode_digits]): + return '' + characters = map(six.unichr, unicode_digits) + return ''.join(characters) + + elif name.startswith('u'): + name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: + unicode_digit = int(name_without_u, base=16) + if 55295 < unicode_digit < 57344: + return '' + return six.unichr(unicode_digit) + + return '' -## EncodingDB -## class EncodingDB(object): std2unicode = {} From 33cc9861ae06d44ef2d7173a6781197749bff26c Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:19:17 +0200 Subject: [PATCH 08/15] Add docstring to Type1FontHeaderParser.get_encoding() that describes that the custom CharStrings of the font are mapped to '' --- pdfminer/pdffont.py | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..5d7eaf1 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,34 +1,35 @@ -import sys import struct +import sys from io import BytesIO + +import six # Python 2+3 compatibility + +from . 
import settings +from .cmapdb import CMap from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap -from .cmapdb import CMap from .encodingdb import EncodingDB from .encodingdb import name2unicode -from .psparser import PSStackParser -from .psparser import PSEOF -from .psparser import LIT -from .psparser import KWD -from . import settings -from .psparser import PSLiteral -from .psparser import literal_name -from .pdftypes import PDFException -from .pdftypes import resolve1 -from .pdftypes import int_value -from .pdftypes import num_value -from .pdftypes import list_value -from .pdftypes import dict_value -from .pdftypes import stream_value from .fontmetrics import FONT_METRICS +from .pdftypes import PDFException +from .pdftypes import dict_value +from .pdftypes import int_value +from .pdftypes import list_value +from .pdftypes import num_value +from .pdftypes import resolve1 +from .pdftypes import stream_value +from .psparser import KWD +from .psparser import LIT +from .psparser import PSEOF +from .psparser import PSLiteral +from .psparser import PSStackParser +from .psparser import literal_name from .utils import apply_matrix_norm -from .utils import nunpack from .utils import choplist from .utils import isnumber - -import six #Python 2+3 compatibility +from .utils import nunpack def get_widths(seq): @@ -98,7 +99,6 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_ARRAY = KWD(b'array') KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - KEYWORD_FOR = KWD(b'for') def __init__(self, data): PSStackParser.__init__(self, data) @@ -106,6 +106,17 @@ class Type1FontHeaderParser(PSStackParser): return def get_encoding(self): + """Parse the font encoding + + The Type1 font encoding maps character codes to character names. These character names could either be standard + Adobe glyph names, or character names associated with custom CharStrings for this font. 
A CharString is a + sequence of operations that describe how the character should be drawn. + Currently, this function returns '' (empty string) for character names that are associated with a CharStrings. + + References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf + + :returns mapping of character identifiers (cid's) to unicode characters + """ while 1: try: (cid, name) = self.nextobject() From fdb7e5486287e008cb2e71d0d16ef21863954b68 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:20:25 +0200 Subject: [PATCH 09/15] Add lowercase adobe glyph name tests --- tests/test_encodingdb.py | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 2fac375..ac10d54 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -1,5 +1,8 @@ """ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) + +While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are +added. 
""" from pdfminer.encodingdb import name2unicode @@ -14,14 +17,28 @@ def test_name2unicode_uni(): assert u'\u013B' == name2unicode('uni013B') +def test_name2unicode_uni_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013b') + + def test_name2unicode_uni_with_sequence_of_digits(): """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" assert u'\u20AC\u0308' == name2unicode('uni20AC0308') +def test_name2unicode_uni_with_sequence_of_digits_lowercase(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20ac0308') + + def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to an empty string""" - assert u'' == name2unicode('uni20ac') + """The name "uni20ac" has a single component, which is mapped to a €. + + According to the specification this should be mapped to an empty string, but we also want to support lowercase + hexadecimals + """ + assert u'€' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): @@ -34,24 +51,53 @@ def test_name2unicode_uni_empty_string_long(): assert u'' == name2unicode('uniD801DC0C') +def test_name2unicode_uni_empty_string_long_lowercase(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. 
This character can be correctly mapped by using the + glyph name "u1040C.""" + assert u'' == name2unicode('uniD801DC0C') + + def test_name2unicode_uni_pua(): """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('uniF6FB') +def test_name2unicode_uni_pua_lowercase(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('unif6fb') + + def test_name2unicode_u_with_4_digits(): """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" assert u'\u013B' == name2unicode('u013B') +def test_name2unicode_u_with_4_digits_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013b') + + def test_name2unicode_u_with_5_digits(): """The name "u1040C" has a single component, which is mapped to the string U+1040C""" assert u'\U0001040C' == name2unicode('u1040C') +def test_name2unicode_u_with_5_digits_lowercase(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040c') + + def test_name2unicode_multiple_components(): """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" - assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_multiple_components_lowercase(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate') def test_name2unicode_foo(): From c597e95a9f828b6d6f18566a44d8706bdbc6744b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:37:15 +0200 Subject: 
[PATCH 10/15] Use KeyError to signal that the name does not resemble any unicode, this pattern is also used in the rest of pdfminer.six --- pdfminer/encodingdb.py | 13 ++++++++----- tests/test_encodingdb.py | 12 +++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index aa00005..5dcd8f2 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -10,12 +10,15 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -def name2unicode(name: str): +def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. + In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown. + This way the caller must explicitly define what to do when there is not a match. + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping - :returns unicode character if name resembles something, empty string if not + :returns unicode character if name resembles something, otherwise a KeyError """ full_stop = u'\u002E' name = name.split(full_stop)[0] @@ -33,7 +36,7 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] if any([55295 < digit < 57344 for digit in unicode_digits]): - return '' + raise KeyError characters = map(six.unichr, unicode_digits) return ''.join(characters) @@ -42,10 +45,10 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) if 55295 < unicode_digit < 57344: - return '' + raise KeyError return six.unichr(unicode_digit) - return '' + raise KeyError class EncodingDB(object): diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index ac10d54..82c0282 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -4,6 +4,8 @@ 
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are added. """ +from nose.tools import assert_raises + from pdfminer.encodingdb import name2unicode @@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long(): expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C. """ - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_empty_string_long_lowercase(): @@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase(): Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C.""" - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_pua(): @@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase(): def test_name2unicode_foo(): """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" - assert u'' == name2unicode('foo') + assert_raises(KeyError, name2unicode, 'foo') def test_name2unicode_notdef(): """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" - assert u'' == name2unicode('.notdef') + assert_raises(KeyError, name2unicode, '.notdef') def test_name2unicode_pua_ogoneksmall(): @@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_overflow_error(): - name2unicode('226215240241240240240240') + assert_raises(KeyError, name2unicode, '226215240241240240240240') From 1e24bfa0bd1ef332e30ffd57b2328ecacc0ff6c4 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:40:22 
+0200 Subject: [PATCH 11/15] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 82c0282..bfd2a87 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -35,7 +35,7 @@ def test_name2unicode_uni_with_sequence_of_digits_lowercase(): def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to a €. + """The name "uni20ac" has a single component, which is mapped to a euro-sign. According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals From 2bb850cdaee9135fcf50770211b6817904950b5b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:43:07 +0200 Subject: [PATCH 12/15] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index bfd2a87..36e4b11 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -40,7 +40,7 @@ def test_name2unicode_uni_empty_string(): According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals """ - assert u'€' == name2unicode('uni20ac') + assert u'\u20ac' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): From 0fb83366b61af6c9cf5ff32164075d9d355cbbe8 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:49:57 +0200 Subject: [PATCH 13/15] Remove intermediate variable `full_stop` because it is just a dot --- pdfminer/encodingdb.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 5dcd8f2..dea23a1 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,4 +1,4 @@ - +import logging import re import six 
# Python 2+3 compatibility @@ -20,8 +20,7 @@ def name2unicode(name): :returns unicode character if name resembles something, otherwise a KeyError """ - full_stop = u'\u002E' - name = name.split(full_stop)[0] + name = name.split('.')[0] components = name.split('_') if len(components) > 1: From 6f362f53feefc81224d740a011fac69ea9707180 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:52:24 +0200 Subject: [PATCH 14/15] Raise a `KeyError` with a useful message if `name2unicode()` does not match any glyph name. Use this message to log debug statements. --- pdfminer/encodingdb.py | 26 +++++++++++++++++++------- pdfminer/pdffont.py | 8 +++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index dea23a1..7100235 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -9,6 +9,8 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') +log = logging.getLogger(__name__) + def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. 
@@ -32,22 +34,32 @@ def name2unicode(name): elif name.startswith('uni'): name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] - if any([55295 < digit < 57344 for digit in unicode_digits]): - raise KeyError + for digit in unicode_digits: + raise_key_error_for_invalid_unicode(digit) characters = map(six.unichr, unicode_digits) return ''.join(characters) elif name.startswith('u'): name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) - if 55295 < unicode_digit < 57344: - raise KeyError + raise_key_error_for_invalid_unicode(unicode_digit) return six.unichr(unicode_digit) - raise KeyError + raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name) + + +def raise_key_error_for_invalid_unicode(unicode_digit): + """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 + + :raises KeyError if unicode digit is invalid + """ + if 55295 < unicode_digit < 57344: + raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit) class EncodingDB(object): @@ -86,7 +98,7 @@ class EncodingDB(object): elif isinstance(x, PSLiteral): try: cid2unicode[cid] = name2unicode(x.name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) cid += 1 return cid2unicode diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 5d7eaf1..1a7603d 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,4 +1,4 @@ - +import logging import struct import sys from io import BytesIO @@ -31,6 +31,8 @@ from .utils import choplist from .utils import isnumber from .utils import nunpack +log = logging.getLogger(__name__) + def get_widths(seq): widths = {} @@ -124,8 +126,8 @@ 
class Type1FontHeaderParser(PSStackParser): break try: self._cid2unicode[cid] = name2unicode(name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos, token): From 540df9f676f93275787366bfbefaed571074ec8c Mon Sep 17 00:00:00 2001 From: Igor Moura Date: Tue, 16 Jul 2019 21:18:42 -0300 Subject: [PATCH 15/15] Replaced .iteritems() with six.iteritems() for Python 3 compat This is a squashed commit, the previous messages can be seen below This is the 1st commit message: Replaced .iteritems() usage for .items() Fixed some python 2 leftovers, as discussed in #267. Also formatted code according to Black.\nThis possibly breaks some python 2 compatibility This is the commit message #2: Reverted formatting and more spread six usage --- pdfminer/cmapdb.py | 6 +++--- pdfminer/pdfdevice.py | 2 +- pdfminer/pdffont.py | 2 +- pdfminer/pdftypes.py | 2 +- tools/conv_afm.py | 4 +++- tools/pdf2html.cgi | 3 ++- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index c3403d1..8185c93 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -83,7 +83,7 @@ class CMap(CMapBase): assert isinstance(cmap, CMap), str(type(cmap)) def copy(dst, src): - for (k, v) in src.iteritems(): + for (k, v) in six.iteritems(src): if isinstance(v, dict): d = {} dst[k] = d @@ -110,7 +110,7 @@ class CMap(CMapBase): if code2cid is None: code2cid = self.code2cid code = () - for (k, v) in sorted(code2cid.iteritems()): + for (k, v) in sorted(six.iteritems(code2cid)): c = code+(k,) if isinstance(v, int): out.write('code %r = cid %d\n' % (c, v)) @@ -148,7 +148,7 @@ class UnicodeMap(CMapBase): return self.cid2unichr[cid] def dump(self, out=sys.stdout): - for (k, v) in sorted(self.cid2unichr.iteritems()): + for (k, v) in sorted(six.iteritems(self.cid2unichr)): out.write('cid %d = unicode %r\n' % (k, v)) return diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 0d4c175..54925f1 
100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -178,7 +178,7 @@ class TagExtractor(PDFDevice): s = '' if isinstance(props, dict): s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) - in sorted(props.iteritems())) + in sorted(six.iteritems(props))) out_s = '<%s%s>' % (utils.enc(tag.name), s) self.outfp.write(utils.make_compat_bytes(out_s)) self._stack.append(tag) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..0ebd952 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -453,7 +453,7 @@ class TrueTypeFont(object): assert False, str(('Unhandled', fmttype)) # create unicode map unicode_map = FileUnicodeMap() - for (char, gid) in char2gid.iteritems(): + for (char, gid) in six.iteritems(char2gid): unicode_map.add_cid2unichr(gid, char) return unicode_map diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 40cca46..c6e8d86 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -96,7 +96,7 @@ def resolve_all(x, default=None): if isinstance(x, list): x = [resolve_all(v, default=default) for v in x] elif isinstance(x, dict): - for (k, v) in x.iteritems(): + for (k, v) in six.iteritems(x): x[k] = resolve_all(v, default=default) return x diff --git a/tools/conv_afm.py b/tools/conv_afm.py index 2402a8e..2345982 100755 --- a/tools/conv_afm.py +++ b/tools/conv_afm.py @@ -3,6 +3,8 @@ import sys import fileinput +import six #Python 2+3 compatibility + def main(argv): fonts = {} for line in fileinput.input(): @@ -33,7 +35,7 @@ def main(argv): props[k] = tuple(map(float, f[1:5])) print ('# -*- python -*-') print ('FONT_METRICS = {') - for (fontname,(props,chars)) in fonts.iteritems(): + for (fontname,(props,chars)) in six.iteritems(fonts): print (' %r: %r,' % (fontname, (props,chars))) print ('}') return 0 diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index 8522a04..e2ea964 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -26,6 +26,7 @@ from pdfminer.pdfinterp import 
PDFResourceManager, PDFPageInterpreter from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.layout import LAParams +import six #Python 2+3 compatibility # quote HTML metacharacters def q(x): @@ -35,7 +36,7 @@ def q(x): Q = re.compile(r'[^a-zA-Z0-9_.-=]') def url(base, **kw): r = [] - for (k,v) in kw.iteritems(): + for (k,v) in six.iteritems(kw): v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0]) r.append('%s=%s' % (k, v)) return base+'&'.join(r)