From 2ede124142a73175ae5dd4e4cf334bbbcc831a0a Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 3 Nov 2018 22:52:29 -0700 Subject: [PATCH 01/34] Interpret font Descent as a negative number even if specified as positive The PDF RM specifies that Descent should be negative. Fonts that claim to have a positive Descent (not that it would make sense) always seem to be wrong about this claim. --- pdfminer/pdffont.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..ea75b34 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -488,6 +488,13 @@ class PDFFont(object): self.leading = num_value(descriptor.get('Leading', 0)) self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0))) self.hscale = self.vscale = .001 + + # PDF RM 9.8.1 specifies /Descent should always be a negative number. + # PScript5.dll seems to produce Descent with a positive number, but + # text analysis will be wrong if this is taken as correct. So force + # descent to negative. + if self.descent > 0: + self.descent = -self.descent return def __repr__(self): @@ -503,9 +510,11 @@ class PDFFont(object): return bytearray(bytes) # map(ord, bytes) def get_ascent(self): + """Ascent above the baseline, in text space units""" return self.ascent * self.vscale def get_descent(self): + """Descent below the baseline, in text space units; always negative""" return self.descent * self.vscale def get_width(self): From 5ff84b83fbf60598406e9af616bdc8a565e38fde Mon Sep 17 00:00:00 2001 From: Felix Schwarz Date: Fri, 18 Jan 2019 11:24:51 +0100 Subject: [PATCH 02/34] use conditional requirements to ensure "chardet" listed as requirement on Python 3 (fixes #213) Previously "chardet" was only added when setup.py was run with Python 3. However wheels contain a static list of requirements and a wheel-based install will never execute setup.py at installation time. 
pdfminer.six uses universal wheels for Python 2 and Python 3 so the requirements will always be wrong on one version (see #213). The solution is to use conditional requirements as specified in PEP 496 which are evaluated at installation time. --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index dd9db18..404c308 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,7 @@ import sys import pdfminer as package -requires = ['six', 'pycryptodome', 'sortedcontainers'] -if sys.version_info >= (3, 0): - requires.append('chardet') +requires = ['six', 'pycryptodome', 'sortedcontainers', 'chardet ; python_version > "3.0"'] setup( name='pdfminer.six', From 594321fcf6b6612dc466d6430dccd6791be46cb7 Mon Sep 17 00:00:00 2001 From: Gert de Pagter Date: Mon, 25 Feb 2019 16:38:46 +0100 Subject: [PATCH 03/34] Remove self reference on python3 This *is* the 'six' repo, so no need to mention that again in the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a96278..4dc6741 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Features How to Install -------------- - * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six) + * Install Python 2.7 or newer. * Install `pip install pdfminer.six` From a03566da21530159175f9856a1fcce3438a09b29 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2019 23:03:47 +0200 Subject: [PATCH 04/34] Add contribution guidelines --- CONTRIBUTING.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..15c82ae --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,30 @@ +# Contributing guidelines + +Any contribution is appreciated! You might want to: + +* Fix spelling errors +* Improve documentation +* Add tests for untested code +* Add new features +* Fix bugs + +## How can I contribute? 
+ +* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features +* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) +* Help others giving your thoughts on open issues and pull requests. + +## General guidelines for creating issues and pull requests + +* Search previous issues, as yours might be a duplicate. +* When creating a new issue for a bug, include a minimal reproducible example. +* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This + will help others to see the importance of your feature request. +* Link pull request to a single issue. +* Pull requests should be merged to develop, not master. This ensures that master always equals the released verion. +* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case + of features, this will show that your code works correctly. +* Code should work for Python 2.7 and Python 3.x (for now). +* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. +* New features should be well documented using docstrings. +* Check spelling and grammar. \ No newline at end of file From 2743f2b20a304bc56cd306effdd32676b373ca70 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 8 Jul 2019 23:05:47 +0200 Subject: [PATCH 05/34] Add reference to contribution guidelines in README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 2a96278..4a863c5 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,12 @@ TODO * Performance improvements. +Contributing +------------ + +Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). 
+ + Terms and Conditions -------------------- From 5acfdd8f9ba73ad8770dc0bfdb76f2f4c9dcb307 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 19:38:38 +0200 Subject: [PATCH 06/34] Add sentence about including pdf's in issues --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 15c82ae..b2680c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,8 @@ Any contribution is appreciated! You might want to: ## How can I contribute? * Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features + - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the + issue. * Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) * Help others giving your thoughts on open issues and pull requests. From ec5218a05f4d0e75e88079a19da75982573b5426 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:24:30 +0200 Subject: [PATCH 07/34] Add some (failing) unittests for name2unicode based on the examples in the Adobe Glyph List Specification --- tests/test_encodingdb.py | 69 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/test_encodingdb.py diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..c3f8bf0 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,69 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) +""" +from pdfminer.encodingdb import name2unicode + + +def test_name2unicode_name_in_agl(): + """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL""" + assert u'\u013B' == name2unicode('Lcommaaccent') + + +def test_name2unicode_uni(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert 
u'\u013B' == name2unicode('uni013B') + + +def test_name2unicode_uni_with_sequence_of_digits(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20AC0308') + + +def test_name2unicode_uni_empty_string(): + """The name "uni20ac" has a single component, which is mapped to an empty string""" + assert u'' == name2unicode('uni20ac') + + +def test_name2unicode_uni_empty_string_long(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the + glyph name "u1040C. + """ + assert u'' == name2unicode('uniD801DC0C') + + +def test_name2unicode_uni_pua(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('uniF6FB') + + +def test_name2unicode_u_with_4_digits(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013B') + + +def test_name2unicode_u_with_5_digits(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040C') + + +def test_name2unicode_multiple_components(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_foo(): + """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" + assert u'' == name2unicode('foo') + + +def test_name2unicode_notdef(): + """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" + assert u'' == 
name2unicode('.notdef') + + +def test_name2unicode_pua_ogoneksmall(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('Ogoneksmall') From 5d7ac7e88a0df5a445318bf6d7b2d924041b204b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 10 Jul 2019 20:44:23 +0200 Subject: [PATCH 08/34] Added test for overflow error reported by @jtlz2: https://github.com/pdfminer/pdfminer.six/issues/177#issuecomment-510173228_ --- tests/test_encodingdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index c3f8bf0..2fac375 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -67,3 +67,7 @@ def test_name2unicode_notdef(): def test_name2unicode_pua_ogoneksmall(): """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('Ogoneksmall') + + +def test_name2unicode_overflow_error(): + name2unicode('226215240241240240240240') From f0392f804971e1d1f1de8cf66f70dfb09a373241 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:16:42 +0200 Subject: [PATCH 09/34] Change implementation of name2unicode such that it follows the Adobe Glyph specs (with allowing lowercase) --- pdfminer/encodingdb.py | 57 ++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 870bd28..aa00005 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,28 +1,53 @@ import re -from .psparser import PSLiteral + +import six # Python 2+3 compatibility + from .glyphlist import glyphname2unicode from .latin_enc import ENCODING +from .psparser import PSLiteral -import six # Python 2+3 compatibility - -STRIP_NAME = re.compile(r'[0-9]+') +HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -## name2unicode -## -def name2unicode(name): - """Converts Adobe glyph names to Unicode numbers.""" - if name 
in glyphname2unicode: - return glyphname2unicode[name] - m = STRIP_NAME.search(name) - if not m: - raise KeyError(name) - return six.unichr(int(m.group(0))) +def name2unicode(name: str): + """Converts Adobe glyph names to Unicode numbers. + + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping + + :returns unicode character if name resembles something, empty string if not + """ + full_stop = u'\u002E' + name = name.split(full_stop)[0] + components = name.split('_') + + if len(components) > 1: + return ''.join(map(name2unicode, components)) + + else: + if name in glyphname2unicode: + return glyphname2unicode.get(name) + + elif name.startswith('uni'): + name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: + unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] + if any([55295 < digit < 57344 for digit in unicode_digits]): + return '' + characters = map(six.unichr, unicode_digits) + return ''.join(characters) + + elif name.startswith('u'): + name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: + unicode_digit = int(name_without_u, base=16) + if 55295 < unicode_digit < 57344: + return '' + return six.unichr(unicode_digit) + + return '' -## EncodingDB -## class EncodingDB(object): std2unicode = {} From 33cc9861ae06d44ef2d7173a6781197749bff26c Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:19:17 +0200 Subject: [PATCH 10/34] Add docstring to Type1FontHeaderParser.get_encoding() that describes that the custom CharStrings of the font are mapped to '' --- pdfminer/pdffont.py | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..5d7eaf1 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,34 +1,35 @@ -import sys import struct +import sys 
from io import BytesIO + +import six # Python 2+3 compatibility + +from . import settings +from .cmapdb import CMap from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap -from .cmapdb import CMap from .encodingdb import EncodingDB from .encodingdb import name2unicode -from .psparser import PSStackParser -from .psparser import PSEOF -from .psparser import LIT -from .psparser import KWD -from . import settings -from .psparser import PSLiteral -from .psparser import literal_name -from .pdftypes import PDFException -from .pdftypes import resolve1 -from .pdftypes import int_value -from .pdftypes import num_value -from .pdftypes import list_value -from .pdftypes import dict_value -from .pdftypes import stream_value from .fontmetrics import FONT_METRICS +from .pdftypes import PDFException +from .pdftypes import dict_value +from .pdftypes import int_value +from .pdftypes import list_value +from .pdftypes import num_value +from .pdftypes import resolve1 +from .pdftypes import stream_value +from .psparser import KWD +from .psparser import LIT +from .psparser import PSEOF +from .psparser import PSLiteral +from .psparser import PSStackParser +from .psparser import literal_name from .utils import apply_matrix_norm -from .utils import nunpack from .utils import choplist from .utils import isnumber - -import six #Python 2+3 compatibility +from .utils import nunpack def get_widths(seq): @@ -98,7 +99,6 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_ARRAY = KWD(b'array') KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - KEYWORD_FOR = KWD(b'for') def __init__(self, data): PSStackParser.__init__(self, data) @@ -106,6 +106,17 @@ class Type1FontHeaderParser(PSStackParser): return def get_encoding(self): + """Parse the font encoding + + The Type1 font encoding maps character codes to character names. 
These character names could either be standard + Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a + sequence of operations that describe how the character should be drawn. + Currently, this function returns '' (empty string) for character names that are associated with a CharStrings. + + References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf + + :returns mapping of character identifiers (cid's) to unicode characters + """ while 1: try: (cid, name) = self.nextobject() From fdb7e5486287e008cb2e71d0d16ef21863954b68 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:20:25 +0200 Subject: [PATCH 11/34] Add lowercase adobe glyph name tests --- tests/test_encodingdb.py | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 2fac375..ac10d54 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -1,5 +1,8 @@ """ Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) + +While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are +added. 
""" from pdfminer.encodingdb import name2unicode @@ -14,14 +17,28 @@ def test_name2unicode_uni(): assert u'\u013B' == name2unicode('uni013B') +def test_name2unicode_uni_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('uni013b') + + def test_name2unicode_uni_with_sequence_of_digits(): """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" assert u'\u20AC\u0308' == name2unicode('uni20AC0308') +def test_name2unicode_uni_with_sequence_of_digits_lowercase(): + """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308""" + assert u'\u20AC\u0308' == name2unicode('uni20ac0308') + + def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to an empty string""" - assert u'' == name2unicode('uni20ac') + """The name "uni20ac" has a single component, which is mapped to a €. + + According to the specification this should be mapped to an empty string, but we also want to support lowercase + hexadecimals + """ + assert u'€' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): @@ -34,24 +51,53 @@ def test_name2unicode_uni_empty_string_long(): assert u'' == name2unicode('uniD801DC0C') +def test_name2unicode_uni_empty_string_long_lowercase(): + """The name "uniD801DC0C" has a single component, which is mapped to an empty string + + Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is + expressed as D801 DC0C in UTF-16, specifically U+1040C. 
This character can be correctly mapped by using the + glyph name "u1040C.""" + assert u'' == name2unicode('uniD801DC0C') + + def test_name2unicode_uni_pua(): """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" assert u'\uF6FB' == name2unicode('uniF6FB') +def test_name2unicode_uni_pua_lowercase(): + """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB.""" + assert u'\uF6FB' == name2unicode('unif6fb') + + def test_name2unicode_u_with_4_digits(): """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" assert u'\u013B' == name2unicode('u013B') +def test_name2unicode_u_with_4_digits_lowercase(): + """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B""" + assert u'\u013B' == name2unicode('u013b') + + def test_name2unicode_u_with_5_digits(): """The name "u1040C" has a single component, which is mapped to the string U+1040C""" assert u'\U0001040C' == name2unicode('u1040C') +def test_name2unicode_u_with_5_digits_lowercase(): + """The name "u1040C" has a single component, which is mapped to the string U+1040C""" + assert u'\U0001040C' == name2unicode('u1040c') + + def test_name2unicode_multiple_components(): """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" - assert u'\u013B\u20AC\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') + + +def test_name2unicode_multiple_components_lowercase(): + """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C""" + assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate') def test_name2unicode_foo(): From c597e95a9f828b6d6f18566a44d8706bdbc6744b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:37:15 +0200 Subject: 
[PATCH 12/34] Use KeyError to signal that the name does not resemble any unicode, this pattern is also used in the rest of pdfminer.six --- pdfminer/encodingdb.py | 13 ++++++++----- tests/test_encodingdb.py | 12 +++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index aa00005..5dcd8f2 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -10,12 +10,15 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') -def name2unicode(name: str): +def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. + In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown. + This way the caller must explicitly define what to do when there is not a match. + Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping - :returns unicode character if name resembles something, empty string if not + :returns unicode character if name resembles something, otherwise a KeyError """ full_stop = u'\u002E' name = name.split(full_stop)[0] @@ -33,7 +36,7 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] if any([55295 < digit < 57344 for digit in unicode_digits]): - return '' + raise KeyError characters = map(six.unichr, unicode_digits) return ''.join(characters) @@ -42,10 +45,10 @@ def name2unicode(name: str): if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) if 55295 < unicode_digit < 57344: - return '' + raise KeyError return six.unichr(unicode_digit) - return '' + raise KeyError class EncodingDB(object): diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index ac10d54..82c0282 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -4,6 +4,8 @@ 
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are added. """ +from nose.tools import assert_raises + from pdfminer.encodingdb import name2unicode @@ -48,7 +50,7 @@ def test_name2unicode_uni_empty_string_long(): expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C. """ - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_empty_string_long_lowercase(): @@ -57,7 +59,7 @@ def test_name2unicode_uni_empty_string_long_lowercase(): Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C.""" - assert u'' == name2unicode('uniD801DC0C') + assert_raises(KeyError, name2unicode, 'uniD801DC0C') def test_name2unicode_uni_pua(): @@ -102,12 +104,12 @@ def test_name2unicode_multiple_components_lowercase(): def test_name2unicode_foo(): """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" - assert u'' == name2unicode('foo') + assert_raises(KeyError, name2unicode, 'foo') def test_name2unicode_notdef(): """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" - assert u'' == name2unicode('.notdef') + assert_raises(KeyError, name2unicode, '.notdef') def test_name2unicode_pua_ogoneksmall(): @@ -116,4 +118,4 @@ def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_overflow_error(): - name2unicode('226215240241240240240240') + assert_raises(KeyError, name2unicode, '226215240241240240240240') From 1e24bfa0bd1ef332e30ffd57b2328ecacc0ff6c4 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:40:22 
+0200 Subject: [PATCH 13/34] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index 82c0282..bfd2a87 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -35,7 +35,7 @@ def test_name2unicode_uni_with_sequence_of_digits_lowercase(): def test_name2unicode_uni_empty_string(): - """The name "uni20ac" has a single component, which is mapped to a €. + """The name "uni20ac" has a single component, which is mapped to a euro-sign. According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals From 2bb850cdaee9135fcf50770211b6817904950b5b Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 14 Jul 2019 15:43:07 +0200 Subject: [PATCH 14/34] Fix error, python2 cannot handle unicode in a .py file --- tests/test_encodingdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index bfd2a87..36e4b11 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -40,7 +40,7 @@ def test_name2unicode_uni_empty_string(): According to the specification this should be mapped to an empty string, but we also want to support lowercase hexadecimals """ - assert u'€' == name2unicode('uni20ac') + assert u'\u20ac' == name2unicode('uni20ac') def test_name2unicode_uni_empty_string_long(): From 0fb83366b61af6c9cf5ff32164075d9d355cbbe8 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:49:57 +0200 Subject: [PATCH 15/34] Remove intermediate variable `full_stop` because it is just a dot --- pdfminer/encodingdb.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index 5dcd8f2..dea23a1 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -1,4 +1,4 @@ - +import logging import re import six 
# Python 2+3 compatibility @@ -20,8 +20,7 @@ def name2unicode(name): :returns unicode character if name resembles something, otherwise a KeyError """ - full_stop = u'\u002E' - name = name.split(full_stop)[0] + name = name.split('.')[0] components = name.split('_') if len(components) > 1: From 6f362f53feefc81224d740a011fac69ea9707180 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 16 Jul 2019 08:52:24 +0200 Subject: [PATCH 16/34] Raise a `KeyError` with a useful message if `name2unicode()` does not match any glyph name. Use this message to log debug statements. --- pdfminer/encodingdb.py | 26 +++++++++++++++++++------- pdfminer/pdffont.py | 8 +++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py index dea23a1..7100235 100644 --- a/pdfminer/encodingdb.py +++ b/pdfminer/encodingdb.py @@ -9,6 +9,8 @@ from .psparser import PSLiteral HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') +log = logging.getLogger(__name__) + def name2unicode(name): """Converts Adobe glyph names to Unicode numbers. 
@@ -32,22 +34,32 @@ def name2unicode(name): elif name.startswith('uni'): name_without_uni = name.strip('uni') + if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0: unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)] - if any([55295 < digit < 57344 for digit in unicode_digits]): - raise KeyError + for digit in unicode_digits: + raise_key_error_for_invalid_unicode(digit) characters = map(six.unichr, unicode_digits) return ''.join(characters) elif name.startswith('u'): name_without_u = name.strip('u') + if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6: unicode_digit = int(name_without_u, base=16) - if 55295 < unicode_digit < 57344: - raise KeyError + raise_key_error_for_invalid_unicode(unicode_digit) return six.unichr(unicode_digit) - raise KeyError + raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name) + + +def raise_key_error_for_invalid_unicode(unicode_digit): + """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16 + + :raises KeyError if unicode digit is invalid + """ + if 55295 < unicode_digit < 57344: + raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit) class EncodingDB(object): @@ -86,7 +98,7 @@ class EncodingDB(object): elif isinstance(x, PSLiteral): try: cid2unicode[cid] = name2unicode(x.name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) cid += 1 return cid2unicode diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 5d7eaf1..1a7603d 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,4 +1,4 @@ - +import logging import struct import sys from io import BytesIO @@ -31,6 +31,8 @@ from .utils import choplist from .utils import isnumber from .utils import nunpack +log = logging.getLogger(__name__) + def get_widths(seq): widths = {} @@ -124,8 +126,8 @@ 
class Type1FontHeaderParser(PSStackParser): break try: self._cid2unicode[cid] = name2unicode(name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos, token): From 540df9f676f93275787366bfbefaed571074ec8c Mon Sep 17 00:00:00 2001 From: Igor Moura Date: Tue, 16 Jul 2019 21:18:42 -0300 Subject: [PATCH 17/34] Replaced .iteritems() and with six.iteritems() for Python 3 compat This is a squashed commit, the previous messages can be seen bellow This is the 1st commit message: Replaced .iteritems() usage for .items() Fixed some python 2 leftovers, as discussed in #267. Also formatted code according to Black.\nThis possibly breaks some python 2 compatibility This is the commit message #2: Reverted formatting and more spread six usage --- pdfminer/cmapdb.py | 6 +++--- pdfminer/pdfdevice.py | 2 +- pdfminer/pdffont.py | 2 +- pdfminer/pdftypes.py | 2 +- tools/conv_afm.py | 4 +++- tools/pdf2html.cgi | 3 ++- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index c3403d1..8185c93 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -83,7 +83,7 @@ class CMap(CMapBase): assert isinstance(cmap, CMap), str(type(cmap)) def copy(dst, src): - for (k, v) in src.iteritems(): + for (k, v) in six.iteritems(src): if isinstance(v, dict): d = {} dst[k] = d @@ -110,7 +110,7 @@ class CMap(CMapBase): if code2cid is None: code2cid = self.code2cid code = () - for (k, v) in sorted(code2cid.iteritems()): + for (k, v) in sorted(six.iteritems(code2cid)): c = code+(k,) if isinstance(v, int): out.write('code %r = cid %d\n' % (c, v)) @@ -148,7 +148,7 @@ class UnicodeMap(CMapBase): return self.cid2unichr[cid] def dump(self, out=sys.stdout): - for (k, v) in sorted(self.cid2unichr.iteritems()): + for (k, v) in sorted(six.iteritems(self.cid2unichr)): out.write('cid %d = unicode %r\n' % (k, v)) return diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 0d4c175..54925f1 
100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -178,7 +178,7 @@ class TagExtractor(PDFDevice): s = '' if isinstance(props, dict): s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) - in sorted(props.iteritems())) + in sorted(six.iteritems(props))) out_s = '<%s%s>' % (utils.enc(tag.name), s) self.outfp.write(utils.make_compat_bytes(out_s)) self._stack.append(tag) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..0ebd952 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -453,7 +453,7 @@ class TrueTypeFont(object): assert False, str(('Unhandled', fmttype)) # create unicode map unicode_map = FileUnicodeMap() - for (char, gid) in char2gid.iteritems(): + for (char, gid) in six.iteritems(char2gid): unicode_map.add_cid2unichr(gid, char) return unicode_map diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 40cca46..c6e8d86 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -96,7 +96,7 @@ def resolve_all(x, default=None): if isinstance(x, list): x = [resolve_all(v, default=default) for v in x] elif isinstance(x, dict): - for (k, v) in x.iteritems(): + for (k, v) in six.iteritems(x): x[k] = resolve_all(v, default=default) return x diff --git a/tools/conv_afm.py b/tools/conv_afm.py index 2402a8e..2345982 100755 --- a/tools/conv_afm.py +++ b/tools/conv_afm.py @@ -3,6 +3,8 @@ import sys import fileinput +import six #Python 2+3 compatibility + def main(argv): fonts = {} for line in fileinput.input(): @@ -33,7 +35,7 @@ def main(argv): props[k] = tuple(map(float, f[1:5])) print ('# -*- python -*-') print ('FONT_METRICS = {') - for (fontname,(props,chars)) in fonts.iteritems(): + for (fontname,(props,chars)) in six.iteritems(fonts): print (' %r: %r,' % (fontname, (props,chars))) print ('}') return 0 diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index 8522a04..e2ea964 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -26,6 +26,7 @@ from pdfminer.pdfinterp import 
PDFResourceManager, PDFPageInterpreter from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.layout import LAParams +import six #Python 2+3 compatibility # quote HTML metacharacters def q(x): @@ -35,7 +36,7 @@ def q(x): Q = re.compile(r'[^a-zA-Z0-9_.-=]') def url(base, **kw): r = [] - for (k,v) in kw.iteritems(): + for (k,v) in six.iteritems(kw): v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0]) r.append('%s=%s' % (k, v)) return base+'&'.join(r) From 2f4518231f0b2f30c14a598948d82b1f24839114 Mon Sep 17 00:00:00 2001 From: Igor Moura Date: Tue, 16 Jul 2019 19:43:22 -0300 Subject: [PATCH 18/34] Use resolve_all on PdfFont widths and bbox Fixes #268 --- pdfminer/pdffont.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 0ebd952..d61bcab 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -16,7 +16,7 @@ from . import settings from .psparser import PSLiteral from .psparser import literal_name from .pdftypes import PDFException -from .pdftypes import resolve1 +from .pdftypes import resolve1, resolve_all from .pdftypes import int_value from .pdftypes import num_value from .pdftypes import list_value @@ -476,7 +476,7 @@ class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor - self.widths = widths + self.widths = resolve_all(widths) self.fontname = resolve1(descriptor.get('FontName', 'unknown')) if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) @@ -486,7 +486,7 @@ class PDFFont(object): self.italic_angle = num_value(descriptor.get('ItalicAngle', 0)) self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0)) self.leading = num_value(descriptor.get('Leading', 0)) - self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0))) + self.bbox = list_value(resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0)))) self.hscale = self.vscale = 
.001 return From 5a0d8db052115465bfe27c08bb2d20e087d5f305 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 10:07:23 +0530 Subject: [PATCH 19/34] Adds decoder for OnebyteIdentityH/V instead of using default CMap --- pdfminer/cmapdb.py | 13 +++++++++++++ pdfminer/pdffont.py | 10 ++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index c3403d1..f7f4a0b 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -131,6 +131,15 @@ class IdentityCMap(CMapBase): return () +class IdentityCMapByte(IdentityCMap): + + def decode(self, code): + n = len(code) + if n: + return struct.unpack('>%dB' % n, code) + else: + return () + ## UnicodeMap ## class UnicodeMap(CMapBase): @@ -252,6 +261,10 @@ class CMapDB(object): return IdentityCMap(WMode=0) elif name == 'Identity-V': return IdentityCMap(WMode=1) + elif name == 'OneByteIdentityH': + return IdentityCMapByte(WMode=0) + elif name == 'OneByteIdentityV': + return IdentityCMapByte(WMode=1) try: return klass._cmap_cache[name] except KeyError: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 9f24afb..17b80cd 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,7 +128,13 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -IDENTITY_ENCODER = ('Identity-H', 'Identity-V') +IDENTITY_ENCODER = {'Identity-H':'Identity-H', + 'Identity-V':'Identity-V', + 'DLIdent-H':'Identity-H', + 'DLIdent-V':'Identity-V', + 'OneByteIdentityH':'OneByteIdentityH', + 'OneByteIdentityV':'OneByteIdentityV', + } ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -723,7 +729,7 @@ class PDFCIDFont(PDFFont): raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' if cmap_name in IDENTITY_ENCODER: - return CMapDB.get_cmap(cmap_name) + return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name]) else: return CMap() From 
5b210981c91cabdef9300e1951a087dc476c72fe Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 10:19:20 +0530 Subject: [PATCH 20/34] Adds Test Case --- samples/sampleOneByteIdentityEncode.pdf | Bin 0 -> 13941 bytes tests/test_tools_pdf2txt.py | 1 + 2 files changed, 1 insertion(+) create mode 100644 samples/sampleOneByteIdentityEncode.pdf diff --git a/samples/sampleOneByteIdentityEncode.pdf b/samples/sampleOneByteIdentityEncode.pdf new file mode 100644 index 0000000000000000000000000000000000000000..35abc4f80d19587d1abb9ffba8bed18d606cdb0b GIT binary patch literal 13941 zcmb7rWmsIzmTrLH9^4xU9-zDF#@*fBg1fuBJ0Z9P2nhrU8XyFBcMSx0cL{bo-lI3b?!qo< zXY6J!Zf@#mX3j2e?qKO=1%N^Y1zp`-%#H2QymDO8)+VvRU=1LbGK$OwSRMF6R1DD+ z0g)p~noX-tvXEX`m=;x@-nN%H%**+E#ETb@XmkxUa|g5kkN_L~QONnXVl{UsCp&X{ za|bs7kX_Z>(i+C;%_st6GiPE~axpV^v39Uzlre)bSi5;M2@3v^_D2TvZy91T;%eTm zZsztf4i=7pKf+c2FAjk@IoZV&jGYvX?akSh9Lz<%-OT?}zBIeKx0Cr_kN?SQ1C;@r zE+9_JJHkhmX}4(n}?b5Z7F(oIEHi9~gKm zF@qa7a^9}pEYn0)KlR^882IL#Sgzm(Q^AFF?HiBGof7w*^!w+=b;>aIii2sEHD4J> zU|NukHdOJD*LR6IHZBy*#1nd{j@>#@`gqGQ^6i+^PY>c7c?HooKSV=4oIc%QzamVo z{P#q1{hfsWy@EJET>l)*cOZu@AclnJ2T`snyVx}JmxhycVzGrsh-T4f{NkMCmb*vy z1FtD~mB(v^gqT#W&MtkJFC6w0?W5l|XY;4mowDJ4t?nyhqAzT(Q5Q zr`;B)$%*6Cdg0sX^tgxgIJxAy{_eIZqF;~E@+;PCLS#-~jNI>=GWVM}CZQtMqF1vf zh*<4JJt9DO9S}Sp$`8dVJZGtPk>r4#ui8yp5zQGcWPA{Q|I!bhY|_$QJKnLh`m4{I0&PIxo#jcR|_LRl)2B{o^5 zreQh1U9+xtwQM6Ms%^>=(uVk}*_s^2n~xFSw1aUh2uxFD_$&1zXr}zJrn(pFofoc9 zccaYfT-VyPYS_xmNW`zul~nWj|2_U7;QvbkKtLW2$Umjv$j{qPedckO$1~I4oFXjMGm?Z3l;e1BLrY9hdZi`Mxy`@XCZvTD?%iR!z32^Ccsac($(W zL}!_1nMN7BD$&bNHWfnsOU)-dt_&}$7JZ+aZ|*kpcvf7Gb&em8cYH@N0dY8!1(;D1 zUv-Ak8fPDBuKlgNu}D)a*fU(mVtX=Ml!d~RJf?E)SS)FM==?&Jd*4O;utaJTHXjdM zu^rbQe}Sp_){jrLI@RgJPp7eW1j%PNc2ZHv+bdkQ7MlU`2%@1#D;%ypA>1mS#J%jn zl5-*VgzS5{A!%KYiau?-$B5X0$!;%?v8juaZ)5u-a8?q?Rpy`9m5V;zOf6T;<@4)| z?mqHSByN68P9EM{mU~D{P9~YV=g|v|10G*IDebp(-zAf{=eJmq{ftT*x-S$UrE^VQ zwD{y*tvVXojN_$~_;hzgSoHjaRv=KK@#O?=Sh?-@MoGcRlgf0VQGIk8UxA1HWn1g0 
zun%c&AG`7P=hk~_>Px*#vemwq5DJC0m5r6qdYd{57mjhiqu&VOVS@yKAq^|~&3M@r z$w6B2`{RNwEUQI0zW`E!m=$5XiC6e9OMkPaBDaKWcg{zj=197|5iKZbz6O8)1TXO=p4PalxJaXOSbP#!52m`g4pX0UOvgM+XqGx z%NToDP3KP-0!W};P^?~BpNZ0&TuE%f9U2b&(>CH5d~)M_7&?_ z#f_;bKNeS2QIOthaqi#5$)*q3iWo@A`Ff6%jwCb|mf11B`enu;@iwitvJGiFY<;No zmxE=LOz4W&Zpcbl=2s)>@-UAEj!Cd!7;~A-@xFl8%lMLh0|X}Y@e{Af*P*;Ji4#`!ks$)_mb}8`OQ5K^5G_s zroT*y#pXfUX$jp-x}@b<;tKq<8A2hFMjtZs^%T#SlIkk3nDZvuxpu4d5BBzt9QIbi zjdbfQqNi!3!_>{G-Cv?=nM!Aoh%V~q>iimEHJ~Z2ydVw9TOH;r6+4tC9*>@{JC7(; z7M0j?vkK7?`D*Kg>zw#G2xUxTdDSMu7hI2(ZwbNu?S9k|@#7#WYn6A*YIiD4!_nE3 zPt1aGNQ~aGAEhg=GxJqjPv3lH2`N6Y5Jkrx*JpTa;d8{N;WQOt{gN zN_OgGeaK&&itITI3NWu+X`8re=}MoEEkI_svzU``ckS}LD#qqj4s0Uc$MT{Uclufv z^*@-tu)c?kYiZprO}tKt&VSt1%n)d4YVJ@AI0_f$R-dmU+iCh$z|EdHzof6IoI~>R zkd{%b5Bdc`Cpk$+T}lN8kd0c1h4A4yh-;>`Azq;virRBO|uTNEIqCqN^Tm(m0XKE#+&n- zMNdS`$f>u+TFk7ceTNpfM-x$7J!dOkt`3mZ8aSPURis+^>`5ZJ*F=K>WTQaKp;u?fWdHg!9bG9gz zOHqyZ<NYxH%-vhJJ9;rqGBNt+iThx^h>o^^dbEPD$$ zM|;_ZJ8(KY%%300a*r#=M(7h42u@X=)I7;B+_m81R|vu(5;2Xe<}>9fDmHN`bkJO) zB`xRv<4<^~O6_u-v6OSRc}csNXXZ1%M%PKxt>AAWjEA||t4U7_E4>Xnx;+cEL1D3C z^O8StZ&mjvb$oH%u9*_cn!hVfe0?%wLKm)DySEMn6(aY$O7><#I;M zznh{U0Ts~Wi#<3qZqpzJRo0gOL|jHLcSoye95lcbpQqOPWwnoFlk=GU!)I(dp56gl znO-a>Mi(Kj>e|QPehlh|V0GJ@mW4Eaob(@|_ zy$#6vwPGN_R5}Ox60>+nsUAM32x`1^5AAh(Gg9JV`q=6kR7OBGO)cH2Of>%)DSe!m z6Pgiq?<8F|Xt+w?yQZ*!q`b@C;-XdvM_X11EgxpWaqXwX7lBzbJ>oyRt;i7U^@|h= z?UPSCHI})3P(nzgFXXi)rEfnMZ|#pEX4dI@+B2}GMNY6M6DWw2^DBu9x^amvNgUBP zc16Ww9OlZ8!aKGm0Ej5Pr4QQi^bH!hbxBAoHYHeEt?2l42VyiaCeVl~swU9VzE0}0 zhl9ff@B?7)G{$E6wRC*L!c+M0z=Bf)v}gG>GW@*Hr}*KOs<&w23{3nYYtCqIKUZaU z5;4UY&C09=py@F#Ng8v6>oL74tV-&nN8?gy%@ja^ zAhTIOOSYhe_!q|xQSQsXs=ST;mP{5)Yq)P@`@0bO`7m7a?e%L;h;KmWXKzfruJb=} z-3`kv39bIJWOQ>*$0k*P`x5)ZNtlX^dvbBa-+Mgs6(foHW^3?MWMd`VHsh&W%tbNs zl?h-+!jEOi9)P1oVJD}+c1OS#P6(GMsw8INGe-VfFM^0N{!=LJAdUIHk!l+G@VSJn zFFcc!biJG7>V}eb<)w_Khw$%i)Dew=0k|stfotDH20%+nGD!w!>Y1WIpO*o33&`mu zN4~UTFWK>gcjfU;%Uh8#$ABTPPo784!8fZ&^i=%mDRNXMyy+SENy{~h2p`(Iz24eH 
z8ZNmRrDMg!C!2^BX|tm$3u1W_Vc0lstbr`^o@r(Y=O+f=HJRP^nxO03Z8TR1b6Ag$ zxpX61jCCf~fs*0utwW84!kWVsS_bluBIm%%c{gs$v|g{-1Ye+p$WTZtVg}ujYom{K zFCb@YpV%7lwRKN^fb$Niy7rRQ74{*)p*M5uHt^K#wCMAFdN3F!S1Xb^_rh+^?OiU=7Ki@fXRW5p_Iv)4ti1x0S=_`CP`^FN2Z zv$Nfn?SuMyujYSvQ0y4Y;Zu963n*)?gtcG`FT;CtY$_<^+|+)4wl*QnKFy~;d{f7Z zRzB-j=3Sz3s)y;VY5!Ft)zsWy^SwPqY68P5nv+DT7tL-TJU<-7f|wF3OEnCE5s7G3 z6hHK`5d||+AMWlZiN}g5C=DKh0FD*s$JqXYRn;=^aY;M!3ivkk8}M!T3^0&#wtedA zMWoo*pa=NPG3vMB>EV5cSReM7Wd1>vibk@EVbR}1GfE-hs;5RV$I7i_BVKdD0peOM`2o+8z>*M~ zO_5!ZRguYqfPsK7mZE3Y#I;(voK=&cwx^_V@@=@-Hk!JM25HR;IhN0<5f95TY_m}4 z`<)6DYZgm&Pm_l(QY%alcDt?`!8&Eo)+e-jjwu*@jeD|X!TTk5v?zglHdXnV51yFF zt5ZAaa@4Kl!Ru~P6ore=VL5rKC-HM8Xsa>HH)Xzj$i?ZA;33P1n!y9fsZlxuZH?vm zaQCfv4mcl4Ph*A6RxF#mi9yNs z>B^@QHm|;QI+cCt`1%O)f>*1TH*v?)ufA$2vcl{(V#)uKgBx?xJ|fe|6hszUT!$y3 zJvo4r4-Yhzykcabj1FleK3DGdPV!HxrM48|6y65ZCRMx1RdF#Q`-Z$lekN=te*k+i zc~Re{um0!|LS8}1!^%TzCH1580(;ST@!z^s@!gu7Dpzqd3eCS-BDBMDz~n^aLYzaM zL!3i82uK`J>F4D7b1IpUP!6b`4S(b-9tkZ zC=$*ND4MgvD3u{q!Cr-zLz~?ZIJoL$8EthZOntw(y{fBBj17*S#}4}Ple#Si-D8kp z^J8P43Wu&=<9K$H1A?m#X)sjz+7Hd!Gn-WBjD(~dD z%^6YwO@q}5MEPZ#kaPqrfA)sUmWKt3MrpxnympmljAx2xY(v@;lxNxpq0L{RnP2pSoY5f5qi^6vESnxA};&`e0 z2jxu4tCbUtD;&!B0(Yf9JG~xqAu&COaXD`Ek16&$+25BsW+ibOied6}%z8iE{Ik~n zl*0akflF4A?NeRAdlo@GQha8zgUS^{U)zgP#ID|vuU|;6Pv`k`XDfbuvbsK$Y-h>TC5Pr#i7xl7}|vy@}q3$CM;h*_f;%jc21t z=YMN&j~Yb>m}I`jCCLl|_y)x=BcYJKcf8n%H#?VTU|2Z;W4af=2!1b^+(2J_n0%I< z$j04La}^$w+Hk*#I7GDWb63{K>FDqLJ0T*-&*33!FW{knxR@{@hg|o)phJDwM!oR} zSM7(aEH3|F;&+v5+zQ55F8QAkO1u#dr$QyAkHfaTe&njY`wVJeU^vM6L0-y_>0y0( z)vs2u-*s~0)p zZSTvNUmQCp27)d>zX~}n^E|fBxF6kM?O5ctT}*g)T^A+VGg3?dC$f2}RY-Y!my8~M z^o~bVPX#`Wd?j)^<{JW2VlQ2P#HQqCWc9a@=!q$%UTA<|eWw0ug0-EC#A(LUdGNI)mn1o;_o+YXjh}K&B~T2G zv&eD?iAK#EF1irLTJPeQXrKKzkjndRV^e-K^Q}Rxd`>Kr?X-<=>dN zGrPaC)tJ{SaRRvs~Dd}aRSqt;0-$Nd6y3mMG3 z9nYM}H;@f-cG&ChmC<>sQ)$@RUS|7LEYq;FwPN@#qyd=_xtKQsc95{%%n=dJtO$9} zDn;;qMj}E!_FHdFrakdB30QcDCdtIxNhjpewD4sZF|~PUNR*YMa`#;xw?Q4WSr__3 
zv~9N3K+HgK^;)SC3%-mAaVb{HI(O>2XIxtt)_GY+YqP}?yMS9`Q7qs!Ll3n48(bUnJU>}E==$mE6LR7@UF z;KQ^@oGwM*0Ve53n|C7M#a{wUPWmk1*our9n5AKYslhMCeY$>%M~z~MlCfeY ztDcK!nb0UjoY~IlRj0m;*Tk&Tt#D8yUsEBsj-gd>J^-xWC-5T-Iu3C3w-nLAWn`B$ zq<8Cn5TQ>XB|gpM!m$S-e)Xm~4KsWxi%QJ~<@h#(gt%j@pgy-WaZVg?xcfsvPWNVD zrNw3I`!>UtT$^Cq1;e>R>*HyHM&u;VRC2Dtj<@9KujcbBC*Q_m^HdC8gywZAX5NeZ zP2FCTTkBwU(p7XR#&r}4@`5Nm+r$0G2xSxXMpJdh(DRVZAZp2vGIH8q==djJY1+)G zoYUGR5peb+qY@QPtDe(p^z;}3Me^q)hS7IvJ&-QH0ape&Hj9`2(EB!w*Sh?C8}$WW zuXJ%}QFScL4`PHsk#>!-;RJjjQiHSgvdkzKdM!N6l5;U5ek|pcq>7`Ir>Gq~Pd4B!iNZbR^@# zE)^RDSL*}?r>$hu0|!@KS-Pr|kzbHKWGL|#Snj@8UWaYm92yHQcRc``QMA9u>y)lW zhXygkc6V7(^jbuOTeb&;Dc)?X<_V0G73`3dMDL<98QWuwMxk~Cfr5D0U2(g>paYx% zP8Rxoy{MGrk{=U2Q5ic>o7#wY+?Q|Ig8Gt2xjW&^X3xULqmpq&@7!%9z&18Q<6r52 zmK>*`JF|a-!@;CZZ^B4vfPlvK!F@( z-^G6R>qvqDkvjr9l#53Kg=%CQEIM3Fgv@FR<338s^~K75Tnr5W=cc$6-9f2W^K{4* zDQ~0`s zdqUL9F%m0Q<@IJ%ksj=FATNwW-B;v=fR~T$Yv~BUmOcx~F1u1|g2z3>nHbNdlkS7m z;gIuvR;|&A&H)-YR=@qWYYEEfqFMYr=us9w#eV^Je2&iS$p3UjwUAEq=w`7!gRqp5 zqto#C8QE)cLz`TiGfsf&%t{~0NfOz4+hU+rKl}nQP8Sm?4RoNVqp7f~Zu@)MszO>^ zN}484mc}gkv5m2p6j%W1L=j~&>|+cGJtt%4j`&Enb3pF>fi~7o)VK`6O=Tq>znHbD zBf;sS&-gBd{Vm3kuOSvlX3f>_K2xoDlXBbdc3iE38;+^Y4S^Na3g;*7)*FJ;8e(#z zDuFKq_4LsdK7$xEJTSp9W+6o?1VZ5>9FGWnXzQP7IFXUpsi%#W=icJWfgf zMJuIon{~6gbL?khy_aMy*H^r~bNi|*cSmkpVbsPK=Y+9|Zj3VEo#V}nfVx(7ed|(Z zG>L?%|6$#!=bq_~bnISc=isu^0ybc9Oseno!Mefc-tSA;0a?$*_sdQnhZKBR#;AKE3A3~|l)4IH<;uZWx(?;E9@RlkMb$-FV> zuFVu#uru?-{q?DXzB9l+u^PPfwfDOi<;vEFRv)__?0qyxTORHE?cu97mgbxKkhsAf zf|M*WNqOHYTMyZ)0+0FVBJu|Bvj^jbc#704!Mo9|)T;_li~G0b_1L+*%~dsSX1YEz zhtO14s#aPtM+GDY!Nq%~8-{UCTHQC8GE-Xz84!imh>3bIA;M5C7rP3MDrvp~_S!nZ z02YOhhyv1=0RWVw$Pk%2{f%`r!e$i0rGx>wGdiwVfgo0SSk6e$$5{NO51mvIZNEa( zS9yFhCf2Y%=d={>$audANHdUrO_i9`lyED=FY8hf@e84!pOBqny$7jTJywfjeh(hC z4-ZE!>}z<`kZ1!=x0qO2uwUA^h(s}Ua4F}jMj96s6gUpJ*EBC!q*8FAQ*#YkDk&gF zs(CzCMO(%15YxEDuWgd>Ibrhe}c;`GLd2B+|4YVHi}`;r^FP9)5<_9)A%tcRz5 z>^po*Id*e}=BLtleeU(ZFI2}DZL(j!rtCaw2K040mulY~t@L}6cYG(z+u1maqS_!= 
z>2mhSL9(5+8&&Zb3Ik~gZ7)_*GFSG0=ul|>t#33rzSt<6lt%WTos<_@FVhQUPGat5 z>m(#7U<#csXT%gK4;B)AOG}?Own>7J$bK%s-?gDR8aG`*swsi+Vm&3AqO?$vm<(gk z;0;I!fk$1VtU6|kJM5C2NBM=~t=|OnN;N<1cWF5KS|lnNkRI7`v;Q6XwOh4VW$S1H z8VC70k%23oiSF3Bj*TZK{dbX{_P-LHfEgPPkK2 z29#dT+TB+!Mp_k0trI?3K)mP|ttufS?63JcD1U@EDv*jMN0hS)V$KG!p9O5U)z=Fb zsmYBMHNTK;zwqe@+Vx-(s<9`SniR(t;C0QMw$X5VJLs^#y4_NLK;*@I-R4|$G_kVY zn-KU$7hGvEqo(93;3}ke?G0*kP|AG>&1G7D&qLsOMECa2GgPZ&on5pjJxbR|V8)ox zL(o1%;mB?(hP=u7pl+bx%{#1`_rz9r$@tg?SW-1Hpp-QW@REE)cD6Q+Iv%7315%Tb z*TNvBSh!c{^Q%;rF@h?ZYLb6eT7PGZLr-(AJc3}aW$}073@a3nJ;Cmt05^*|*xS=-vf4OHrYYGNGxncPBH zQz|1qmPpwu2bNp>-}W@ zy8K>yGPzOFvi7%K!t{m^BnJZv&Wd zo3sXJ(9J5RdtdMJ+e>B}RMFS#h>wz5{$M(_ntW9;!>lVO24pW#<%@DnE?@&sOGGqr zMvDi~sOTk^RP2(uDcIkCSLdVSRB~n)kJx;TGxIsx!Ke8I%;fVXE1Gg=rKE4|g8lK7 zNT{={@zaOr{o1mVCu)?R4^uOV0yk60S%Dy^pN+m0#d7s3H>ZD`H^18rc2}U6+f3_C zV8hL&GtJtss>KJa@;UK`16ABEY`t%q2*^I=VqEOxa~_E*wU3Ijg%-9Tt9~m?k_zgN zPz7E3P8OZDRCz`BZ-R0{$i5!~S)<9@+mGd6zh?enxe9l0&_~45F8|&9WPNYg>?WIO#2nj)X9GWKVk9^$j znvTyamOKfpw*z`-Tsb^?FFvs7&`McgGBKUGxt;1edt681F6hu!UM2+|d&hr3x@>&? 
z6}-CTgF#mKjkg)gR1EUD>viYH6*IMd#h{6^`9VGR5VXBZANhLe*BQdE?p$mEY&5|} zfm2NKcw-WZ2tL?q>{McRq&f}#aBnMI-l(XW6j}}^e^kWh$x_Cm2h5wa-D|VaqZ2OQ z)aOL74^}_)c0hQbSMCi8curevH>%`@AZarUZ(Vs$9Ic28RDZH#nD z4Sl=AO_SPqBvv#C3TF9V2IT8ezng6(QnR#O^p04jC`x5~Nddf%;3CmZ*PDBfkg)TW z&v*oXFF8c4fC+ngqee8ZYv(5Y{p|+(!iCZ6Gj-1yAKGH{y%_i#Ch+6Fc`cs3riMG@ ztF>R_M(rJI`>n5`qaTg)1P^~|=Ix>Bubapp8N4}O5`~QP;V`y zukciKqo)cZ-=WpX$R?y*4!?FKt0DQRz8jV3(&MZ4DhInrfR5c25%JKI@?@Cl3xHZ_7y?HdX{LTdJ9Nsa-KR zC1^@{LNHcykdoci?PpL>_A-clieiR&2|C_NoND1dXejJ|U9S8@TqDMi?Fb*4UG+7qky5=B7g)3{!uM#p?_NG_AonqS8YEtKCj)Te1>G!IWt-{c zB@iv1q9GW<`8tQS&~%K+<6EJ8*^Ak264I7lH;J6z_-P_RG}`gX0<_54ZP9?O`13W2 z)hWG87V|>C=;%(nWYU<3c(roK`Bl4kqE0^vx_}Y&+pA87{xvj%w$zUVMi*@Y+SZNF zeSwNB-=UtDIW9_jE!diN4*IuF*I^Ic-G#ExyLWgTgM^1AeqdIJt*<0t$kdrIO$(dK z^R0x>^)c$~rQh6Y$ng95+s^YnS_Za^Bw4Suqn38NO@Ex)k@SZ*@g63oBUSO^d+k@> zzKF<6o}>1oz)#w~q@fp?u@qFuaY5;0iX2uB5W!k&44~GfJhZePhj(vEeDTg0Zw)*# zU!D|@nD2{Xu~GG7vr?=8Yqw}Nnt(I%4+V09_(B){yxphd3 z{irNK#D`?GiYP>)K2jqpYBW-~mCcP=bZvlTSn|g>%B@9bY-6G#&_i(XJFrhYgz)LU#^58x&*YJ&_qkw^e@S7t|3C zGembE7(|5=@g7xbDxB80s=YGdDrb{FM$)jdL5vtc{sQsOdpnG|E@Md=F@8fabhji{ z5AsTZG74f(NSaEC@7dn+se;7?4@HPYLeLl;co=tOFO|x6^$TpAon!bA&ee}bnqmtSaiKo*&r@p(abtL9afIfc{3%L5u<(?Bk zz$SS9IbjaiX-mE@GJKzT_a(TBY`=yfDYdUKFR!jsYHbIjM3KjbAHa( z(CV!Aqpq@}1j;!96E#2H=6<2#FSl~quA;fCA6^TNi(5Bd8j@HZrYXT96XaHf zmpndE;23AiGL#7HA(b{g^}prb2(Jsi%TO@75&nIsyH2^bafd^t%A+ol)udjmUOe43 z!?00aL7CFl?bU|Se z@HoKV<=6l2#U6@G_2`L$SSn<1rD(z5`ms$~3VBkPR2sn=fLRm_NkaPnn>;?}+{)MS zWo7DzJ(=gl>H-!ENvOm^wS|6SZNb0$E-U_&d+6PNBWzf)iPe82@->n_cXRiathNb3 zpwALOc7wtz34VV0KEgJ0B{BhYEWOP;4!Y*%{d{dOv}8lq2CpN^BC|dB(NadUQv7UU z*k0;5Fl!`egSswTAZWj_E6KkkVIk_=+8_OAJUc>5(gI6etDZaM5&e-q%5P)Jfnk}9 zKrS02EL!$UdxlJ*X!40+!RV!D>x6Zx5&X92Z28k>5DWKHLuQ4z?cQ_!$V&1A$B;1+uHbaN?Q(5C``^px%;> z4sL&zyI4E9Il=(&uqh15Yz_we1+9hQ$nA_RT>+549{hzE*OXFHkq}`~Gq-d!*O2*( zE-LEir3d^=JAela0&sJJ4A^DdjP0yVMI0>c%wfoOk$)HnGvIFoI|%f*2md>I8pI7- z@?QY_qcH_JQ4##eGh}_zqDxWc3J3;1o+xgCL1yY@p%reO=Y*8i6xR)~Y|#L*kT@zDh(ZxqK;r|)$t%Q&P{G$HT^2V^z 
z=jX@12I+UNW&Z1{8h!M_Qy)X;-xJ6Ex7*@c04N6!`~L*sS93RU`wNRNBmO5Te{lI= zc2Q$j^FK58{~-&1XA$tHG*ti2FT0i=P>&PBX#n5=>w&o-1^@_9j{^t>z<#hXL=Oz& zf{}m#*c~?K)`Qvf$FBeU8Nder1C9I7Is;gV>G43Je<}?C<>G+Nx%Ie!AOI8y)5itY ztU!3U4FKFgJt!v-b_eT0dAMOJK(H}XkCPiFlbcHq2H7?M z@W6PuU_3lrdQc$D9&T;`gcA(A!^U6^00)TUA7f5V02l)E1q9^MgYa7#0a24{QM8 zgqaRw218-C!d4ml$(XvMl#Dp6j{+v*Pq_cCYslaE{m&%+$sX{3&E8)<7k{Ny!U5KL z0^7WRKiwU$3_5B!SpR880sLt+P-XweJ~uOl?NL|}xa$3>BvpgIHFNx}SrrTg{4wO8 z?E0teCI(v$J1|@Ue_CMvqf(TOEzMyaD6Wp~E~e(Le|lnIOn^TkVKUfN9o=B9F|eJi zU~Xn@{ErO}1VDM9Y+!BxHw47S&11l>Z0z#q7y|tz^>4HOn*T?m&0oI$QRT`m=IH*1 znO)A>%vJBN82*zhu+08fbIxCD#EfAFilZe=2W;)XtN5p&E4w(Fxtp51Fj}jdd%3Au z8M`>W13=jzY!JW~Fc1i21p{HyLFydZ>HtwqQ4xSXqctoXE)M2y0Ao{_@2&tTdlP9s z0POrScW^Z~19+IbxLP|p=rb{~i@`d9++bsISTq>Ld10bBft(yb5CjUlumHgfKp+DX zyMm+H|CQnI*m${^Tc7~}ASfElvj6x1IH6DuD8K^n4;qLA2!*-#=L2x~4;pL?g4M{s z&|rM9%KmROFb`~>|0@j$1Oxv&A50tvta0dHH8Nx zAdrU(RSuUwJjqK(BtPZ9Sdibs z!~y~anLxRrU=vOg9&R%pV^bb(6EM`22Vx9{y@USLCLP$=)C|mO3X{gg3N?eRVTEvm fOw3sSsxCBFH)9vKzY+@KfIxZB=;$PrB+>p4`g0MS literal 0 HcmV?d00001 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 18be203..6126d92 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -24,6 +24,7 @@ class TestDumpPDF(): run('../samples/','simple1') run('../samples/','simple2') run('../samples/','simple3') + run('../samples/','sampleOneByteIdentityEncode') def test_2(self): run('../samples/nonfree/','dmca') From 3125d3634adb4f3f395322d254ffa3e4bd4a73de Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 11:03:28 +0530 Subject: [PATCH 21/34] Correct old test cases --- tests/test_pdfencoding.py | 40 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 396d12d..5878eba 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -3,7 +3,7 @@ # -*- coding: utf-8 -*- import nose, logging, os -from pdfminer.cmapdb import 
IdentityCMap, CMap +from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte from pdfminer.pdffont import PDFCIDFont from pdfminer.pdftypes import PDFStream from pdfminer.psparser import PSLiteral @@ -14,13 +14,13 @@ class TestPDFEncoding(): stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') spec = {'Encoding': stream} font = PDFCIDFont(None, spec) - assert isinstance(font.cmap, CMap) + assert isinstance(font.cmap, IdentityCMapByte) def test_cmapname_onebyteidentityH(self): stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') spec = {'Encoding': stream} font = PDFCIDFont(None, spec) - assert isinstance(font.cmap, CMap) + assert isinstance(font.cmap, IdentityCMapByte) def test_cmapname_V(self): stream = PDFStream({'CMapName': PSLiteral('V')}, '') @@ -68,6 +68,40 @@ class TestPDFEncoding(): font = PDFCIDFont(None, spec) assert isinstance(font.cmap, IdentityCMap) + def test_encoding_DLIdentH(self): + spec = {'Encoding': PSLiteral('DLIdent-H')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentV(self): + spec = {'Encoding': PSLiteral('DLIdent-V')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('DLIdent-H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('DLIdent-V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_stream(self): + stream = PDFStream({'CMapName':'DLIdent-H'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentV_as_stream(self): + stream = PDFStream({'CMapName':'DLIdent-V'}, '') + spec 
= {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + def test_font_without_spec(self): font = PDFCIDFont(None, {}) assert isinstance(font.cmap, CMap) From 3d549ea48c11a50d427f4636fb060eac654c044d Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 16:48:40 +0530 Subject: [PATCH 22/34] Removes code comments --- pdfminer/cmapdb.py | 28 ++-------------------------- pdfminer/pdffont.py | 44 +++++++++----------------------------------- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 83110e7..1681a8d 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -1,5 +1,3 @@ - - """ Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode @@ -40,8 +38,6 @@ class CMapError(Exception): pass -## CMapBase -## class CMapBase(object): debug = 0 @@ -67,8 +63,6 @@ class CMapBase(object): return -## CMap -## class CMap(CMapBase): def __init__(self, **kwargs): @@ -119,8 +113,6 @@ class CMap(CMapBase): return -## IdentityCMap -## class IdentityCMap(CMapBase): def decode(self, code): @@ -140,8 +132,7 @@ class IdentityCMapByte(IdentityCMap): else: return () -## UnicodeMap -## + class UnicodeMap(CMapBase): def __init__(self, **kwargs): @@ -162,8 +153,6 @@ class UnicodeMap(CMapBase): return -## FileCMap -## class FileCMap(CMap): def add_code2cid(self, code, cid): @@ -182,8 +171,6 @@ class FileCMap(CMap): return -## FileUnicodeMap -## class FileUnicodeMap(UnicodeMap): def add_cid2unichr(self, cid, code): @@ -201,8 +188,6 @@ class FileUnicodeMap(UnicodeMap): return -## PyCMap -## class PyCMap(CMap): def __init__(self, name, module): @@ -213,8 +198,6 @@ class PyCMap(CMap): return -## PyUnicodeMap -## class PyUnicodeMap(UnicodeMap): def __init__(self, name, module, vertical): @@ -227,8 +210,6 @@ class PyUnicodeMap(UnicodeMap): return -## CMapDB -## class CMapDB(object): _cmap_cache = {} @@ -284,8 +265,6 @@ class 
CMapDB(object): return umaps[vertical] -## CMapParser -## class CMapParser(PSStackParser): def __init__(self, cmap, fp): @@ -373,7 +352,6 @@ class CMapParser(PSStackParser): s1 = nunpack(svar) e1 = nunpack(evar) vlen = len(svar) - #assert s1 <= e1, str((s1, e1)) for i in range(e1-s1+1): x = sprefix+struct.pack('>L', s1+i)[-vlen:] self.cmap.add_code2cid(x, cid+i) @@ -400,7 +378,6 @@ class CMapParser(PSStackParser): continue s1 = nunpack(s) e1 = nunpack(e) - #assert s1 <= e1, str((s1, e1)) if isinstance(code, list): for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) @@ -435,17 +412,16 @@ class CMapParser(PSStackParser): return -# test def main(argv): args = argv[1:] for fname in args: fp = open(fname, 'rb') cmap = FileUnicodeMap() - #cmap = FileCMap() CMapParser(cmap, fp).run() fp.close() cmap.dump() return + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 4bfd6ed..e94b383 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -35,7 +35,6 @@ from .utils import nunpack log = logging.getLogger(__name__) - def get_widths(seq): widths = {} r = [] @@ -54,10 +53,6 @@ def get_widths(seq): widths[i] = w r = [] return widths -#assert get_widths([1]) == {} -#assert get_widths([1,2,3]) == {1:3, 2:3} -#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8} - def get_widths2(seq): widths = {} @@ -77,13 +72,8 @@ def get_widths2(seq): widths[i] = (w, (vx, vy)) r = [] return widths -#assert get_widths2([1]) == {} -#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))} -#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))} -## FontMetricsDB -## class FontMetricsDB(object): @classmethod @@ -91,8 +81,6 @@ class FontMetricsDB(object): return FONT_METRICS[fontname] -## Type1FontHeaderParser -## class Type1FontHeaderParser(PSStackParser): KEYWORD_BEGIN = KWD(b'begin') @@ -142,6 +130,10 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', 
'5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') +""" +Note: DLIdent-* isn't found in PDF Reference but is been kept as +it is harmless and have possibility of been a type. (induced from bug report/PR) +""" IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', 'DLIdent-H':'Identity-H', @@ -150,10 +142,6 @@ IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'OneByteIdentityV':'OneByteIdentityV', } -## CFFFont -## (Format specified in Adobe Technical Note: #5176 -## "The Compact Font Format Specification") -## def getdict(data): d = {} fp = BytesIO(data) @@ -281,6 +269,7 @@ class CFFFont(object): 'Light', 'Medium', 'Regular', 'Roman', 'Semibold', ) + class INDEX(object): def __init__(self, fp): @@ -381,9 +370,6 @@ class CFFFont(object): assert False, str(('Unhandled', format)) else: raise ValueError('unsupported charset format: %r' % format) - #print self.code2gid - #print self.name2gid - #assert 0 return def getstr(self, sid): @@ -392,8 +378,6 @@ class CFFFont(object): return self.string_index[sid-len(self.STANDARD_STRINGS)] -## TrueTypeFont -## class TrueTypeFont(object): class CMapNotFound(Exception): @@ -479,8 +463,6 @@ class TrueTypeFont(object): return unicode_map -## Fonts -## class PDFFontError(PDFException): pass @@ -492,7 +474,6 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding') LITERAL_TYPE1C = LIT('Type1C') -# PDFFont class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): @@ -557,7 +538,6 @@ class PDFFont(object): return sum(self.char_width(cid) for cid in self.decode(s)) -# PDFSimpleFont class PDFSimpleFont(PDFFont): def __init__(self, descriptor, widths, spec): @@ -594,7 +574,6 @@ class PDFSimpleFont(PDFFont): raise PDFUnicodeNotDefined(None, cid) -# PDFType1Font class PDFType1Font(PDFSimpleFont): def __init__(self, rsrcmgr, spec): @@ -626,14 +605,12 @@ class PDFType1Font(PDFSimpleFont): return '' % self.basefont -# PDFTrueTypeFont class PDFTrueTypeFont(PDFType1Font): def __repr__(self): return '' % 
self.basefont -# PDFType3Font class PDFType3Font(PDFSimpleFont): def __init__(self, rsrcmgr, spec): @@ -656,7 +633,6 @@ class PDFType3Font(PDFSimpleFont): return '' -# PDFCIDFont class PDFCIDFont(PDFFont): def __init__(self, rsrcmgr, spec, strict=settings.STRICT): @@ -721,9 +697,9 @@ class PDFCIDFont(PDFFont): """ For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an - attribure of spec['Encoding']. - The horizaontal/vertical modes are mentioned with diffrent name - such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V' + attribute of spec['Encoding']. + The horizontal/vertical modes are mentioned with different name + such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. """ try: spec_encoding = spec['Encoding'] @@ -771,16 +747,14 @@ class PDFCIDFont(PDFFont): except KeyError: raise PDFUnicodeNotDefined(self.cidcoding, cid) - -# main def main(argv): for fname in argv[1:]: fp = open(fname, 'rb') - #font = TrueTypeFont(fname, fp) font = CFFFont(fname, fp) print (font) fp.close() return + if __name__ == '__main__': sys.exit(main(sys.argv)) From abd685fdc6853eac2df7b01a65f98f5264db6f08 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 17:13:27 +0530 Subject: [PATCH 23/34] Corrects Code Comment --- pdfminer/pdffont.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e94b383..5217071 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -130,9 +130,9 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -""" -Note: DLIdent-* isn't found in PDF Reference but is been kept as -it is harmless and have possibility of been a type. (induced from bug report/PR) + +#Note: DLIdent-* isn't found in PDF Reference but is been kept as +#it is harmless and have possibility of been a type. 
(induced from bug report/PR) """ IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', From 7c03d96d25c2a06a5cec4f2506d8bf36f3441158 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 17:16:10 +0530 Subject: [PATCH 24/34] Corrects Comment --- pdfminer/pdffont.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 5217071..be9ef8b 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -133,7 +133,6 @@ NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', Non #Note: DLIdent-* isn't found in PDF Reference but is been kept as #it is harmless and have possibility of been a type. (induced from bug report/PR) -""" IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', 'DLIdent-H':'Identity-H', From 106a09c5bbfe9ac3ad5ce3b87d4ce403758087cd Mon Sep 17 00:00:00 2001 From: Tony Tong Date: Sat, 12 Oct 2019 17:35:46 -0400 Subject: [PATCH 25/34] fix stoke color and non-stroke color in PDFGraphicState --- pdfminer/pdfinterp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index a14f64a..de54835 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -598,25 +598,25 @@ class PDFPageInterpreter(object): # setrgb-stroking def do_RG(self, r, g, b): - self.graphicstate.color = (r, g, b) + self.graphicstate.scolor = (r, g, b) #self.do_CS(LITERAL_DEVICE_RGB) return # setrgb-non-stroking def do_rg(self, r, g, b): - self.graphicstate.color = (r, g, b) + self.graphicstate.ncolor = (r, g, b) #self.do_cs(LITERAL_DEVICE_RGB) return # setcmyk-stroking def do_K(self, c, m, y, k): - self.graphicstate.color = (c, m, y, k) + self.graphicstate.scolor = (c, m, y, k) #self.do_CS(LITERAL_DEVICE_CMYK) return # setcmyk-non-stroking def do_k(self, c, m, y, k): - self.graphicstate.color = (c, m, y, k) + self.graphicstate.ncolor = (c, m, y, k) #self.do_cs(LITERAL_DEVICE_CMYK) return From 
4df6d4e5caab3ccb98f288a763e3bee2868a148f Mon Sep 17 00:00:00 2001 From: "D.A.Bashkirtsev" Date: Tue, 15 Oct 2019 19:11:54 +0500 Subject: [PATCH 26/34] Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs --- pdfminer/image.py | 6 +-- tests/test_tools_pdf2txt.py | 75 ++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 25 deletions(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index e85815c..39265fb 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -74,7 +74,7 @@ class ImageWriter(object): if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: ext = '.jpg' elif (image.bits == 1 or - image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)): + image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): ext = '.%dx%d.bmp' % (width, height) else: ext = '.%d.%dx%d.img' % (image.bits, width, height) @@ -101,7 +101,7 @@ class ImageWriter(object): for y in range(height): bmp.write_line(y, data[i:i+width]) i += width - elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB: + elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: bmp = BMPWriter(fp, 24, width, height) data = stream.get_data() i = 0 @@ -109,7 +109,7 @@ class ImageWriter(object): for y in range(height): bmp.write_line(y, data[i:i+width]) i += width - elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY: + elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: bmp = BMPWriter(fp, 8, width, height) data = stream.get_data() i = 0 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 6126d92..188f652 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,45 +1,51 @@ -#!/usr/bin/env python +import os +from shutil import rmtree +from tempfile import NamedTemporaryFile, mkdtemp -# -*- coding: utf-8 -*- 
- -import nose, logging, os +import nose import tools.pdf2txt as pdf2txt -path=os.path.dirname(os.path.abspath(__file__))+'/' -def run(datapath,filename,options=None): - i=path+datapath+filename+'.pdf' - o=path+filename+'.txt' +def full_path(relative_path_to_this_file): + this_file_dir = os.path.dirname(os.path.abspath(__file__)) + abspath = os.path.abspath(os.path.join(this_file_dir, relative_path_to_this_file)) + return abspath + + +def run(datapath, filename, options=None): + i = full_path(datapath + filename + '.pdf') + o = full_path(filename + '.txt') if options: - s='pdf2txt -o%s %s %s'%(o,options,i) + s = 'pdf2txt -o%s %s %s' % (o, options, i) else: - s='pdf2txt -o%s %s'%(o,i) + s = 'pdf2txt -o%s %s' % (o, i) pdf2txt.main(s.split(' ')[1:]) + class TestDumpPDF(): def test_1(self): - run('../samples/','jo') - run('../samples/','simple1') - run('../samples/','simple2') - run('../samples/','simple3') + run('../samples/', 'jo') + run('../samples/', 'simple1') + run('../samples/', 'simple2') + run('../samples/', 'simple3') run('../samples/','sampleOneByteIdentityEncode') def test_2(self): - run('../samples/nonfree/','dmca') + run('../samples/nonfree/', 'dmca') def test_3(self): - run('../samples/nonfree/','f1040nr') + run('../samples/nonfree/', 'f1040nr') def test_4(self): - run('../samples/nonfree/','i1040nr') + run('../samples/nonfree/', 'i1040nr') def test_5(self): - run('../samples/nonfree/','kampo') + run('../samples/nonfree/', 'kampo') def test_6(self): - run('../samples/nonfree/','naacl06-shinyama') + run('../samples/nonfree/', 'naacl06-shinyama') # this test works on Windows but on Linux & Travis-CI it says # PDFSyntaxError: No /Root object! - Is this really a PDF? 
@@ -50,13 +56,38 @@ class TestDumpPDF(): """ def test_8(self): - run('../samples/contrib/','2b','-A -t xml') + run('../samples/contrib/', '2b', '-A -t xml') def test_9(self): - run('../samples/nonfree/','175') # https://github.com/pdfminer/pdfminer.six/issues/65 + run('../samples/nonfree/', '175') # https://github.com/pdfminer/pdfminer.six/issues/65 def test_10(self): - run('../samples/scancode/','patchelf') # https://github.com/euske/pdfminer/issues/96 + run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96 + + +class TestDumpImages(object): + + def extract_images(self, input_file): + output_dir = mkdtemp() + with NamedTemporaryFile() as output_file: + commands = ['-o', output_file.name, '--output-dir', output_dir, input_file] + pdf2txt.main(commands) + image_files = os.listdir(output_dir) + rmtree(output_dir) + return image_files + + def test_nonfree_dmca(self): + """Extract images of pdf containing bmp images + + Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131 + """ + image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf')) + assert image_files[0].endswith('bmp') + + def test_nonfree_175(self): + """Extract images of pdf containing jpg images""" + self.extract_images(full_path('../samples/nonfree/175.pdf')) + if __name__ == '__main__': nose.runmodule() From 7e40fde3207b9f07a95d832d8da2d80852898b40 Mon Sep 17 00:00:00 2001 From: jet457 Date: Thu, 17 Oct 2019 03:04:25 -0700 Subject: [PATCH 27/34] Removing assertion in drange to allow equal inputs (#246) and mimic behaviour of built-in method range Fixes #66, since it now allows the bbox to have 0 width or 0 height Added tests for Plane since it is the API that uses drange --- pdfminer/utils.py | 1 - tests/test_utils.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 tests/test_utils.py diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 31c608a..1eeefc3 100644 --- 
a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -161,7 +161,6 @@ def fsplit(pred, objs): # drange def drange(v0, v1, d): """Returns a discrete range.""" - assert v0 < v1, str((v0, v1, d)) return range(int(v0)//d, int(v1+d)//d) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..e7d9d71 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,40 @@ +from nose.tools import assert_equal + +from pdfminer.layout import LTComponent +from pdfminer.utils import make_compat_str, Plane + + +class TestPlane(object): + def test_find_nothing_in_empty_bbox(self): + plane, _ = self.given_plane_with_one_object() + result = list(plane.find((50, 50, 100, 100))) + assert_equal(result, []) + + def test_find_nothing_after_removing(self): + plane, obj = self.given_plane_with_one_object() + plane.remove(obj) + result = list(plane.find((0, 0, 100, 100))) + assert_equal(result, []) + + def test_find_object_in_whole_plane(self): + plane, obj = self.given_plane_with_one_object() + result = list(plane.find((0, 0, 100, 100))) + assert_equal(result, [obj]) + + def test_find_if_object_is_smaller_than_gridsize(self): + plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100) + result = list(plane.find((0, 0, 100, 100))) + assert_equal(result, [obj]) + + def test_find_object_if_much_larger_than_gridsize(self): + plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10) + result = list(plane.find((0, 0, 100, 100))) + assert_equal(result, [obj]) + + @staticmethod + def given_plane_with_one_object(object_size=50, gridsize=50): + bounding_box = (0, 0, 100, 100) + plane = Plane(bounding_box, gridsize) + obj = LTComponent((0, 0, object_size, object_size)) + plane.add(obj) + return plane, obj \ No newline at end of file From 9fd7172f7b6a9820006411a60d314789d29e7a2a Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Thu, 17 Oct 2019 12:14:02 +0200 Subject: [PATCH 28/34] Cleanup utils.py --- pdfminer/utils.py | 134 
++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 75 deletions(-) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 1eeefc3..4fb5825 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -1,61 +1,63 @@ - """ Miscellaneous Routines. """ import struct -# from sys import maxint as INF #doesn't work anymore under Python3, -# but PDF still uses 32 bits ints -INF = (1<<31) - 1 -import six #Python 2+3 compatibility +import six + +# from sys import maxint as INF doesn't work anymore under Python3, but PDF still uses 32 bits ints +INF = (1 << 31) - 1 if six.PY3: import chardet # For str encoding detection in Py3 + unicode = str + def make_compat_bytes(in_str): - "In Py2, does nothing. In Py3, converts to bytes, encoding to unicode." + """In Py2, does nothing. In Py3, converts to bytes, encoding to unicode.""" assert isinstance(in_str, str), str(type(in_str)) if six.PY2: return in_str else: return in_str.encode() + def make_compat_str(in_str): - "In Py2, does nothing. In Py3, converts to string, guessing encoding." + """In Py2, does nothing. In Py3, converts to string, guessing encoding.""" assert isinstance(in_str, (bytes, str, unicode)), str(type(in_str)) if six.PY3 and isinstance(in_str, bytes): enc = chardet.detect(in_str) in_str = in_str.decode(enc['encoding']) return in_str + def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'): - "When Py2 str.encode is called, it often means bytes.encode in Py3. This does either." + """When Py2 str.encode is called, it often means bytes.encode in Py3. 
This does either.""" if six.PY2: assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring)) return bytesorstring.encode(encoding, erraction) if six.PY3: - if isinstance(bytesorstring, str): return bytesorstring + if isinstance(bytesorstring, str): + return bytesorstring assert isinstance(bytesorstring, bytes), str(type(bytesorstring)) return bytesorstring.decode(encoding, erraction) -## PNG Predictor -## + def apply_png_predictor(pred, colors, columns, bitspercomponent, data): if bitspercomponent != 8: # unsupported raise ValueError("Unsupported `bitspercomponent': %d" % bitspercomponent) nbytes = colors * columns * bitspercomponent // 8 - i = 0 buf = b'' line0 = b'\x00' * columns - for i in range(0, len(data), nbytes+1): + for i in range(0, len(data), nbytes + 1): ft = data[i] if six.PY2: ft = six.byte2int(ft) i += 1 - line1 = data[i:i+nbytes] + line1 = data[i:i + nbytes] line2 = b'' if ft == 0: # PNG none @@ -66,14 +68,14 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): for b in line1: if six.PY2: b = six.byte2int(b) - c = (c+b) & 255 + c = (c + b) & 255 line2 += six.int2byte(c) elif ft == 2: # PNG up for (a, b) in zip(line0, line1): if six.PY2: a, b = six.byte2int(a), six.byte2int(b) - c = (a+b) & 255 + c = (a + b) & 255 line2 += six.int2byte(c) elif ft == 3: # PNG average (UNTESTED) @@ -81,7 +83,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): for (a, b) in zip(line0, line1): if six.PY2: a, b = six.byte2int(a), six.byte2int(b) - c = ((c+a+b)//2) & 255 + c = ((c + a + b) // 2) & 255 line2 += six.int2byte(c) else: # unsupported @@ -91,8 +93,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): return buf -## Matrix operations -## +# Matrix operations MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) @@ -100,40 +101,38 @@ def mult_matrix(m1, m0): (a1, b1, c1, d1, e1, f1) = m1 (a0, b0, c0, d0, e0, f0) = m0 """Returns the multiplication of two matrices.""" - return (a0*a1+c0*b1, 
b0*a1+d0*b1, - a0*c1+c0*d1, b0*c1+d0*d1, - a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) + return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1, + a0 * c1 + c0 * d1, b0 * c1 + d0 * d1, + a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0) def translate_matrix(m, v): """Translates a matrix by (x, y).""" (a, b, c, d, e, f) = m (x, y) = v - return (a, b, c, d, x*a+y*c+e, x*b+y*d+f) + return a, b, c, d, x * a + y * c + e, x * b + y * d + f def apply_matrix_pt(m, v): (a, b, c, d, e, f) = m (x, y) = v """Applies a matrix to a point.""" - return (a*x+c*y+e, b*x+d*y+f) + return a * x + c * y + e, b * x + d * y + f def apply_matrix_norm(m, v): """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))""" (a, b, c, d, e, f) = m (p, q) = v - return (a*p+c*q, b*p+d*q) + return a * p + c * q, b * p + d * q -## Utility functions -## +# Utility functions -# isnumber def isnumber(x): return isinstance(x, (six.integer_types, float)) -# uniq + def uniq(objs): """Eliminates duplicated elements.""" done = set() @@ -145,7 +144,6 @@ def uniq(objs): return -# fsplit def fsplit(pred, objs): """Split a list into two classes according to the predicate.""" t = [] @@ -155,16 +153,14 @@ def fsplit(pred, objs): t.append(obj) else: f.append(obj) - return (t, f) + return t, f -# drange def drange(v0, v1, d): """Returns a discrete range.""" - return range(int(v0)//d, int(v1+d)//d) + return range(int(v0) // d, int(v1 + d) // d) -# get_bound def get_bound(pts): """Compute a minimal rectangle that covers all the points.""" (x0, y0, x1, y1) = (INF, INF, -INF, -INF) @@ -173,10 +169,9 @@ def get_bound(pts): y0 = min(y0, y) x1 = max(x1, x) y1 = max(y1, y) - return (x0, y0, x1, y1) + return x0, y0, x1, y1 -# pick def pick(seq, func, maxobj=None): """Picks the object obj where func(obj) has the highest value.""" maxscore = None @@ -187,7 +182,6 @@ def pick(seq, func, maxobj=None): return maxobj -# choplist def choplist(n, seq): """Groups every n elements of the list.""" r = [] @@ -199,7 +193,6 @@ def choplist(n, seq): 
return -# nunpack def nunpack(s, default=0): """Unpacks 1 to 4 or 8 byte integers (big endian).""" l = len(s) @@ -210,7 +203,7 @@ def nunpack(s, default=0): elif l == 2: return struct.unpack('>H', s)[0] elif l == 3: - return struct.unpack('>L', b'\x00'+s)[0] + return struct.unpack('>L', b'\x00' + s)[0] elif l == 4: return struct.unpack('>L', s)[0] elif l == 8: @@ -219,7 +212,6 @@ def nunpack(s, default=0): raise TypeError('invalid length: %d' % l) -# decode_text PDFDocEncoding = ''.join(six.unichr(x) for x in ( 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, @@ -264,7 +256,6 @@ def decode_text(s): return ''.join(PDFDocEncoding[c] for c in s) -# enc def enc(x, codec='ascii'): """Encodes a string for SGML/XML/HTML""" if six.PY3 and isinstance(x, bytes): @@ -284,6 +275,7 @@ def matrix2str(m): (a, b, c, d, e, f) = m return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f) + def vecBetweenBoxes(obj1, obj2): """A distance function between two TextBoxes. @@ -297,38 +289,37 @@ def vecBetweenBoxes(obj1, obj2): """ (x0, y0) = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0)) (x1, y1) = (max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1)) - (ow, oh) = (x1-x0, y1-y0) - (iw, ih) = (ow-obj1.width-obj2.width, oh-obj1.height-obj2.height) - if iw<0 and ih<0: + (ow, oh) = (x1 - x0, y1 - y0) + (iw, ih) = (ow - obj1.width - obj2.width, oh - obj1.height - obj2.height) + if iw < 0 and ih < 0: # if one is inside another we compute euclidean distance - (xc1, yc1) = ( (obj1.x0+obj1.x1)/2, (obj1.y0+obj1.y1)/2 ) - (xc2, yc2) = ( (obj2.x0+obj2.x1)/2, (obj2.y0+obj2.y1)/2 ) - return (xc1-xc2, yc1-yc2) + (xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2) + (xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2) + return xc1 - xc2, yc1 - yc2 else: - return (max(0, iw), max(0, ih)) + return max(0, iw), max(0, ih) + -## Plane -## -## A set-like data structure for objects placed on a plane. 
-## Can efficiently find objects in a certain rectangular area. -## It maintains two parallel lists of objects, each of -## which is sorted by its x or y coordinate. -## class Plane(object): + """A set-like data structure for objects placed on a plane. + + Can efficiently find objects in a certain rectangular area. + It maintains two parallel lists of objects, each of + which is sorted by its x or y coordinate. + """ def __init__(self, bbox, gridsize=50): - self._seq = [] # preserve the object order. + self._seq = [] # preserve the object order. self._objs = set() self._grid = {} self.gridsize = gridsize (self.x0, self.y0, self.x1, self.y1) = bbox - return def __repr__(self): - return ('' % list(self)) + return '' % list(self) def __iter__(self): - return ( obj for obj in self._seq if obj in self._objs ) + return (obj for obj in self._seq if obj in self._objs) def __len__(self): return len(self._objs) @@ -338,25 +329,22 @@ class Plane(object): def _getrange(self, bbox): (x0, y0, x1, y1) = bbox - if (x1 <= self.x0 or self.x1 <= x0 or - y1 <= self.y0 or self.y1 <= y0): return + if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0: + return x0 = max(self.x0, x0) y0 = max(self.y0, y0) x1 = min(self.x1, x1) y1 = min(self.y1, y1) - for y in drange(y0, y1, self.gridsize): - for x in drange(x0, x1, self.gridsize): - yield (x, y) - return + for grid_y in drange(y0, y1, self.gridsize): + for grid_x in drange(x0, x1, self.gridsize): + yield (grid_x, grid_y) - # extend(objs) def extend(self, objs): for obj in objs: self.add(obj) - return - # add(obj): place an object. def add(self, obj): + """place an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): if k not in self._grid: r = [] @@ -366,20 +354,18 @@ class Plane(object): r.append(obj) self._seq.append(obj) self._objs.add(obj) - return - # remove(obj): displace an object. 
def remove(self, obj): + """displace an object.""" for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): try: self._grid[k].remove(obj) except (KeyError, ValueError): pass self._objs.remove(obj) - return - # find(): finds objects that are in a certain area. def find(self, bbox): + """finds objects that are in a certain area.""" (x0, y0, x1, y1) = bbox done = set() for k in self._getrange(bbox): @@ -389,8 +375,6 @@ class Plane(object): if obj in done: continue done.add(obj) - if (obj.x1 <= x0 or x1 <= obj.x0 or - obj.y1 <= y0 or y1 <= obj.y0): + if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0: continue yield obj - return From 12bba5b5f74a024103c4d0733b8afda2785fe8e0 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Sun, 20 Oct 2019 12:41:31 +0300 Subject: [PATCH 29/34] Only define dependencies in setup.py (#306) Fixes #299. Closes #300. Changed: define dependencies in setup.py using install_requires and extra_requires. Added: section to CONTRIBUTE.md for initial dev setup. --- .travis.yml | 7 ++----- CONTRIBUTING.md | 19 ++++++++++++++++++- setup.py | 11 +++++++---- tox.ini | 9 ++------- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0ec2b91..7a04577 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,9 +5,6 @@ python: - "3.5" - "3.6" install: - - pip install six - - pip install pycryptodome - - pip install chardet - - pip install sortedcontainers + - pip install tox-travis script: - nosetests --nologcapture + - tox diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b2680c2..5b6676c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,4 +29,21 @@ Any contribution is appreciated! You might want to: * Code should work for Python 2.7 and Python 3.x (for now). * Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. * New features should be well documented using docstrings. -* Check spelling and grammar. 
\ No newline at end of file +* Check spelling and grammar. + +## Dev setup + +```sh +# Clone the repo +git clone https://github.com/pdfminer/pdfminer.six +cd pdfminer.six + +# Install dev dependencies +pip install -e .[dev] + +# Run tests on all Python versions +tox + +# Run tests on a single version +tox -e py36 +``` diff --git a/setup.py b/setup.py index 404c308..a13e4da 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,19 @@ from setuptools import setup -import sys import pdfminer as package -requires = ['six', 'pycryptodome', 'sortedcontainers', 'chardet ; python_version > "3.0"'] - setup( name='pdfminer.six', version=package.__version__, packages=['pdfminer'], package_data={'pdfminer': ['cmap/*.pickle.gz']}, - install_requires=requires, + install_requires=[ + 'chardet ; python_version > "3.0"', + 'pycryptodome', + 'six', + 'sortedcontainers', + ], + extras_require={"dev": ["nose", "tox"]}, description='PDF parser and analyzer', long_description=package.__doc__, license='MIT/X', diff --git a/tox.ini b/tox.ini index d0e167b..09c7f80 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,6 @@ [tox] -envlist = py26,py27,py34,py35,py36 +envlist = py{26, 27, 34, 35, 36} [testenv] +extras = dev commands = nosetests --nologcapture -deps = - six - pycryptodome - chardet - nose - sortedcontainers From dd7dc7b684aa2f18581b7dbeb8cd7adc5d9e952e Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 20 Oct 2019 12:32:11 +0200 Subject: [PATCH 30/34] Use keep-a-changlog format for CHANGELOG.md and add unreleased additions, fixes and changes. --- CHANGELOG.md | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53c94b5..0c52639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,30 @@ -# List of changes +# Changelog +All notable changes in pdfminer.six will be documented in this file. 
-## Version 20181108 - - PR #141 to speedup layout analysis - - PR #173 for using argparse and replace deprecated getopt - - PR #142 to compile pdfminer.six with cython, successfully \ No newline at end of file +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). + +## [Unreleased] + +### Added +- Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259)) +- Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283)) + +### Fixed +- Use `six.iteritems()` instead of `dict().iteritems()` to ensure Python2 and Python3 compatibility ([#274](https://github.com/pdfminer/pdfminer.six/pull/274)) +- Properly convert Adobe Glyph names to unicode characters ([#263](https://github.com/pdfminer/pdfminer.six/pull/263)) +- Allow CMap to be a content stream ([#283](https://github.com/pdfminer/pdfminer.six/pull/283)) +- Resolve indirect objects for width and bounding boxes for fonts ([#273](https://github.com/pdfminer/pdfminer.six/pull/273)) +- Actually updating stroke color in graphic state ([#298](https://github.com/pdfminer/pdfminer.six/pull/298)) +- Interpret (invalid) negative font descent as a positive descent ([#203](https://github.com/pdfminer/pdfminer.six/pull/203)) +- Correct colorspace comparision for images ([#132](https://github.com/pdfminer/pdfminer.six/pull/132)) +- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246)) + +### Changed +- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) + +## [20181108] - 2018-11-08 + +### Changed +- Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141)) +- Use argparse instead of replace deprecated getopt ([#173](https://github.com/pdfminer/pdfminer.six/pull/173)) +- 
Allow pdfminer.six to be compiled with cython ([#142](https://github.com/pdfminer/pdfminer.six/pull/142)) \ No newline at end of file From adc4726e064618ba692a9e3ed478bd51b65bedbe Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 20 Oct 2019 13:59:29 +0200 Subject: [PATCH 31/34] Add warning about dropping python2 support (#307) Fix #303 --- CHANGELOG.md | 3 +++ pdfminer/__init__.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c52639..2bb6e58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Deprecated +- Support for Python 2 is dropped at January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307)) + ### Added - Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259)) - Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283)) diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index 01ddba7..6914c22 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -10,7 +10,15 @@ It includes a PDF converter that can transform PDF files into other text formats (such as HTML). It has an extensible PDF parser that can be used for other purposes instead of text analysis. """ +import sys +import warnings + __version__ = '20181108' + +if sys.version_info < (3, 0): + warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. 
For ' + 'more information see https://github.com/pdfminer/pdfminer.six/issues/194') + if __name__ == '__main__': print(__version__) From 19c078f0d0d174b2980e806541c4665be7b000e2 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 20 Oct 2019 14:18:26 +0200 Subject: [PATCH 32/34] Update CONTRIBUTING.md --- CONTRIBUTING.md | 55 +++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5b6676c..8accc02 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,37 +13,52 @@ Any contribution is appreciated! You might want to: * Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the issue. -* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) -* Help others giving your thoughts on open issues and pull requests. +* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request). +* Help others by giving your thoughts on open issues and pull requests. -## General guidelines for creating issues and pull requests +## Guidelines for creating issues * Search previous issues, as yours might be a duplicate. * When creating a new issue for a bug, include a minimal reproducible example. * When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This will help others to see the importance of your feature request. -* Link pull request to a single issue. -* Pull requests should be merged to develop, not master. This ensures that master always equals the released verion. + +## Guideline for creating pull request + +* A pull request should close an existing issue. +* Pull requests should be merged to develop, not master. This ensures that master always equals the released version. 
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case of features, this will show that your code works correctly. -* Code should work for Python 2.7 and Python 3.x (for now). -* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. -* New features should be well documented using docstrings. +* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (with a line-width of 120) + and properly documented with docstrings. * Check spelling and grammar. +* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased]) -## Dev setup +## Getting started -```sh -# Clone the repo -git clone https://github.com/pdfminer/pdfminer.six -cd pdfminer.six +1. Clone the repository -# Install dev dependencies -pip install -e .[dev] + ```sh + git clone https://github.com/pdfminer/pdfminer.six + cd pdfminer.six + ``` -# Run tests on all Python versions -tox +2. Install dev dependencies -# Run tests on a single version -tox -e py36 -``` + ```sh + pip install -e .[dev] + ``` + +3. Run the tests + + On all Python versions: + + ```sh + tox + ``` + + Or on a single Python version: + + ```sh + tox -e py36 + ``` From 1d3fa415b51b5c379689d2b62d203fe73b64268d Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 20 Oct 2019 14:19:18 +0200 Subject: [PATCH 33/34] Remove licence from README.md because it is already in the LICENCE file --- README.md | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/README.md b/README.md index 488027c..e2e4cc8 100644 --- a/README.md +++ b/README.md @@ -85,32 +85,3 @@ Contributing ------------ Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). 
- - -Terms and Conditions --------------------- - -(This is so-called MIT/X License) - -Copyright (c) 2004-2014 Yusuke Shinyama - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR -PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From 694aa508c3541d2e9e4c290ba582e805bb9eac64 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 20 Oct 2019 14:21:48 +0200 Subject: [PATCH 34/34] Release 20191020 --- CHANGELOG.md | 4 ++++ pdfminer/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb6e58..dd4b3d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
## [Unreleased] +Nothing yet + +## [20191020] - 2019-10-20 + ### Deprecated - Support for Python 2 is dropped at January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307)) diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index 6914c22..181bf7a 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -13,7 +13,7 @@ other purposes instead of text analysis. import sys import warnings -__version__ = '20181108' +__version__ = '20191020' if sys.version_info < (3, 0):