From 19155d35c63bb48e1f9a0c5410eb72b06e1e1852 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Fri, 23 Sep 2016 14:11:53 +0200 Subject: [PATCH 1/6] remove lf rule --- .gitattributes | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index d9bd16b..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.py text eol=lf From 1820f964818f4e195bff660d953c9dd93d40a735 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Fri, 23 Sep 2016 14:31:31 +0200 Subject: [PATCH 2/6] backport changes for upstream: #145, #95, #111, #117, #129, #132. --- pdfminer/glyphlist.py | 2 +- pdfminer/latin_enc.py | 1 + pdfminer/pdftypes.py | 2 +- pdfminer/utils.py | 7 ++++--- setup.py | 10 +++++++--- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py index b4b449c..10e1008 100644 --- a/pdfminer/glyphlist.py +++ b/pdfminer/glyphlist.py @@ -7,7 +7,7 @@ Unicode characters instead of using decimal/hex character code. The following data was taken by - $ wget http://www.adobe.com/devnet/opentype/archives/glyphlist.txt + $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt $ python tools/conv_glyphlist.py glyphlist.txt > glyphlist.py """ diff --git a/pdfminer/latin_enc.py b/pdfminer/latin_enc.py index 41d219c..bb0c1eb 100644 --- a/pdfminer/latin_enc.py +++ b/pdfminer/latin_enc.py @@ -162,6 +162,7 @@ ENCODING = [ ('mu', None, 181, 181, 181), ('multiply', None, None, 215, 215), ('n', 110, 110, 110, 110), + ('nbspace', None, 202, 160, None), ('nine', 57, 57, 57, 57), ('ntilde', None, 150, 241, 241), ('numbersign', 35, 35, 35, 35), diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index caad157..54acfaa 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -229,7 +229,7 @@ class PDFStream(PDFObject): if not isinstance(filters, list): filters = [filters] if not isinstance(params, list): - params = [params] + params = [params] * len(filters) return zip(filters, params) def decode(self): diff --git a/pdfminer/utils.py b/pdfminer/utils.py index e5bd6bf..e2638d4 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -46,8 +46,9 @@ def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore' def apply_png_predictor(pred, colors, columns, bitspercomponent, data): if bitspercomponent != 8: # unsupported - raise ValueError(bitspercomponent) - nbytes = colors*columns*bitspercomponent//8 + raise ValueError("Unsupported `bitspercomponent': %d" % + bitspercomponent) + nbytes = colors * columns * bitspercomponent // 8 i = 0 buf = b'' line0 = b'\x00' * columns @@ -86,7 +87,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): line2 += six.int2byte(c) else: # unsupported - raise ValueError(ft) + raise ValueError("Unsupported predictor value: %d" % ft) buf += line2 line0 = line2 return buf diff --git a/setup.py b/setup.py index 7f465a5..2859607 100644 --- a/setup.py +++ b/setup.py @@ -4,15 +4,19 @@ from setuptools import setup from pdfminer import __version__ import sys +requires = ['six', 'pycrypto'] +if sys.version_info >= (3, 0): + requires.append('chardet') + setup( name='pdfminer.six', version=__version__, - packages=['pdfminer',], + packages=['pdfminer'], package_data={'pdfminer': ['cmap/*.pickle.gz']}, - install_requires=['six', 'chardet'] if sys.version_info >= (3, 0) else ['six'], + install_requires=requires, description='PDF parser and analyzer', long_description='''fork of PDFMiner using six for Python 2+3 compatibility - + PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. PDFMiner allows to obtain From 0cb13983f7975e1bbca42778882f3eaaca3420e1 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Fri, 23 Sep 2016 14:57:28 +0200 Subject: [PATCH 3/6] Backport LICENSE. --- LICENSE | 22 ++++++++++++++++++++++ MANIFEST.in | 1 + pdfminer/pdftypes.py | 4 ++++ 3 files changed, 27 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3940067 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2004-2016 Yusuke Shinyama + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index 42ea9e7..d996938 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include Makefile +include LICENSE include *.md include *.py graft docs diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 54acfaa..d95efef 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -9,6 +9,7 @@ from .ccitt import ccittfaxdecode from .psparser import PSException from .psparser import PSObject from .psparser import LIT +from .psparser import STRICT from . import settings from .utils import apply_png_predictor from .utils import isnumber @@ -229,7 +230,10 @@ class PDFStream(PDFObject): if not isinstance(filters, list): filters = [filters] if not isinstance(params, list): + # Make sure the parameters list is the same as filters. params = [params] * len(filters) + if STRICT and len(params) != len(filters): + raise PDFException("Parameters len filter mismatch") return zip(filters, params) def decode(self): From 865246bd0ce8319ca39fa9237931fae116718407 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Fri, 23 Sep 2016 15:04:07 +0200 Subject: [PATCH 4/6] fix print, upstream: 01121124587d99601cf3368e9f82f096a9e5a98f --- pdfminer/psparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index af9c189..be435bd 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -138,7 +138,7 @@ KEYWORD_DICT_END = KWD(b'>>') def literal_name(x): if not isinstance(x, PSLiteral): if settings.STRICT: - raise PSTypeError('Literal required: %r' % x) + raise PSTypeError('Literal required: %r' % (x,)) else: name=x else: From 70918095cc8ddf4593963c62ec1e6e875ba1637a Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 24 Sep 2016 11:57:11 +0200 Subject: [PATCH 5/6] Return an empty list when no `Differences` are found. --- pdfminer/pdffont.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index c6f18a5..d83888c 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -543,7 +543,7 @@ class PDFSimpleFont(PDFFont): encoding = LITERAL_STANDARD_ENCODING if isinstance(encoding, dict): name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) - diff = list_value(encoding.get('Differences', None)) + diff = list_value(encoding.get('Differences', [])) self.cid2unicode = EncodingDB.get_encoding(name, diff) else: self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) From 447adcf02f238ff09ebfdbf7c55530895ce77ed7 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Sat, 24 Sep 2016 12:03:22 +0200 Subject: [PATCH 6/6] fix STRICT reference --- pdfminer/pdftypes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index d95efef..0d74325 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -9,7 +9,6 @@ from .ccitt import ccittfaxdecode from .psparser import PSException from .psparser import PSObject from .psparser import LIT -from .psparser import STRICT from . import settings from .utils import apply_png_predictor from .utils import isnumber @@ -232,7 +231,7 @@ class PDFStream(PDFObject): if not isinstance(params, list): # Make sure the parameters list is the same as filters. params = [params] * len(filters) - if STRICT and len(params) != len(filters): + if settings.STRICT and len(params) != len(filters): raise PDFException("Parameters len filter mismatch") return zip(filters, params)