diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index d9bd16b..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.py text eol=lf diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3940067 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2004-2016 Yusuke Shinyama + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index 42ea9e7..d996938 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include Makefile +include LICENSE include *.md include *.py graft docs diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py index b4b449c..10e1008 100644 --- a/pdfminer/glyphlist.py +++ b/pdfminer/glyphlist.py @@ -7,7 +7,7 @@ Unicode characters instead of using decimal/hex character code. 
The following data was taken by - $ wget http://www.adobe.com/devnet/opentype/archives/glyphlist.txt + $ wget https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt $ python tools/conv_glyphlist.py glyphlist.txt > glyphlist.py """ diff --git a/pdfminer/latin_enc.py b/pdfminer/latin_enc.py index 41d219c..bb0c1eb 100644 --- a/pdfminer/latin_enc.py +++ b/pdfminer/latin_enc.py @@ -162,6 +162,7 @@ ENCODING = [ ('mu', None, 181, 181, 181), ('multiply', None, None, 215, 215), ('n', 110, 110, 110, 110), + ('nbspace', None, 202, 160, None), ('nine', 57, 57, 57, 57), ('ntilde', None, 150, 241, 241), ('numbersign', 35, 35, 35, 35), diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index c6f18a5..d83888c 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -543,7 +543,7 @@ class PDFSimpleFont(PDFFont): encoding = LITERAL_STANDARD_ENCODING if isinstance(encoding, dict): name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) - diff = list_value(encoding.get('Differences', None)) + diff = list_value(encoding.get('Differences', [])) self.cid2unicode = EncodingDB.get_encoding(name, diff) else: self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index caad157..0d74325 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -229,7 +229,10 @@ class PDFStream(PDFObject): if not isinstance(filters, list): filters = [filters] if not isinstance(params, list): - params = [params] + # Make sure the parameters list is the same length as filters. 
+ params = [params] * len(filters) + if settings.STRICT and len(params) != len(filters): + raise PDFException("Parameters len filter mismatch") return zip(filters, params) def decode(self): diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index af9c189..be435bd 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -138,7 +138,7 @@ KEYWORD_DICT_END = KWD(b'>>') def literal_name(x): if not isinstance(x, PSLiteral): if settings.STRICT: - raise PSTypeError('Literal required: %r' % x) + raise PSTypeError('Literal required: %r' % (x,)) else: name=x else: diff --git a/pdfminer/utils.py b/pdfminer/utils.py index e5bd6bf..e2638d4 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -46,8 +46,9 @@ def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore' def apply_png_predictor(pred, colors, columns, bitspercomponent, data): if bitspercomponent != 8: # unsupported - raise ValueError(bitspercomponent) - nbytes = colors*columns*bitspercomponent//8 + raise ValueError("Unsupported `bitspercomponent': %d" % + bitspercomponent) + nbytes = colors * columns * bitspercomponent // 8 i = 0 buf = b'' line0 = b'\x00' * columns @@ -86,7 +87,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): line2 += six.int2byte(c) else: # unsupported - raise ValueError(ft) + raise ValueError("Unsupported predictor value: %d" % ft) buf += line2 line0 = line2 return buf diff --git a/setup.py b/setup.py index 7f465a5..2859607 100644 --- a/setup.py +++ b/setup.py @@ -4,15 +4,19 @@ from setuptools import setup from pdfminer import __version__ import sys +requires = ['six', 'pycrypto'] +if sys.version_info >= (3, 0): + requires.append('chardet') + setup( name='pdfminer.six', version=__version__, - packages=['pdfminer',], + packages=['pdfminer'], package_data={'pdfminer': ['cmap/*.pickle.gz']}, - install_requires=['six', 'chardet'] if sys.version_info >= (3, 0) else ['six'], + install_requires=requires, description='PDF parser and 
analyzer', long_description='''fork of PDFMiner using six for Python 2+3 compatibility - + PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. PDFMiner allows to obtain