From b84cfc98e0e48f8dbc5099a6aaa71a6fc7e52895 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Wed, 2 Feb 2022 22:24:32 +0100 Subject: [PATCH] Update development tools: travis ci to github actions, tox to nox, nose to pytest (#704) * Replace tox with nox * Replace travis with github actions * Fix pytest, mypy and flake8 errors * Add pytest. * Run on all commits * Remove nose * Speedup slow tests to save GitHub actions minutes * Added line to CHANGELOG.md * Fix line too long in pdfdocument.py * Update .github/workflows/actions.yml Co-authored-by: Jake Stockwin * Improve actions.yml * Fix error with nox name for mypy * Add names for jobs * Replace nose.raises with pytest.raises Co-authored-by: Jake Stockwin --- .github/workflows/actions.yml | 94 +++++++++++++++ .gitignore | 1 + .travis.yml | 11 -- CHANGELOG.md | 3 + CONTRIBUTING.md | 4 +- Makefile | 3 - mypy.ini | 5 +- noxfile.py | 58 ++++++++++ pdfminer/data_structures.py | 14 +-- pdfminer/image.py | 5 +- pdfminer/pdfdocument.py | 10 +- pdfminer/pdftypes.py | 2 +- setup.py | 6 +- tests/test_converter.py | 196 +++++++++++++++++--------------- tests/test_encodingdb.py | 17 ++- tests/test_pdfdocument.py | 19 ++-- tests/test_pdfencoding.py | 8 +- tests/test_pdffont.py | 6 +- tests/test_pdfminer_ccitt.py | 86 +++++++------- tests/test_pdfminer_crypto.py | 36 +++--- tests/test_pdfminer_psparser.py | 6 +- tests/test_pdfpage.py | 6 +- tests/test_tools_dumppdf.py | 13 ++- tests/test_tools_pdf2txt.py | 14 +-- tests/test_utils.py | 86 +++++++------- tools/pdf2txt.py | 5 +- tox.ini | 33 ------ 27 files changed, 435 insertions(+), 312 deletions(-) create mode 100644 .github/workflows/actions.yml delete mode 100644 .travis.yml create mode 100644 noxfile.py delete mode 100644 tox.ini diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml new file mode 100644 index 0000000..78a011e --- /dev/null +++ b/.github/workflows/actions.yml @@ -0,0 +1,94 @@ +name: Continuous integration + +on: + push: + +env: + default-python: "3.10" + +jobs: + + check-coding-style: + name: Check coding style + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python ${{ env.default-python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.default-python }} + - name: Upgrade pip, Install nox + run: | + python -m pip install --upgrade pip + python -m pip install nox + - name: Check coding style + run: | + nox --error-on-missing-interpreters --non-interactive --session lint + + check-static-types: + name: Check static types + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python ${{ env.default-python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.default-python }} + - name: Upgrade pip, Install nox + run: | + python -m pip install --upgrade pip + python -m pip install nox + - name: Check static types + run: | + nox --error-on-missing-interpreters --non-interactive --session types + + tests: + name: Run tests + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10" ] + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Determine pip cache directory + id: pip-cache + run: | + echo "::set-output name=dir::$(pip cache dir)" + - name: Cache pip cache + uses: actions/cache@v2 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip${{ matrix.python-version }} + - name: Upgrade pip and install nox + run: | + python -m pip install --upgrade pip + python -m pip install nox + - name: Run tests + run: | + nox --non-interactive --session tests-${{ matrix.python-version }} + + build-docs: + name: Test building docs + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python ${{ env.default-python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.default-python }} + - name: Upgrade pip and install nox + run: | + python -m pip install --upgrade pip + python -m pip install nox + - name: Build docs + run: | + nox --error-on-missing-interpreters --non-interactive --session docs \ No newline at end of file diff --git a/.gitignore b/.gitignore index 56f96b3..b155fbb 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ tests/*.xml tests/*.txt .idea/ .tox/ +.nox/ # python venv management tools Pipfile diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 34cf5b2..0000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -dist: focal -language: python -python: - - "3.6" - - "3.7" - - "3.8" - - "3.9" -install: - - pip install tox tox-travis -script: - - tox -r diff --git a/CHANGELOG.md b/CHANGELOG.md index 649b4ba..62b6684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Removed - Unnecessary return statements without argument at the end of functions ([#707](https://github.com/pdfminer/pdfminer.six/pull/707)) +### Changed +- Switched from nose to pytest, from tox to nox and from Travis CI to GitHub Actions ([#704](https://github.com/pdfminer/pdfminer.six/pull/704)) + ## [20211012] ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f43c14b..dfaa8dd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -60,11 +60,11 @@ Any contribution is appreciated! You might want to: On all Python versions: ```sh - tox + nox ``` Or on a single Python version: ```sh - tox -e py36 + nox -e py36 ``` diff --git a/Makefile b/Makefile index 05f6c41..a4da1af 100644 --- a/Makefile +++ b/Makefile @@ -53,6 +53,3 @@ $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST) $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST) $(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \ $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt - -test: cmap - nosetests diff --git a/mypy.ini b/mypy.ini index eaddd86..ee71111 100644 --- a/mypy.ini +++ b/mypy.ini @@ -20,8 +20,11 @@ disallow_untyped_defs = True [mypy-cryptography.hazmat.*] ignore_missing_imports = True -[mypy-nose.*] +[mypy-pytest.*] ignore_missing_imports = True [mypy-setuptools] ignore_missing_imports = True + +[mypy-nox] +ignore_missing_imports = True \ No newline at end of file diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 0000000..03a38f4 --- /dev/null +++ b/noxfile.py @@ -0,0 +1,58 @@ +import nox + + +PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] + + +@nox.session +def lint(session): + session.install('flake8') + session.run( + 'flake8', + 'pdfminer/', + 'tools/', + 'tests/', + '--count', + '--statistics' + ) + + +@nox.session +def types(session): + session.install('mypy') + session.run( + 'mypy', + '--install-types', + '--non-interactive', + '--show-error-codes', + '.' + ) + + +@nox.session(python=PYTHON_ALL_VERSIONS) +def tests(session): + session.install("-e", ".[dev]") + session.run('pytest') + + +@nox.session +def docs(session): + session.install("-e", ".[docs]") + session.run( + 'python', + '-m', + 'sphinx', + '-b', + 'html', + 'docs/source', + 'docs/build/html' + ) + session.run( + 'python', + '-m', + 'sphinx', + '-b', + 'doctest', + 'docs/source', + 'docs/build/doctest' + ) diff --git a/pdfminer/data_structures.py b/pdfminer/data_structures.py index 5239a38..a372dfb 100644 --- a/pdfminer/data_structures.py +++ b/pdfminer/data_structures.py @@ -1,5 +1,4 @@ -import functools -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Iterable, List, Optional, Tuple from pdfminer import settings from pdfminer.pdfparser import PDFSyntaxError @@ -26,21 +25,20 @@ class NumberTree: self.limits = list_value(self._obj['Limits']) def _parse(self) -> List[Tuple[int, Any]]: - l = [] + items = [] if self.nums: # Leaf node for k, v in choplist(2, self.nums): - l.append((int_value(k), v)) + items.append((int_value(k), v)) if self.kids: # Root or intermediate node for child_ref in self.kids: - l += NumberTree(child_ref)._parse() + items += NumberTree(child_ref)._parse() - return l + return items values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy - @property # type: ignore [no-redef,misc] - @functools.lru_cache + @property # type: ignore[no-redef,misc] def values(self) -> List[Tuple[int, Any]]: values = self._parse() diff --git a/pdfminer/image.py b/pdfminer/image.py index 5c942fb..d537d7c 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -2,7 +2,7 @@ import os import os.path import struct from io import BytesIO -from typing import BinaryIO, Tuple +from typing import BinaryIO, Tuple, List, Any from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter from .layout import LTImage @@ -104,6 +104,7 @@ class ImageWriter: # seems to be easily opened by other programs from PIL import Image raw_data = image.stream.get_rawdata() + assert raw_data is not None ifp = BytesIO(raw_data) i = Image.open(ifp) i.save(fp, 'JPEG2000') @@ -162,7 +163,7 @@ class ImageWriter: return is_jbig2 @staticmethod - def jbig2_global(image): + def jbig2_global(image: LTImage) -> List[Any]: global_streams = [] filters = image.stream.get_filters() for filter_name, params in filters: diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 1968569..4da9927 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -13,9 +13,9 @@ from . import settings from .arcfour import Arcfour from .data_structures import NumberTree from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser -from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \ - PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \ - uint_value, dict_value, stream_value +from .pdftypes import DecipherCallable, PDFException, PDFTypeError, \ + PDFStream, PDFObjectNotFound, decipher_all, int_value, str_value, \ + list_value, uint_value, dict_value, stream_value from .psparser import PSEOF, literal_name, LIT, KWD from .utils import choplist, decode_text, nunpack, format_int_roman, \ format_int_alpha @@ -51,6 +51,10 @@ class PDFEncryptionError(PDFException): pass +class PDFPasswordIncorrect(PDFEncryptionError): + pass + + class PDFEncryptionWarning(UserWarning): """Legacy warning for failed decryption. diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 037f2ce..e10af5b 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -217,7 +217,7 @@ def stream_value(x: object) -> "PDFStream": return x -def decompress_corrupted(data): +def decompress_corrupted(data: bytes) -> bytes: """Called on some data that can't be properly decoded because of CRC checksum error. Attempt to decode it skipping the CRC. """ diff --git a/setup.py b/setup.py index bc7f074..66ca485 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,10 @@ +import sys +from pathlib import Path + from setuptools import setup from os import path +sys.path.append(str(Path(__file__).parent)) import pdfminer as package @@ -17,7 +21,7 @@ setup( 'cryptography', ], extras_require={ - "dev": ["nose", "tox", "mypy == 0.910"], + "dev": ["pytest", "nox", "mypy == 0.931"], "docs": ["sphinx", "sphinx-argparse"], }, description='PDF parser and analyzer', diff --git a/tests/test_converter.py b/tests/test_converter.py index 8781fa2..da2496f 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -1,28 +1,26 @@ import io from tempfile import TemporaryFile -from nose.tools import assert_equal, assert_false, assert_true - from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter from pdfminer.high_level import extract_pages from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve from pdfminer.pdfinterp import PDFGraphicState -class TestPaintPath(): +class TestPaintPath: def test_paint_path(self): path = [('m', 6, 7), ('l', 7, 7)] analyzer = self._get_analyzer() analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.paint_path(PDFGraphicState(), False, False, False, path) - assert_equal(len(analyzer.cur_item._objs), 1) + assert len(analyzer.cur_item._objs) == 1 def test_paint_path_mlllh(self): - path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)] + path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)] analyzer = self._get_analyzer() analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.paint_path(PDFGraphicState(), False, False, False, path) - assert_equal(len(analyzer.cur_item), 1) + assert len(analyzer.cur_item) == 1 def test_paint_path_multiple_mlllh(self): """Path from samples/contrib/issue-00369-excel.pdf""" @@ -34,7 +32,7 @@ class TestPaintPath(): analyzer = self._get_analyzer() analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.paint_path(PDFGraphicState(), False, False, False, path) - assert_equal(len(analyzer.cur_item._objs), 3) + assert len(analyzer.cur_item._objs) == 3 def test_paint_path_quadrilaterals(self): """via https://github.com/pdfminer/pdfminer.six/issues/473""" @@ -49,98 +47,114 @@ class TestPaintPath(): return list(map(type, parse(path))) # Standard rect - assert_equal(get_types([ - ("m", 10, 90), - ("l", 90, 90), - ("l", 90, 10), - ("l", 10, 10), - ("h",), - ]), [LTRect]) + assert get_types( + [ + ("m", 10, 90), + ("l", 90, 90), + ("l", 90, 10), + ("l", 10, 10), + ("h",), + ] + ) == [LTRect] # Same but mllll variation - assert_equal(get_types([ - ("m", 10, 90), - ("l", 90, 90), - ("l", 90, 10), - ("l", 10, 10), - ("l", 10, 90), - ]), [LTRect]) + assert get_types( + [ + ("m", 10, 90), + ("l", 90, 90), + ("l", 90, 10), + ("l", 10, 10), + ("l", 10, 90), + ] + ) == [LTRect] # Bowtie shape - assert_equal(get_types([ - ("m", 110, 90), - ("l", 190, 10), - ("l", 190, 90), - ("l", 110, 10), - ("h",), - ]), [LTCurve]) + assert get_types( + [ + ("m", 110, 90), + ("l", 190, 10), + ("l", 190, 90), + ("l", 110, 10), + ("h",), + ] + ) == [LTCurve] # Quadrilateral with one slanted side - assert_equal(get_types([ - ("m", 210, 90), - ("l", 290, 60), - ("l", 290, 10), - ("l", 210, 10), - ("h",), - ]), [LTCurve]) + assert get_types( + [ + ("m", 210, 90), + ("l", 290, 60), + ("l", 290, 10), + ("l", 210, 10), + ("h",), + ] + ) == [LTCurve] # Path with two rect subpaths - assert_equal(get_types([ - ("m", 310, 90), - ("l", 350, 90), - ("l", 350, 10), - ("l", 310, 10), - ("h",), - ("m", 350, 90), - ("l", 390, 90), - ("l", 390, 10), - ("l", 350, 10), - ("h",), - ]), [LTRect, LTRect]) + assert get_types( + [ + ("m", 310, 90), + ("l", 350, 90), + ("l", 350, 10), + ("l", 310, 10), + ("h",), + ("m", 350, 90), + ("l", 390, 90), + ("l", 390, 10), + ("l", 350, 10), + ("h",), + ] + ) == [LTRect, LTRect] # Path with one rect subpath and one pentagon - assert_equal(get_types([ - ("m", 410, 90), - ("l", 445, 90), - ("l", 445, 10), - ("l", 410, 10), - ("h",), - ("m", 455, 70), - ("l", 475, 90), - ("l", 490, 70), - ("l", 490, 10), - ("l", 455, 10), - ("h",), - ]), [LTRect, LTCurve]) + assert get_types( + [ + ("m", 410, 90), + ("l", 445, 90), + ("l", 445, 10), + ("l", 410, 10), + ("h",), + ("m", 455, 70), + ("l", 475, 90), + ("l", 490, 70), + ("l", 490, 10), + ("l", 455, 10), + ("h",), + ] + ) == [LTRect, LTCurve] # Three types of simple lines - assert_equal(get_types([ - # Vertical line - ("m", 10, 30), - ("l", 10, 40), - ("h",), - # Horizontal line - ("m", 10, 50), - ("l", 70, 50), - ("h",), - # Diagonal line - ("m", 10, 10), - ("l", 30, 30), - ("h",), - ]), [LTLine, LTLine, LTLine]) + assert get_types( + [ + # Vertical line + ("m", 10, 30), + ("l", 10, 40), + ("h",), + # Horizontal line + ("m", 10, 50), + ("l", 70, 50), + ("h",), + # Diagonal line + ("m", 10, 10), + ("l", 30, 30), + ("h",), + ] + ) == [LTLine, LTLine, LTLine] # Same as above, but 'ml' variation - assert_equal(get_types([ - # Vertical line - ("m", 10, 30), - ("l", 10, 40), - # Horizontal line - ("m", 10, 50), - ("l", 70, 50), - # Diagonal line - ("m", 10, 10), - ("l", 30, 30), - ]), [LTLine, LTLine, LTLine]) + assert get_types( + [ + # Vertical line + ("m", 10, 30), + ("l", 10, 40), + # Horizontal line + ("m", 10, 50), + ("l", 70, 50), + # Diagonal line + ("m", 10, 10), + ("l", 30, 30), + ] + ) == [LTLine, LTLine, LTLine] # There are six lines in this one-page PDF; # they all have shape 'ml' not 'mlh' @@ -192,21 +206,21 @@ class TestPaintPath(): class TestBinaryDetector(): def test_stringio(self): - assert_false(PDFConverter._is_binary_stream(io.StringIO())) + assert not PDFConverter._is_binary_stream(io.StringIO()) def test_bytesio(self): - assert_true(PDFConverter._is_binary_stream(io.BytesIO())) + assert PDFConverter._is_binary_stream(io.BytesIO()) def test_tmpfile(self): with TemporaryFile(mode='w') as f: - assert_false(PDFConverter._is_binary_stream(f)) + assert not PDFConverter._is_binary_stream(f) def test_binary_tmpfile(self): with TemporaryFile(mode='wb') as f: - assert_true(PDFConverter._is_binary_stream(f)) + assert PDFConverter._is_binary_stream(f) def test_non_file_like_object_defaults_to_binary(self): - assert_true(PDFConverter._is_binary_stream(object())) + assert PDFConverter._is_binary_stream(object()) def test_textiowrapper(self): - assert_false(PDFConverter._is_binary_stream(io.TextIOBase())) + assert not PDFConverter._is_binary_stream(io.TextIOBase()) diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py index bd71d78..455d437 100644 --- a/tests/test_encodingdb.py +++ b/tests/test_encodingdb.py @@ -4,7 +4,7 @@ See: https://github.com/adobe-type-tools/agl-specification#2-the-mapping While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are added. """ -from nose.tools import assert_raises +import pytest from pdfminer.encodingdb import name2unicode, EncodingDB from pdfminer.psparser import PSLiteral @@ -59,7 +59,8 @@ def test_name2unicode_uni_empty_string_long(): This character can be correctly mapped by using the glyph name "u1040C. """ - assert_raises(KeyError, name2unicode, 'uniD801DC0C') + with pytest.raises(KeyError): + name2unicode('uniD801DC0C') def test_name2unicode_uni_empty_string_long_lowercase(): @@ -71,7 +72,8 @@ def test_name2unicode_uni_empty_string_long_lowercase(): expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the glyph name "u1040C.""" - assert_raises(KeyError, name2unicode, 'uniD801DC0C') + with pytest.raises(KeyError): + name2unicode('uniD801DC0C') def test_name2unicode_uni_pua(): @@ -128,13 +130,15 @@ def test_name2unicode_foo(): """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'""" - assert_raises(KeyError, name2unicode, 'foo') + with pytest.raises(KeyError): + name2unicode('foo') def test_name2unicode_notdef(): """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)""" - assert_raises(KeyError, name2unicode, '.notdef') + with pytest.raises(KeyError): + name2unicode('.notdef') def test_name2unicode_pua_ogoneksmall(): @@ -145,7 +149,8 @@ def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_overflow_error(): - assert_raises(KeyError, name2unicode, '226215240241240240240240') + with pytest.raises(KeyError): + name2unicode('226215240241240240240240') def test_get_encoding_with_invalid_differences(): diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index d90abc0..8530b0b 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -1,6 +1,6 @@ import itertools -from nose.tools import assert_equal, raises +import pytest from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels @@ -10,12 +10,12 @@ from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value class TestPdfDocument(object): - @raises(PDFObjectNotFound) def test_get_zero_objid_raises_pdfobjectnotfound(self): with open(absolute_sample_path('simple1.pdf'), 'rb') as in_file: parser = PDFParser(in_file) doc = PDFDocument(parser) - doc.getobj(0) + with pytest.raises(PDFObjectNotFound): + doc.getobj(0) def test_encrypted_no_id(self): # Some documents may be encrypted but not have an /ID key in @@ -25,8 +25,7 @@ class TestPdfDocument(object): with open(path, 'rb') as fp: parser = PDFParser(fp) doc = PDFDocument(parser) - assert_equal(doc.info, - [{'Producer': b'European Patent Office'}]) + assert doc.info == [{'Producer': b'European Patent Office'}] def test_page_labels(self): path = absolute_sample_path('contrib/pagelabels.pdf') @@ -34,14 +33,14 @@ class TestPdfDocument(object): parser = PDFParser(fp) doc = PDFDocument(parser) total_pages = int_value(dict_value(doc.catalog['Pages'])['Count']) - assert_equal( - list(itertools.islice(doc.get_page_labels(), total_pages)), - ['iii', 'iv', '1', '2', '1']) + assert list(itertools.islice(doc.get_page_labels(), total_pages)) \ + == ['iii', 'iv', '1', '2', '1'] - @raises(PDFNoPageLabels) def test_no_page_labels(self): path = absolute_sample_path('simple1.pdf') with open(path, 'rb') as fp: parser = PDFParser(fp) doc = PDFDocument(parser) - doc.get_page_labels() + + with pytest.raises(PDFNoPageLabels): + doc.get_page_labels() diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 62b5790..bbbe887 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -2,15 +2,13 @@ # -*- coding: utf-8 -*- -import nose - from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte from pdfminer.pdffont import PDFCIDFont from pdfminer.pdftypes import PDFStream from pdfminer.psparser import PSLiteral -class TestPDFEncoding(): +class TestPDFEncoding: def test_cmapname_onebyteidentityV(self): stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') @@ -107,7 +105,3 @@ class TestPDFEncoding(): def test_font_without_spec(self): font = PDFCIDFont(None, {}) assert isinstance(font.cmap, CMap) - - -if __name__ == '__main__': - nose.runmodule() diff --git a/tests/test_pdffont.py b/tests/test_pdffont.py index 4044afd..e880b49 100644 --- a/tests/test_pdffont.py +++ b/tests/test_pdffont.py @@ -1,5 +1,3 @@ -from nose.tools import assert_equal, assert_greater - from pdfminer.pdffont import PDFCIDFont from pdfminer.pdfinterp import PDFResourceManager from pdfminer.psparser import PSLiteral @@ -17,5 +15,5 @@ def test_get_cmap_from_pickle(): cmap = font.get_cmap_from_spec(spec, False) - assert_equal(cmap.attrs.get('CMapName'), cmap_name) - assert_greater(len(cmap.code2cid), 0) + assert cmap.attrs.get('CMapName') == cmap_name + assert len(cmap.code2cid) > 0 diff --git a/tests/test_pdfminer_ccitt.py b/tests/test_pdfminer_ccitt.py index 412bbae..785ad88 100644 --- a/tests/test_pdfminer_ccitt.py +++ b/tests/test_pdfminer_ccitt.py @@ -1,5 +1,3 @@ -from nose.tools import assert_equal - from pdfminer.ccitt import CCITTG4Parser, CCITTFaxDecoder @@ -13,98 +11,98 @@ class TestCCITTG4Parser(): def test_b1(self): parser = self.get_parser('00000') parser._do_vertical(0) - assert_equal(parser._curpos, 0) + assert parser._curpos == 0 return def test_b2(self): parser = self.get_parser('10000') parser._do_vertical(-1) - assert_equal(parser._curpos, 0) + assert parser._curpos == 0 return def test_b3(self): parser = self.get_parser('000111') parser._do_pass() - assert_equal(parser._curpos, 3) - assert_equal(parser._get_bits(), '111') + assert parser._curpos == 3 + assert parser._get_bits() == '111' return def test_b4(self): parser = self.get_parser('00000') parser._do_vertical(+2) - assert_equal(parser._curpos, 2) - assert_equal(parser._get_bits(), '11') + assert parser._curpos == 2 + assert parser._get_bits() == '11' return def test_b5(self): parser = self.get_parser('11111111100') parser._do_horizontal(0, 3) - assert_equal(parser._curpos, 3) + assert parser._curpos == 3 parser._do_vertical(1) - assert_equal(parser._curpos, 10) - assert_equal(parser._get_bits(), '0001111111') + assert parser._curpos == 10 + assert parser._get_bits() == '0001111111' return def test_e1(self): parser = self.get_parser('10000') parser._do_vertical(0) - assert_equal(parser._curpos, 1) + assert parser._curpos == 1 parser._do_vertical(0) - assert_equal(parser._curpos, 5) - assert_equal(parser._get_bits(), '10000') + assert parser._curpos == 5 + assert parser._get_bits() == '10000' return def test_e2(self): parser = self.get_parser('10011') parser._do_vertical(0) - assert_equal(parser._curpos, 1) + assert parser._curpos == 1 parser._do_vertical(2) - assert_equal(parser._curpos, 5) - assert_equal(parser._get_bits(), '10000') + assert parser._curpos == 5 + assert parser._get_bits() == '10000' return def test_e3(self): parser = self.get_parser('011111') parser._color = 0 parser._do_vertical(0) - assert_equal(parser._color, 1) - assert_equal(parser._curpos, 1) + assert parser._color == 1 + assert parser._curpos == 1 parser._do_vertical(-2) - assert_equal(parser._color, 0) - assert_equal(parser._curpos, 4) + assert parser._color == 0 + assert parser._curpos == 4 parser._do_vertical(0) - assert_equal(parser._curpos, 6) - assert_equal(parser._get_bits(), '011100') + assert parser._curpos == 6 + assert parser._get_bits() == '011100' return def test_e4(self): parser = self.get_parser('10000') parser._do_vertical(0) - assert_equal(parser._curpos, 1) + assert parser._curpos == 1 parser._do_vertical(-2) - assert_equal(parser._curpos, 3) + assert parser._curpos == 3 parser._do_vertical(0) - assert_equal(parser._curpos, 5) - assert_equal(parser._get_bits(), '10011') + assert parser._curpos == 5 + assert parser._get_bits() == '10011' return def test_e5(self): parser = self.get_parser('011000') parser._color = 0 parser._do_vertical(0) - assert_equal(parser._curpos, 1) + assert parser._curpos == 1 parser._do_vertical(3) - assert_equal(parser._curpos, 6) - assert_equal(parser._get_bits(), '011111') + assert parser._curpos == 6 + assert parser._get_bits() == '011111' return def test_e6(self): parser = self.get_parser('11001') parser._do_pass() - assert_equal(parser._curpos, 4) + assert parser._curpos == 4 parser._do_vertical(0) - assert_equal(parser._curpos, 5) - assert_equal(parser._get_bits(), '11111') + assert parser._curpos == 5 + assert parser._get_bits() == '11111' return def test_e7(self): @@ -112,8 +110,8 @@ class TestCCITTG4Parser(): parser._curpos = 2 parser._color = 1 parser._do_horizontal(2, 6) - assert_equal(parser._curpos, 10) - assert_equal(parser._get_bits(), '1111000000') + assert parser._curpos == 10 + assert parser._get_bits() == '1111000000' return def test_e8(self): @@ -121,19 +119,19 @@ class TestCCITTG4Parser(): parser._curpos = 1 parser._color = 0 parser._do_vertical(0) - assert_equal(parser._curpos, 2) + assert parser._curpos == 2 parser._do_horizontal(7, 0) - assert_equal(parser._curpos, 9) - assert_equal(parser._get_bits(), '101111111') + assert parser._curpos == 9 + assert parser._get_bits() == '101111111' return def test_m1(self): parser = self.get_parser('10101') parser._do_pass() - assert_equal(parser._curpos, 2) + assert parser._curpos == 2 parser._do_pass() - assert_equal(parser._curpos, 4) - assert_equal(parser._get_bits(), '1111') + assert parser._curpos == 4 + assert parser._get_bits() == '1111' return def test_m2(self): @@ -142,7 +140,7 @@ class TestCCITTG4Parser(): parser._do_vertical(-1) parser._do_vertical(1) parser._do_horizontal(1, 1) - assert_equal(parser._get_bits(), '011101') + assert parser._get_bits() == '011101' return def test_m3(self): @@ -151,7 +149,7 @@ class TestCCITTG4Parser(): parser._do_pass() parser._do_vertical(1) parser._do_vertical(1) - assert_equal(parser._get_bits(), '00000001') + assert parser._get_bits() == '00000001' return @@ -159,5 +157,5 @@ class TestCCITTFaxDecoder: def test_b1(self): decoder = CCITTFaxDecoder(5) decoder.output_line(0, b'0') - assert_equal(decoder.close(), b'\x80') + assert decoder.close() == b'\x80' return diff --git a/tests/test_pdfminer_crypto.py b/tests/test_pdfminer_crypto.py index 0126393..cb7f733 100644 --- a/tests/test_pdfminer_crypto.py +++ b/tests/test_pdfminer_crypto.py @@ -1,7 +1,6 @@ """Test of various compression/encoding modules (previously in doctests) """ import binascii -from nose.tools import assert_equal from pdfminer.arcfour import Arcfour from pdfminer.ascii85 import asciihexdecode, ascii85decode @@ -23,37 +22,32 @@ class TestAscii85(): def test_ascii85decode(self): """The sample string is taken from: http://en.wikipedia.org/w/index.php?title=Ascii85""" - assert_equal(ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q'), - b'Man is distinguished') - assert_equal(ascii85decode(b'E,9)oF*2M7/c~>'), - b'pleasure.') + assert ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') \ + == b'Man is distinguished' + assert ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.' def test_asciihexdecode(self): - assert_equal(asciihexdecode(b'61 62 2e6364 65'), - b'ab.cde') - assert_equal(asciihexdecode(b'61 62 2e6364 657>'), - b'ab.cdep') - assert_equal(asciihexdecode(b'7>'), - b'p') + assert asciihexdecode(b'61 62 2e6364 65') == b'ab.cde' + assert asciihexdecode(b'61 62 2e6364 657>') == b'ab.cdep' + assert asciihexdecode(b'7>') == b'p' class TestArcfour(): def test(self): - assert_equal(hex(Arcfour(b'Key').process(b'Plaintext')), - b'bbf316e8d940af0ad3') - assert_equal(hex(Arcfour(b'Wiki').process(b'pedia')), - b'1021bf0420') - assert_equal(hex(Arcfour(b'Secret').process(b'Attack at dawn')), - b'45a01f645fc35b383552544b9bf5') + assert hex(Arcfour(b'Key').process(b'Plaintext')) \ + == b'bbf316e8d940af0ad3' + assert hex(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420' + assert hex(Arcfour(b'Secret').process(b'Attack at dawn')) \ + == b'45a01f645fc35b383552544b9bf5' class TestLzw(): def test_lzwdecode(self): - assert_equal(lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'), - b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42') + assert lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') \ + == b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' class TestRunlength(): def test_rldecode(self): - assert_equal(rldecode(b'\x05123456\xfa7\x04abcde\x80junk'), - b'1234567777777abcde') + assert rldecode(b'\x05123456\xfa7\x04abcde\x80junk') \ + == b'1234567777777abcde' diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index 5d3d339..12f1d70 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -1,7 +1,5 @@ import logging -from nose.tools import assert_equal - from pdfminer.psparser import KWD, LIT, PSBaseParser, PSStackParser, PSEOF logger = logging.getLogger(__name__) @@ -92,11 +90,11 @@ func/a/b{(c)do*}def def test_1(self): tokens = self.get_tokens(self.TESTDATA) logger.info(tokens) - assert_equal(tokens, self.TOKENS) + assert tokens == self.TOKENS return def test_2(self): objs = self.get_objects(self.TESTDATA) logger.info(objs) - assert_equal(objs, self.OBJS) + assert objs == self.OBJS return diff --git a/tests/test_pdfpage.py b/tests/test_pdfpage.py index 06574c3..0c0c6a6 100644 --- a/tests/test_pdfpage.py +++ b/tests/test_pdfpage.py @@ -1,9 +1,7 @@ -from nose.tools import assert_equal - from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfparser import PDFParser from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser class TestPdfPage(object): @@ -15,4 +13,4 @@ class TestPdfPage(object): parser = PDFParser(fp) doc = PDFDocument(parser) for (i, page) in enumerate(PDFPage.create_pages(doc)): - assert_equal(page.label, expected_labels[i]) + assert page.label == expected_labels[i] diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index abe6718..9c71782 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -1,6 +1,7 @@ import unittest -import logging -from nose.tools import raises + +import pytest + from helpers import absolute_sample_path from tempfilepath import TemporaryFilePath from tools import dumppdf @@ -46,12 +47,12 @@ class TestDumpPDF(unittest.TestCase): def test_6(self): run('nonfree/naacl06-shinyama.pdf', '-t -a') - @raises(TypeError) def test_simple1_raw(self): """Known issue: crash in dumpxml writing binary to text stream.""" - run('simple1.pdf', '-r -a') + with pytest.raises(TypeError): + run('simple1.pdf', '-r -a') - @raises(TypeError) def test_simple1_binary(self): """Known issue: crash in dumpxml writing binary to text stream.""" - run('simple1.pdf', '-b -a') + with pytest.raises(TypeError): + run('simple1.pdf', '-b -a') diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index f73bc0e..f951a32 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -46,10 +46,10 @@ class TestPdf2Txt(): run('nonfree/dmca.pdf') def test_nonfree_f1040nr(self): - run('nonfree/f1040nr.pdf') + run('nonfree/f1040nr.pdf', '-p 1') def test_nonfree_i1040nr(self): - run('nonfree/i1040nr.pdf') + run('nonfree/i1040nr.pdf', '-p 1') def test_nonfree_kampo(self): run('nonfree/kampo.pdf') @@ -58,7 +58,7 @@ class TestPdf2Txt(): run('nonfree/naacl06-shinyama.pdf') def test_nlp2004slides(self): - run('nonfree/nlp2004slides.pdf') + run('nonfree/nlp2004slides.pdf', '-p 1') def test_contrib_2b(self): run('contrib/2b.pdf', '-A -t xml') @@ -116,11 +116,11 @@ class TestPdf2Txt(): class TestDumpImages: @staticmethod - def extract_images(input_file): + def extract_images(input_file, *args): output_dir = mkdtemp() with TemporaryFilePath() as output_file_name: commands = ['-o', output_file_name, '--output-dir', - output_dir, input_file] + output_dir, input_file, *args] pdf2txt.main(commands) image_files = os.listdir(output_dir) rmtree(output_dir) @@ -132,8 +132,8 @@ class TestDumpImages: Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131 """ - image_files = self.extract_images( - absolute_sample_path('../samples/nonfree/dmca.pdf')) + filepath = absolute_sample_path('../samples/nonfree/dmca.pdf') + image_files = self.extract_images(filepath, '-p', '1') assert image_files[0].endswith('bmp') def test_nonfree_175(self): diff --git a/tests/test_utils.py b/tests/test_utils.py index 6c32181..d745256 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,61 +1,63 @@ -from nose.tools import assert_equal, assert_raises import pathlib +import pytest + from helpers import absolute_sample_path from pdfminer.layout import LTComponent -from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename, - Plane, shorten_str) +from pdfminer.utils import open_filename, Plane, shorten_str, \ + format_int_roman, format_int_alpha class TestOpenFilename: def test_string_input(self): filename = absolute_sample_path("simple1.pdf") opened = open_filename(filename) - assert_equal(opened.closing, True) + assert opened.closing def test_pathlib_input(self): filename = pathlib.Path(absolute_sample_path("simple1.pdf")) opened = open_filename(filename) - assert_equal(opened.closing, True) + assert opened.closing def test_file_input(self): filename = absolute_sample_path("simple1.pdf") with open(filename, "rb") as in_file: opened = open_filename(in_file) - assert_equal(opened.file_handler, in_file) + assert opened.file_handler == in_file def test_unsupported_input(self): - assert_raises(TypeError, open_filename, 0) + with pytest.raises(TypeError): + open_filename(0) class TestPlane: def test_find_nothing_in_empty_bbox(self): plane, _ = self.given_plane_with_one_object() result = list(plane.find((50, 50, 100, 100))) - assert_equal(result, []) + assert result == [] def test_find_nothing_after_removing(self): plane, obj = self.given_plane_with_one_object() plane.remove(obj) result = list(plane.find((0, 0, 100, 100))) - assert_equal(result, []) + assert result == [] def test_find_object_in_whole_plane(self): plane, obj = self.given_plane_with_one_object() result = list(plane.find((0, 0, 100, 100))) - assert_equal(result, [obj]) + assert result == [obj] def test_find_if_object_is_smaller_than_gridsize(self): plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100) result = list(plane.find((0, 0, 100, 100))) - assert_equal(result, [obj]) + assert result == [obj] def test_find_object_if_much_larger_than_gridsize(self): plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10) result = list(plane.find((0, 0, 100, 100))) - assert_equal(result, [obj]) + assert result == [obj] @staticmethod def given_plane_with_one_object(object_size=50, gridsize=50): @@ -69,42 +71,42 @@ class TestPlane: class TestFunctions(object): def test_shorten_str(self): s = shorten_str('Hello there World', 15) - assert_equal(s, 'Hello ... World') + assert s == 'Hello ... World' def test_shorten_short_str_is_same(self): s = 'Hello World' - assert_equal(s, shorten_str(s, 50)) + assert shorten_str(s, 50) == s def test_shorten_to_really_short(self): - assert_equal('Hello', shorten_str('Hello World', 5)) + assert shorten_str('Hello World', 5) == 'Hello' def test_format_int_alpha(self): - assert_equal('a', format_int_alpha(1)) - assert_equal('b', format_int_alpha(2)) - assert_equal('z', format_int_alpha(26)) - assert_equal('aa', format_int_alpha(27)) - assert_equal('ab', format_int_alpha(28)) - assert_equal('az', format_int_alpha(26*2)) - assert_equal('ba', format_int_alpha(26*2 + 1)) - assert_equal('zz', format_int_alpha(26*27)) - assert_equal('aaa', format_int_alpha(26*27 + 1)) + assert format_int_alpha(1) == 'a' + assert format_int_alpha(2) == 'b' + assert format_int_alpha(26) == 'z' + assert format_int_alpha(27) == 'aa' + assert format_int_alpha(28) == 'ab' + assert format_int_alpha(26 * 2) == 'az' + assert format_int_alpha(26 * 2 + 1) == 'ba' + assert format_int_alpha(26 * 27) == 'zz' + assert format_int_alpha(26 * 27 + 1) == 'aaa' def test_format_int_roman(self): - assert_equal('i', format_int_roman(1)) - assert_equal('ii', format_int_roman(2)) - assert_equal('iii', format_int_roman(3)) - assert_equal('iv', format_int_roman(4)) - assert_equal('v', format_int_roman(5)) - assert_equal('vi', format_int_roman(6)) - assert_equal('vii', format_int_roman(7)) - assert_equal('viii', format_int_roman(8)) - assert_equal('ix', format_int_roman(9)) - assert_equal('x', format_int_roman(10)) - assert_equal('xi', format_int_roman(11)) - assert_equal('xx', format_int_roman(20)) - assert_equal('xl', format_int_roman(40)) - assert_equal('xlv', format_int_roman(45)) - assert_equal('l', format_int_roman(50)) - assert_equal('xc', format_int_roman(90)) - assert_equal('xci', format_int_roman(91)) - assert_equal('c', format_int_roman(100)) + assert format_int_roman(1) == 'i' + assert format_int_roman(2) == 'ii' + assert format_int_roman(3) == 'iii' + assert format_int_roman(4) == 'iv' + assert format_int_roman(5) == 'v' + assert format_int_roman(6) == 'vi' + assert format_int_roman(7) == 'vii' + assert format_int_roman(8) == 'viii' + assert format_int_roman(9) == 'ix' + assert format_int_roman(10) == 'x' + assert format_int_roman(11) == 'xi' + assert format_int_roman(20) == 'xx' + assert format_int_roman(40) == 'xl' + assert format_int_roman(45) == 'xlv' + assert format_int_roman(50) == 'l' + assert format_int_roman(90) == 'xc' + assert format_int_roman(91) == 'xci' + assert format_int_roman(100) == 'c' diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index f42ea50..b5760c8 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -202,7 +202,10 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace: parsed_args.page_numbers = {x-1 for x in parsed_args.page_numbers} if parsed_args.pagenos: - parsed_args.page_numbers = {int(x)-1 for x in parsed_args.pagenos.split(",")} + parsed_args.page_numbers = { + int(x) - 1 + for x in parsed_args.pagenos.split(",") + } if parsed_args.output_type == "text" and parsed_args.outfile != "-": for override, alttype in OUTPUT_TYPES: diff --git a/tox.ini b/tox.ini deleted file mode 100644 index af5d36f..0000000 --- a/tox.ini +++ /dev/null @@ -1,33 +0,0 @@ -[tox] -envlist = py{36,37,38,39}-{nose,flake8,mypy,docs} - -[testenv:py{36,37,38,39}-nose] -deps = - nose -allowlist_externals = - nosetests -commands = - nosetests --nologcapture - -[testenv:py{36,37,38,39}-flake8] -deps = - flake8 -allowlist_externals = - flake8 -commands = - flake8 pdfminer/ tools/ tests/ --count --statistics - -[testenv:py{36,37,38,39}-mypy] -deps = - mypy -allowlist_externals = - mypy -commands = - mypy --install-types --non-interactive --show-error-codes . - -[testenv:py{36,37,38,39}-docs] -extras = - docs -commands = - python -m sphinx -b html docs/source docs/build/html - python -m sphinx -b doctest docs/source docs/build/doctest