pdfminer.six/tests/test_tools_pdf2txt.py

import os
from shutil import rmtree
from tempfile import NamedTemporaryFile, mkdtemp

import nose

import tools.pdf2txt as pdf2txt


def full_path(relative_path_to_this_file):
    """Resolve a path that is written relative to this test file.

    The argument is joined onto the directory containing this module and the
    result is returned as a normalised absolute path.
    """
    here, _ = os.path.split(os.path.abspath(__file__))
    joined = os.path.join(here, relative_path_to_this_file)
    return os.path.abspath(joined)


def run(datapath, filename, options=None):
    """Run pdf2txt on one sample PDF.

    The input is ``<datapath><filename>.pdf`` and the ``-o`` target is
    ``<filename>.txt``; both are resolved relative to this file's directory
    via ``full_path``.

    :param datapath: directory prefix of the sample (including the trailing
        separator), relative to this file.
    :param filename: sample name without the ``.pdf`` extension.
    :param options: optional string of extra command-line options, separated
        by whitespace (e.g. ``'-A -t xml'``).
    """
    input_path = full_path(datapath + filename + '.pdf')
    output_path = full_path(filename + '.txt')
    # Build the argument list directly instead of formatting one string and
    # splitting it on spaces: a checkout path that contains a space would
    # otherwise be torn into several bogus arguments.
    args = ['-o', output_path]
    if options:
        args.extend(options.split())
    args.append(input_path)
    pdf2txt.main(args)


class TestDumpPDF:
    """Run pdf2txt over the sample PDFs.

    The tests make no assertions on the output; each one relies on ``run``
    completing without raising.
    """

    def test_1(self):
        for sample in ('jo', 'simple1', 'simple2', 'simple3',
                       'sampleOneByteIdentityEncode'):
            run('../samples/', sample)

    def test_2(self):
        run('../samples/nonfree/', 'dmca')

    def test_3(self):
        run('../samples/nonfree/', 'f1040nr')

    def test_4(self):
        run('../samples/nonfree/', 'i1040nr')

    def test_5(self):
        run('../samples/nonfree/', 'kampo')

    def test_6(self):
        run('../samples/nonfree/', 'naacl06-shinyama')

    # Disabled: this test works on Windows, but on Linux & Travis-CI it
    # fails with "PDFSyntaxError: No /Root object! - Is this really a PDF?".
    # TODO: Find why
    """
    def test_7(self):
        run('../samples/contrib/','stamp-no')
    """

    def test_8(self):
        run('../samples/contrib/', '2b', '-A -t xml')

    def test_9(self):
        # https://github.com/pdfminer/pdfminer.six/issues/65
        run('../samples/nonfree/', '175')

    def test_10(self):
        # https://github.com/euske/pdfminer/issues/96
        run('../samples/scancode/', 'patchelf')


class TestDumpImages(object):
    """Tests for pdf2txt's image extraction (``--output-dir``)."""

    def extract_images(self, input_file):
        """Run pdf2txt with ``--output-dir`` and list what it produced.

        :param input_file: path of the PDF to process.
        :return: names of the files found in the temporary output directory
            after pdf2txt has run (in ``os.listdir`` order).
        """
        output_dir = mkdtemp()
        try:
            with NamedTemporaryFile() as output_file:
                commands = ['-o', output_file.name,
                            '--output-dir', output_dir, input_file]
                pdf2txt.main(commands)
            return os.listdir(output_dir)
        finally:
            # Always remove the scratch directory, even when pdf2txt raises;
            # otherwise every failing run leaves a directory behind.
            rmtree(output_dir)

    def test_nonfree_dmca(self):
        """Extract images of pdf containing bmp images

        Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
        """
        image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf'))
        # Fail with a readable message instead of an IndexError when nothing
        # was extracted at all.
        assert image_files, 'no images were extracted'
        assert image_files[0].endswith('bmp')

    def test_nonfree_175(self):
        """Extract images of pdf containing jpg images"""
        self.extract_images(full_path('../samples/nonfree/175.pdf'))


# When this file is executed as a script, hand the module over to nose.
if __name__ == '__main__':
    nose.runmodule()
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`import os`
			`from shutil import rmtree`
			`from tempfile import NamedTemporaryFile, mkdtemp`
Removing all the "#!/usr/bin/env python" lines, they do not need for … (#34) * Removing all the "#!/usr/bin/env python" lines, they do not need for python3, solving issue number: #19. * Restored all the shebangs in the tools and tests folders (because they are real executables) but used "#!/usr/bin/env python" instead of "#!/usr/bin/python" as this blog points out: https://www.peterbe.com/plog/importance-of-env Removed also the shebang from pdfminer/psparser.py file. 2016-11-08 19:01:11 +00:00
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`import nose`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00
			`import tools.pdf2txt as pdf2txt`


Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`def full_path(relative_path_to_this_file):`
			`this_file_dir = os.path.dirname(os.path.abspath(__file__))`
			`abspath = os.path.abspath(os.path.join(this_file_dir, relative_path_to_this_file))`
			`return abspath`


			`def run(datapath, filename, options=None):`
			`i = full_path(datapath + filename + '.pdf')`
			`o = full_path(filename + '.txt')`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`if options:`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`s = 'pdf2txt -o%s %s %s' % (o, options, i)`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`else:`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`s = 'pdf2txt -o%s %s' % (o, i)`
Progress, progress.. not nearly atomic enough, sorry. 2015-05-30 15:14:24 +00:00			`pdf2txt.main(s.split(' ')[1:])`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00
Python 3.4 compatibility + tests 2014-09-04 07:36:19 +00:00			`class TestDumpPDF():`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00
			`def test_1(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/', 'jo')`
			`run('../samples/', 'simple1')`
			`run('../samples/', 'simple2')`
			`run('../samples/', 'simple3')`
Adds Test Case 2019-08-10 04:49:20 +00:00			`run('../samples/','sampleOneByteIdentityEncode')`
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`def test_2(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', 'dmca')`
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`def test_3(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', 'f1040nr')`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00
			`def test_4(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', 'i1040nr')`
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`def test_5(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', 'kampo')`
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`def test_6(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', 'naacl06-shinyama')`
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00			`# this test works on Windows but on Linux & Travis-CI it says`
			`# PDFSyntaxError: No /Root object! - Is this really a PDF?`
			`# TODO: Find why`
			`"""`
solves https://github.com/pdfminer/pdfminer.six/issues/50 2017-04-18 16:20:31 +00:00			`def test_7(self):`
			`run('../samples/contrib/','stamp-no')`
new test fails on Linux & TRavis-CI. TODO: find why 2017-04-18 16:28:48 +00:00			`"""`
Add a test for the previous fix 2017-10-16 10:05:39 +00:00
issue #56 reproduced, solution attempt unsucessful 2017-04-19 12:19:14 +00:00			`def test_8(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/contrib/', '2b', '-A -t xml')`
solves https://github.com/pdfminer/pdfminer.six/issues/65 2017-07-20 19:17:06 +00:00
			`def test_9(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/nonfree/', '175') # https://github.com/pdfminer/pdfminer.six/issues/65`
Add a test for the previous fix 2017-10-16 10:05:39 +00:00
			`def test_10(self):`
Changed: comparations for image colorspace literals (#132) Fixes #131 Changed: comparations for image colorspace literals Added: test for extracting images from pdfs 2019-10-15 14:11:54 +00:00			`run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96`


			`class TestDumpImages(object):`

			`def extract_images(self, input_file):`
			`output_dir = mkdtemp()`
			`with NamedTemporaryFile() as output_file:`
			`commands = ['-o', output_file.name, '--output-dir', output_dir, input_file]`
			`pdf2txt.main(commands)`
			`image_files = os.listdir(output_dir)`
			`rmtree(output_dir)`
			`return image_files`

			`def test_nonfree_dmca(self):`
			`"""Extract images of pdf containing bmp images`

			`Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131`
			`"""`
			`image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf'))`
			`assert image_files[0].endswith('bmp')`

			`def test_nonfree_175(self):`
			`"""Extract images of pdf containing jpg images"""`
			`self.extract_images(full_path('../samples/nonfree/175.pdf'))`

Add a test for the previous fix 2017-10-16 10:05:39 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`if __name__ == '__main__':`
Python 3.4 compatibility + tests 2014-09-04 07:36:19 +00:00			`nose.runmodule()`