From 4df6d4e5caab3ccb98f288a763e3bee2868a148f Mon Sep 17 00:00:00 2001 From: "D.A.Bashkirtsev" Date: Tue, 15 Oct 2019 19:11:54 +0500 Subject: [PATCH] Changed: comparisons for image colorspace literals (#132) Fixes #131 Changed: comparisons for image colorspace literals Added: test for extracting images from PDFs --- pdfminer/image.py | 6 +-- tests/test_tools_pdf2txt.py | 75 ++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 25 deletions(-) diff --git a/pdfminer/image.py b/pdfminer/image.py index e85815c..39265fb 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -74,7 +74,7 @@ class ImageWriter(object): if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: ext = '.jpg' elif (image.bits == 1 or - image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)): + image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)): ext = '.%dx%d.bmp' % (width, height) else: ext = '.%d.%dx%d.img' % (image.bits, width, height) @@ -101,7 +101,7 @@ class ImageWriter(object): for y in range(height): bmp.write_line(y, data[i:i+width]) i += width - elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB: + elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: bmp = BMPWriter(fp, 24, width, height) data = stream.get_data() i = 0 @@ -109,7 +109,7 @@ class ImageWriter(object): for y in range(height): bmp.write_line(y, data[i:i+width]) i += width - elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY: + elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: bmp = BMPWriter(fp, 8, width, height) data = stream.get_data() i = 0 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 6126d92..188f652 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,45 +1,51 @@ -#!/usr/bin/env python +import os +from shutil import rmtree +from tempfile import NamedTemporaryFile, mkdtemp -# -*- coding: utf-8 -*- - 
-import nose, logging, os +import nose import tools.pdf2txt as pdf2txt -path=os.path.dirname(os.path.abspath(__file__))+'/' -def run(datapath,filename,options=None): - i=path+datapath+filename+'.pdf' - o=path+filename+'.txt' +def full_path(relative_path_to_this_file): + this_file_dir = os.path.dirname(os.path.abspath(__file__)) + abspath = os.path.abspath(os.path.join(this_file_dir, relative_path_to_this_file)) + return abspath + + +def run(datapath, filename, options=None): + i = full_path(datapath + filename + '.pdf') + o = full_path(filename + '.txt') if options: - s='pdf2txt -o%s %s %s'%(o,options,i) + s = 'pdf2txt -o%s %s %s' % (o, options, i) else: - s='pdf2txt -o%s %s'%(o,i) + s = 'pdf2txt -o%s %s' % (o, i) pdf2txt.main(s.split(' ')[1:]) + class TestDumpPDF(): def test_1(self): - run('../samples/','jo') - run('../samples/','simple1') - run('../samples/','simple2') - run('../samples/','simple3') + run('../samples/', 'jo') + run('../samples/', 'simple1') + run('../samples/', 'simple2') + run('../samples/', 'simple3') run('../samples/','sampleOneByteIdentityEncode') def test_2(self): - run('../samples/nonfree/','dmca') + run('../samples/nonfree/', 'dmca') def test_3(self): - run('../samples/nonfree/','f1040nr') + run('../samples/nonfree/', 'f1040nr') def test_4(self): - run('../samples/nonfree/','i1040nr') + run('../samples/nonfree/', 'i1040nr') def test_5(self): - run('../samples/nonfree/','kampo') + run('../samples/nonfree/', 'kampo') def test_6(self): - run('../samples/nonfree/','naacl06-shinyama') + run('../samples/nonfree/', 'naacl06-shinyama') # this test works on Windows but on Linux & Travis-CI it says # PDFSyntaxError: No /Root object! - Is this really a PDF? 
@@ -50,13 +56,38 @@ class TestDumpPDF(): """ def test_8(self): - run('../samples/contrib/','2b','-A -t xml') + run('../samples/contrib/', '2b', '-A -t xml') def test_9(self): - run('../samples/nonfree/','175') # https://github.com/pdfminer/pdfminer.six/issues/65 + run('../samples/nonfree/', '175') # https://github.com/pdfminer/pdfminer.six/issues/65 def test_10(self): - run('../samples/scancode/','patchelf') # https://github.com/euske/pdfminer/issues/96 + run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96 + + +class TestDumpImages(object): + + def extract_images(self, input_file): + output_dir = mkdtemp() + with NamedTemporaryFile() as output_file: + commands = ['-o', output_file.name, '--output-dir', output_dir, input_file] + pdf2txt.main(commands) + image_files = os.listdir(output_dir) + rmtree(output_dir) + return image_files + + def test_nonfree_dmca(self): + """Extract images of pdf containing bmp images + + Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131 + """ + image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf')) + assert image_files[0].endswith('bmp') + + def test_nonfree_175(self): + """Extract images of pdf containing jpg images""" + self.extract_images(full_path('../samples/nonfree/175.pdf')) + if __name__ == '__main__': nose.runmodule()