Changed: comparisons for image colorspace literals (#132)

Fixes #131 

Changed: comparisons for image colorspace literals
Added: test for extracting images from pdfs
pull/306/head
D.A.Bashkirtsev 2019-10-15 19:11:54 +05:00 committed by Pieter Marsman
parent 63b2e09ac3
commit 4df6d4e5ca
2 changed files with 56 additions and 25 deletions

View File

@ -74,7 +74,7 @@ class ImageWriter(object):
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg'
elif (image.bits == 1 or
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
ext = '.%dx%d.bmp' % (width, height)
else:
ext = '.%d.%dx%d.img' % (image.bits, width, height)
@ -101,7 +101,7 @@ class ImageWriter(object):
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
bmp = BMPWriter(fp, 24, width, height)
data = stream.get_data()
i = 0
@ -109,7 +109,7 @@ class ImageWriter(object):
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
bmp = BMPWriter(fp, 8, width, height)
data = stream.get_data()
i = 0

View File

@ -1,22 +1,28 @@
#!/usr/bin/env python
import os
from shutil import rmtree
from tempfile import NamedTemporaryFile, mkdtemp
# -*- coding: utf-8 -*-
import nose, logging, os
import nose
import tools.pdf2txt as pdf2txt
path=os.path.dirname(os.path.abspath(__file__))+'/'
def full_path(relative_path_to_this_file):
    """Return the absolute path of a location given relative to this file.

    :param relative_path_to_this_file: path expressed relative to the
        directory that contains this module (``..`` segments are allowed).
    :return: the normalized absolute path as a string.
    """
    this_file_dir = os.path.dirname(os.path.abspath(__file__))
    # abspath() also collapses any ``..`` segments introduced by the join.
    return os.path.abspath(
        os.path.join(this_file_dir, relative_path_to_this_file))
def run(datapath, filename, options=None):
    """Run ``pdf2txt`` on a sample PDF and write the text next to this file.

    :param datapath: directory of the sample, relative to this file. It is
        concatenated directly with ``filename``, so it must end with a
        path separator.
    :param filename: sample name without the ``.pdf`` extension; the output
        is written to ``<filename>.txt`` in this file's directory.
    :param options: optional extra command-line options, given as a single
        whitespace-separated string.
    """
    input_path = full_path(datapath + filename + '.pdf')
    output_path = full_path(filename + '.txt')
    # Build argv as a list instead of formatting one string and splitting it
    # on spaces: splitting would cut a path that contains a space into
    # several arguments and turn repeated spaces into empty arguments.
    args = ['-o%s' % output_path]
    if options:
        args.extend(options.split())
    args.append(input_path)
    pdf2txt.main(args)
class TestDumpPDF():
def test_1(self):
@ -58,5 +64,30 @@ class TestDumpPDF():
def test_10(self):
run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96
class TestDumpImages(object):
    """Tests for image extraction through ``pdf2txt --output-dir``."""

    def extract_images(self, input_file):
        """Run pdf2txt on ``input_file`` and list the files it extracted.

        The text output goes to a throw-away temporary file and the images
        to a temporary directory that is removed before returning.

        :param input_file: path of the PDF to process.
        :return: names of the files found in the image output directory.
        """
        output_dir = mkdtemp()
        try:
            with NamedTemporaryFile() as output_file:
                commands = ['-o', output_file.name,
                            '--output-dir', output_dir, input_file]
                pdf2txt.main(commands)
            image_files = os.listdir(output_dir)
        finally:
            # Always remove the directory, even when pdf2txt raises, so a
            # failing run does not leave temporary directories behind.
            rmtree(output_dir)
        return image_files

    def test_nonfree_dmca(self):
        """Extract images of pdf containing bmp images

        Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
        """
        image_files = self.extract_images(
            full_path('../samples/nonfree/dmca.pdf'))
        # Fail with a clear message instead of an IndexError when nothing
        # was extracted at all.
        assert image_files, 'no images were extracted'
        assert image_files[0].endswith('bmp')

    def test_nonfree_175(self):
        """Extract images of pdf containing jpg images

        Only checks that the extraction completes without raising.
        """
        self.extract_images(full_path('../samples/nonfree/175.pdf'))
# Allow this test module to be executed directly as a script; nose then
# runs the tests defined in this module.
if __name__ == '__main__':
nose.runmodule()