Changed: comparisons for image colorspace literals (#132)
Fixes #131
Changed: comparisons for image colorspace literals
Added: test for extracting images from pdfs
pull/306/head
parent
63b2e09ac3
commit
4df6d4e5ca
|
@ -74,7 +74,7 @@ class ImageWriter(object):
|
||||||
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
||||||
ext = '.jpg'
|
ext = '.jpg'
|
||||||
elif (image.bits == 1 or
|
elif (image.bits == 1 or
|
||||||
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
|
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||||
ext = '.%dx%d.bmp' % (width, height)
|
ext = '.%dx%d.bmp' % (width, height)
|
||||||
else:
|
else:
|
||||||
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
||||||
|
@ -101,7 +101,7 @@ class ImageWriter(object):
|
||||||
for y in range(height):
|
for y in range(height):
|
||||||
bmp.write_line(y, data[i:i+width])
|
bmp.write_line(y, data[i:i+width])
|
||||||
i += width
|
i += width
|
||||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
|
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
||||||
bmp = BMPWriter(fp, 24, width, height)
|
bmp = BMPWriter(fp, 24, width, height)
|
||||||
data = stream.get_data()
|
data = stream.get_data()
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -109,7 +109,7 @@ class ImageWriter(object):
|
||||||
for y in range(height):
|
for y in range(height):
|
||||||
bmp.write_line(y, data[i:i+width])
|
bmp.write_line(y, data[i:i+width])
|
||||||
i += width
|
i += width
|
||||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
|
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
||||||
bmp = BMPWriter(fp, 8, width, height)
|
bmp = BMPWriter(fp, 8, width, height)
|
||||||
data = stream.get_data()
|
data = stream.get_data()
|
||||||
i = 0
|
i = 0
|
||||||
|
|
|
@ -1,45 +1,51 @@
|
||||||
#!/usr/bin/env python
|
import os
|
||||||
|
from shutil import rmtree
|
||||||
|
from tempfile import NamedTemporaryFile, mkdtemp
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
import nose
|
||||||
|
|
||||||
import nose, logging, os
|
|
||||||
|
|
||||||
import tools.pdf2txt as pdf2txt
|
import tools.pdf2txt as pdf2txt
|
||||||
|
|
||||||
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
|
||||||
|
|
||||||
def run(datapath,filename,options=None):
|
def full_path(relative_path_to_this_file):
|
||||||
i=path+datapath+filename+'.pdf'
|
this_file_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
o=path+filename+'.txt'
|
abspath = os.path.abspath(os.path.join(this_file_dir, relative_path_to_this_file))
|
||||||
|
return abspath
|
||||||
|
|
||||||
|
|
||||||
|
def run(datapath, filename, options=None):
|
||||||
|
i = full_path(datapath + filename + '.pdf')
|
||||||
|
o = full_path(filename + '.txt')
|
||||||
if options:
|
if options:
|
||||||
s='pdf2txt -o%s %s %s'%(o,options,i)
|
s = 'pdf2txt -o%s %s %s' % (o, options, i)
|
||||||
else:
|
else:
|
||||||
s='pdf2txt -o%s %s'%(o,i)
|
s = 'pdf2txt -o%s %s' % (o, i)
|
||||||
pdf2txt.main(s.split(' ')[1:])
|
pdf2txt.main(s.split(' ')[1:])
|
||||||
|
|
||||||
|
|
||||||
class TestDumpPDF():
|
class TestDumpPDF():
|
||||||
|
|
||||||
def test_1(self):
|
def test_1(self):
|
||||||
run('../samples/','jo')
|
run('../samples/', 'jo')
|
||||||
run('../samples/','simple1')
|
run('../samples/', 'simple1')
|
||||||
run('../samples/','simple2')
|
run('../samples/', 'simple2')
|
||||||
run('../samples/','simple3')
|
run('../samples/', 'simple3')
|
||||||
run('../samples/','sampleOneByteIdentityEncode')
|
run('../samples/','sampleOneByteIdentityEncode')
|
||||||
|
|
||||||
def test_2(self):
|
def test_2(self):
|
||||||
run('../samples/nonfree/','dmca')
|
run('../samples/nonfree/', 'dmca')
|
||||||
|
|
||||||
def test_3(self):
|
def test_3(self):
|
||||||
run('../samples/nonfree/','f1040nr')
|
run('../samples/nonfree/', 'f1040nr')
|
||||||
|
|
||||||
def test_4(self):
|
def test_4(self):
|
||||||
run('../samples/nonfree/','i1040nr')
|
run('../samples/nonfree/', 'i1040nr')
|
||||||
|
|
||||||
def test_5(self):
|
def test_5(self):
|
||||||
run('../samples/nonfree/','kampo')
|
run('../samples/nonfree/', 'kampo')
|
||||||
|
|
||||||
def test_6(self):
|
def test_6(self):
|
||||||
run('../samples/nonfree/','naacl06-shinyama')
|
run('../samples/nonfree/', 'naacl06-shinyama')
|
||||||
|
|
||||||
# this test works on Windows but on Linux & Travis-CI it says
|
# this test works on Windows but on Linux & Travis-CI it says
|
||||||
# PDFSyntaxError: No /Root object! - Is this really a PDF?
|
# PDFSyntaxError: No /Root object! - Is this really a PDF?
|
||||||
|
@ -50,13 +56,38 @@ class TestDumpPDF():
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def test_8(self):
|
def test_8(self):
|
||||||
run('../samples/contrib/','2b','-A -t xml')
|
run('../samples/contrib/', '2b', '-A -t xml')
|
||||||
|
|
||||||
def test_9(self):
|
def test_9(self):
|
||||||
run('../samples/nonfree/','175') # https://github.com/pdfminer/pdfminer.six/issues/65
|
run('../samples/nonfree/', '175') # https://github.com/pdfminer/pdfminer.six/issues/65
|
||||||
|
|
||||||
def test_10(self):
|
def test_10(self):
|
||||||
run('../samples/scancode/','patchelf') # https://github.com/euske/pdfminer/issues/96
|
run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96
|
||||||
|
|
||||||
|
|
||||||
|
class TestDumpImages(object):
|
||||||
|
|
||||||
|
def extract_images(self, input_file):
|
||||||
|
output_dir = mkdtemp()
|
||||||
|
with NamedTemporaryFile() as output_file:
|
||||||
|
commands = ['-o', output_file.name, '--output-dir', output_dir, input_file]
|
||||||
|
pdf2txt.main(commands)
|
||||||
|
image_files = os.listdir(output_dir)
|
||||||
|
rmtree(output_dir)
|
||||||
|
return image_files
|
||||||
|
|
||||||
|
def test_nonfree_dmca(self):
|
||||||
|
"""Extract images of pdf containing bmp images
|
||||||
|
|
||||||
|
Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
|
||||||
|
"""
|
||||||
|
image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf'))
|
||||||
|
assert image_files[0].endswith('bmp')
|
||||||
|
|
||||||
|
def test_nonfree_175(self):
|
||||||
|
"""Extract images of pdf containing jpg images"""
|
||||||
|
self.extract_images(full_path('../samples/nonfree/175.pdf'))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
nose.runmodule()
|
nose.runmodule()
|
||||||
|
|
Loading…
Reference in New Issue