2019-10-15 14:11:54 +00:00
|
|
|
import os
|
|
|
|
from shutil import rmtree
|
2020-10-26 09:10:11 +00:00
|
|
|
from tempfile import mkdtemp
|
2022-01-23 20:41:08 +00:00
|
|
|
import filecmp
|
2016-11-08 19:01:11 +00:00
|
|
|
|
2014-09-03 13:26:08 +00:00
|
|
|
import tools.pdf2txt as pdf2txt
|
2019-10-26 16:42:33 +00:00
|
|
|
from helpers import absolute_sample_path
|
2020-10-26 09:10:11 +00:00
|
|
|
from tempfilepath import TemporaryFilePath
|
2014-09-03 13:26:08 +00:00
|
|
|
|
|
|
|
|
2019-10-26 16:42:33 +00:00
|
|
|
def run(sample_path, options=None):
    """Run the ``pdf2txt`` entry point on a sample, writing to a temporary file.

    :param sample_path: path of the sample; it is resolved with
        ``absolute_sample_path`` before being handed to ``pdf2txt``.
    :param options: optional string of extra command line options. It is
        split on single spaces, so an individual option value cannot itself
        contain a space.
    """
    absolute_path = absolute_sample_path(sample_path)
    with TemporaryFilePath() as output_file_name:
        # Build the argument list directly rather than formatting a single
        # command string and splitting it on spaces: that approach broke the
        # output path or the sample path into several arguments whenever
        # either of them contained a space.
        arguments = ["-o{}".format(output_file_name)]
        if options:
            arguments.extend(options.split(" "))
        arguments.append(absolute_path)

        pdf2txt.main(arguments)
|
2019-10-15 14:11:54 +00:00
|
|
|
|
|
|
|
|
2022-02-11 21:46:51 +00:00
|
|
|
class TestPdf2Txt:
    """Smoke tests that call ``run`` on sample files.

    None of the tests asserts on the produced output; a test passes when the
    call to ``run`` returns without raising.
    """

    def test_jo(self):
        """Process the ``jo.pdf`` sample."""
        run(sample_path="jo.pdf")

    def test_simple1(self):
        """Process the ``simple1.pdf`` sample."""
        run(sample_path="simple1.pdf")

    def test_simple2(self):
        """Process the ``simple2.pdf`` sample."""
        run(sample_path="simple2.pdf")

    def test_simple3(self):
        """Process the ``simple3.pdf`` sample."""
        run(sample_path="simple3.pdf")

    def test_sample_one_byte_identity_encode(self):
        """Process the ``sampleOneByteIdentityEncode.pdf`` sample."""
        run(sample_path="sampleOneByteIdentityEncode.pdf")

    def test_nonfree_175(self):
        """Regression test for:
        https://github.com/pdfminer/pdfminer.six/issues/65
        """
        run(sample_path="nonfree/175.pdf")

    def test_nonfree_dmca(self):
        """Process the ``nonfree/dmca.pdf`` sample."""
        run(sample_path="nonfree/dmca.pdf")

    def test_nonfree_f1040nr(self):
        """Process ``nonfree/f1040nr.pdf`` with the options ``-p 1``."""
        run(sample_path="nonfree/f1040nr.pdf", options="-p 1")

    def test_nonfree_i1040nr(self):
        """Process ``nonfree/i1040nr.pdf`` with the options ``-p 1``."""
        run(sample_path="nonfree/i1040nr.pdf", options="-p 1")

    def test_nonfree_kampo(self):
        """Process the ``nonfree/kampo.pdf`` sample."""
        run(sample_path="nonfree/kampo.pdf")

    def test_nonfree_naacl06_shinyama(self):
        """Process the ``nonfree/naacl06-shinyama.pdf`` sample."""
        run(sample_path="nonfree/naacl06-shinyama.pdf")

    def test_nlp2004slides(self):
        """Process ``nonfree/nlp2004slides.pdf`` with the options ``-p 1``."""
        run(sample_path="nonfree/nlp2004slides.pdf", options="-p 1")

    def test_contrib_2b(self):
        """Process ``contrib/2b.pdf`` with the options ``-A -t xml``."""
        run(sample_path="contrib/2b.pdf", options="-A -t xml")

    def test_contrib_issue_350(self):
        """Regression test for
        https://github.com/pdfminer/pdfminer.six/issues/350"""
        run(sample_path="contrib/issue-00352-asw-oct96-p41.pdf")

    def test_scancode_patchelf(self):
        """Regression test for https://github.com/euske/pdfminer/issues/96"""
        run(sample_path="scancode/patchelf.pdf")

    def test_contrib_hash_two_complement(self):
        """Check that unsigned integer is added correctly to encryption hash.

        See https://github.com/pdfminer/pdfminer.six/issues/186
        """
        run(sample_path="contrib/issue-00352-hash-twos-complement.pdf")

    def test_contrib_excel(self):
        """Regression test for
        https://github.com/pdfminer/pdfminer.six/issues/369
        """
        run(sample_path="contrib/issue-00369-excel.pdf", options="-t html")

    def test_encryption_aes128(self):
        """Process ``encryption/aes-128.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/aes-128.pdf", options="-P foo")

    def test_encryption_aes128m(self):
        """Process ``encryption/aes-128-m.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/aes-128-m.pdf", options="-P foo")

    def test_encryption_aes256(self):
        """Process ``encryption/aes-256.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/aes-256.pdf", options="-P foo")

    def test_encryption_aes256m(self):
        """Process ``encryption/aes-256-m.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/aes-256-m.pdf", options="-P foo")

    def test_encryption_aes256_r6_user(self):
        """Process ``encryption/aes-256-r6.pdf`` with ``-P usersecret``."""
        run(sample_path="encryption/aes-256-r6.pdf", options="-P usersecret")

    def test_encryption_aes256_r6_owner(self):
        """Process ``encryption/aes-256-r6.pdf`` with ``-P ownersecret``."""
        run(sample_path="encryption/aes-256-r6.pdf", options="-P ownersecret")

    def test_encryption_base(self):
        """Process ``encryption/base.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/base.pdf", options="-P foo")

    def test_encryption_rc4_40(self):
        """Process ``encryption/rc4-40.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/rc4-40.pdf", options="-P foo")

    def test_encryption_rc4_128(self):
        """Process ``encryption/rc4-128.pdf`` with the options ``-P foo``."""
        run(sample_path="encryption/rc4-128.pdf", options="-P foo")
|
2020-07-20 20:00:54 +00:00
|
|
|
|
2019-10-15 14:11:54 +00:00
|
|
|
|
2020-01-04 15:47:07 +00:00
|
|
|
class TestDumpImages:
    """Tests that run ``pdf2txt`` with ``--output-dir`` to dump images."""

    @staticmethod
    def extract_images(input_file, *args):
        """Run ``pdf2txt`` with ``--output-dir`` and list the files it wrote.

        :param input_file: path of the pdf handed to ``pdf2txt``.
        :param args: extra command line arguments, appended after the input
            file.
        :return: the names (not full paths) of the entries found in the
            temporary output directory. The directory itself is removed
            before returning, so the files no longer exist for the caller.
        """
        output_dir = mkdtemp()
        # The directory from mkdtemp() is never cleaned up automatically, so
        # remove it in a finally clause: otherwise a failing pdf2txt run
        # leaves it behind.
        try:
            with TemporaryFilePath() as output_file_name:
                commands = [
                    "-o",
                    output_file_name,
                    "--output-dir",
                    output_dir,
                    input_file,
                    *args,
                ]
                pdf2txt.main(commands)
            return os.listdir(output_dir)
        finally:
            rmtree(output_dir)

    def test_nonfree_dmca(self):
        """Extract images of pdf containing bmp images

        Regression test for:
        https://github.com/pdfminer/pdfminer.six/issues/131
        """
        filepath = absolute_sample_path("../samples/nonfree/dmca.pdf")
        image_files = self.extract_images(filepath, "-p", "1")
        assert image_files[0].endswith("bmp")

    def test_nonfree_175(self):
        """Extract images of pdf containing jpg images"""
        # Only checks that extraction completes; the result is not inspected.
        self.extract_images(absolute_sample_path("../samples/nonfree/175.pdf"))

    def test_jbig2_image_export(self):
        """Extract images of pdf containing jbig2 images

        Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
        """
        input_file = absolute_sample_path("../samples/contrib/pdf-with-jbig2.pdf")
        # extract_images() is not used here because it deletes the output
        # directory before returning, and this test compares file contents.
        output_dir = mkdtemp()
        try:
            with TemporaryFilePath() as output_file_name:
                commands = [
                    "-o",
                    output_file_name,
                    "--output-dir",
                    output_dir,
                    input_file,
                ]
                pdf2txt.main(commands)
            image_files = os.listdir(output_dir)
            assert image_files[0].endswith(".jb2")
            assert filecmp.cmp(
                os.path.join(output_dir, image_files[0]),
                absolute_sample_path("../samples/contrib/XIPLAYER0.jb2"),
            )
        finally:
            # Covers failures of pdf2txt itself as well as failed assertions.
            rmtree(output_dir)

    def test_contrib_matplotlib(self):
        """Test a pdf with Type3 font"""
        run("contrib/matplotlib.pdf")

    def test_nonfree_cmp_itext_logo(self):
        """Test a pdf with Type3 font"""
        run("nonfree/cmp_itext_logo.pdf")
|