diff --git a/CHANGELOG.md b/CHANGELOG.md
index dcd2350..dffe4af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
 
 ## [20200104] - 2019-01-04
 
diff --git a/samples/contrib/issue-00152-embedded-pdf.pdf b/samples/contrib/issue-00152-embedded-pdf.pdf
new file mode 100644
index 0000000..20813b9
Binary files /dev/null and b/samples/contrib/issue-00152-embedded-pdf.pdf differ
diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py
index bee7e3b..49298a8 100644
--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@@ -1,4 +1,5 @@
-from tempfile import NamedTemporaryFile
+from shutil import rmtree
+from tempfile import NamedTemporaryFile, mkdtemp
 
 from helpers import absolute_sample_path
 from tools import dumppdf
@@ -36,3 +37,15 @@ class TestDumpPDF():
 
     def test_6(self):
         run('nonfree/naacl06-shinyama.pdf', '-t -a')
+
+    def test_embedded_font_filename(self):
+        """Extracting embedded files must fall back to F when UF is absent
+
+        Related issue: https://github.com/pdfminer/pdfminer.six/issues/152
+        """
+        output_dir = mkdtemp()
+        try:
+            run('contrib/issue-00152-embedded-pdf.pdf',
+                '--extract-embedded %s' % output_dir)
+        finally:
+            rmtree(output_dir)
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index 30bc3c8..459e19b 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -17,7 +17,7 @@ def run(sample_path, options=None):
         pdf2txt.main(s.split(' ')[1:])
 
 
-class TestDumpPDF():
+class TestPdf2Txt():
 
     def test_jo(self):
         run('jo.pdf')
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index 0a1d94d..377e9b4 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -173,36 +173,75 @@ LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
 
 def extractembedded(outfp, fname, objids, pagenos, password='',
                     dumpall=False, codec=None, extractdir=None):
-    def extract1(obj):
-        filename = os.path.basename(obj['UF'] or obj['F'])
-        fileref = obj['EF']['F']
+    """Write every embedded file of the PDF *fname* into *extractdir*.
+
+    Every object id listed in the document's xref tables is looked up;
+    each dictionary whose Type is LITERAL_FILESPEC is extracted once and
+    saved as "<object id, zero-padded to six digits>-<file name>".
+
+    The parameters outfp, objids, pagenos, dumpall and codec are not
+    used by this function.
+
+    :raises PDFValueError: if a file specification has no file name or
+        no embedded file reference, or if the referenced object is not a
+        PDFStream whose Type is LITERAL_EMBEDDEDFILE.
+    :raises IOError: if the output file already exists.
+    """
+    def extract1(objid, obj):
+        # Prefer the Unicode name (UF) and fall back to F; either key may
+        # be missing, and the value may be bytes or str.
+        filename = obj.get('UF') or obj.get('F')
+        if filename is None:
+            raise PDFValueError(
+                'unable to process PDF: file specification %d has no '
+                'file name' % objid)
+        if isinstance(filename, bytes):
+            if filename.startswith(b'\xfe\xff'):
+                # A byte order mark denotes a UTF-16BE text string.
+                filename = filename[2:].decode('utf-16-be', 'replace')
+            else:
+                try:
+                    filename = filename.decode()
+                except UnicodeDecodeError:
+                    filename = filename.decode('latin-1')
+        filename = os.path.basename(filename)
+        embedded = obj.get('EF') or {}
+        fileref = embedded.get('UF') or embedded.get('F')
+        if fileref is None:
+            raise PDFValueError(
+                'unable to process PDF: no embedded file reference for %r'
+                % filename)
         fileobj = doc.getobj(fileref.objid)
         if not isinstance(fileobj, PDFStream):
-            raise PDFValueError(
-                'unable to process PDF: reference for %r is not a PDFStream' %
-                (filename))
+            error_msg = 'unable to process PDF: reference for %r is not a ' \
+                        'PDFStream' % filename
+            raise PDFValueError(error_msg)
         if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
             raise PDFValueError(
                 'unable to process PDF: reference for %r '
                 'is not an EmbeddedFile' % (filename))
-        path = os.path.join(extractdir, filename)
+        path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
         if os.path.exists(path):
             raise IOError('file exists: %r' % path)
         print('extracting: %r' % path)
-        out = open(path, 'wb')
-        out.write(fileobj.get_data())
-        out.close()
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        # The context manager closes the file even if writing fails.
+        with open(path, 'wb') as out:
+            out.write(fileobj.get_data())
         return
 
-    fp = open(fname, 'rb')
-    parser = PDFParser(fp)
-    doc = PDFDocument(parser, password)
-    for xref in doc.xrefs:
-        for objid in xref.get_objids():
-            obj = doc.getobj(objid)
-            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
-                extract1(obj)
-    fp.close()
+    with open(fname, 'rb') as fp:
+        parser = PDFParser(fp)
+        doc = PDFDocument(parser, password)
+        # Remember handled ids so that no object is extracted twice.
+        extracted_objids = set()
+        for xref in doc.xrefs:
+            for objid in xref.get_objids():
+                obj = doc.getobj(objid)
+                if objid not in extracted_objids and isinstance(obj, dict) \
+                        and obj.get('Type') is LITERAL_FILESPEC:
+                    extracted_objids.add(objid)
+                    extract1(objid, obj)
     return
 
 