Fall back on the backwards-compatible key (F) for embedded file URLs when the Unicode URL (UF) does not exist (#338)

* Fix getting filename when extracting embedded files
* Add test for a PDF that contains an embedded PDF, and fix additional errors in looping over multiple xrefs
* Add line to CHANGELOG
2020-01-16 22:11:42 +01:00 · 2020-01-16 22:11:42 +01:00 · 2f7f5d2667
parent 0b1741b9bf
commit 2f7f5d2667
5 changed files with 35 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Fixed
 - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))

 ## [20200104] - 2019-01-04

--- a/samples/contrib/issue-00152-embedded-pdf.pdf
+++ b/samples/contrib/issue-00152-embedded-pdf.pdf
--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@ -1,4 +1,5 @@
-from tempfile import NamedTemporaryFile
+from shutil import rmtree
+from tempfile import NamedTemporaryFile, mkdtemp

 from helpers import absolute_sample_path
 from tools import dumppdf
@ -36,3 +37,15 @@ class TestDumpPDF():

    def test_6(self):
        run('nonfree/naacl06-shinyama.pdf', '-t -a')
+
+    def test_embedded_font_filename(self):
+        """If UF font file name does not exist, then F should be used
+
+        Related issue: https://github.com/pdfminer/pdfminer.six/issues/152
+        """
+        output_dir = mkdtemp()
+        try:
+            run('contrib/issue-00152-embedded-pdf.pdf',
+                '--extract-embedded %s' % output_dir)
+        finally:
+            rmtree(output_dir)
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -17,7 +17,7 @@ def run(sample_path, options=None):
        pdf2txt.main(s.split(' ')[1:])


-class TestDumpPDF():
+class TestPdf2Txt():
    def test_jo(self):
        run('jo.pdf')

--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -173,36 +173,39 @@ LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')

 def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
-    def extract1(obj):
-        filename = os.path.basename(obj['UF'] or obj['F'])
-        fileref = obj['EF']['F']
+    def extract1(objid, obj):
+        filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
+        fileref = obj['EF'].get('UF') or obj['EF'].get('F')
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
-            raise PDFValueError(
-                'unable to process PDF: reference for %r is not a PDFStream' %
-                (filename))
+            error_msg = 'unable to process PDF: reference for %r is not a ' \
+                        'PDFStream' % filename
+            raise PDFValueError(error_msg)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r '
                'is not an EmbeddedFile' % (filename))
-        path = os.path.join(extractdir, filename)
+        path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path)
+        os.makedirs(os.path.dirname(path), exist_ok=True)
        out = open(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return

-    fp = open(fname, 'rb')
+    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
+        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
-            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
-                extract1(obj)
-    fp.close()
+                if objid not in extracted_objids and isinstance(obj, dict) \
+                        and obj.get('Type') is LITERAL_FILESPEC:
+                    extracted_objids.add(objid)
+                    extract1(objid, obj)
    return