Fall back on the backwards-compatible key (F) for embedded file URLs when the Unicode URL (UF) does not exist (#338)

* Fix getting filename when extracting embedded files

* Add a test for a PDF that contains an embedded PDF, and fix additional errors in looping over multiple xrefs

* Add line to CHANGELOG
pull/348/head^2
Pieter Marsman 2020-01-16 22:11:42 +01:00 committed by GitHub
parent 0b1741b9bf
commit 2f7f5d2667
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 35 additions and 18 deletions

View File

@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352)) - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
## [20200104] - 2019-01-04 ## [20200104] - 2019-01-04

Binary file not shown.

View File

@ -1,4 +1,5 @@
from tempfile import NamedTemporaryFile from shutil import rmtree
from tempfile import NamedTemporaryFile, mkdtemp
from helpers import absolute_sample_path from helpers import absolute_sample_path
from tools import dumppdf from tools import dumppdf
@ -36,3 +37,15 @@ class TestDumpPDF():
def test_6(self): def test_6(self):
run('nonfree/naacl06-shinyama.pdf', '-t -a') run('nonfree/naacl06-shinyama.pdf', '-t -a')
def test_embedded_font_filename(self):
"""If UF font file name does not exist, then F should be used
Related issue: https://github.com/pdfminer/pdfminer.six/issues/152
"""
output_dir = mkdtemp()
try:
run('contrib/issue-00152-embedded-pdf.pdf',
'--extract-embedded %s' % output_dir)
finally:
rmtree(output_dir)

View File

@ -17,7 +17,7 @@ def run(sample_path, options=None):
pdf2txt.main(s.split(' ')[1:]) pdf2txt.main(s.split(' ')[1:])
class TestDumpPDF(): class TestPdf2Txt():
def test_jo(self): def test_jo(self):
run('jo.pdf') run('jo.pdf')

View File

@ -173,36 +173,39 @@ LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
def extractembedded(outfp, fname, objids, pagenos, password='', def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None): dumpall=False, codec=None, extractdir=None):
def extract1(obj): def extract1(objid, obj):
filename = os.path.basename(obj['UF'] or obj['F']) filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
fileref = obj['EF']['F'] fileref = obj['EF'].get('UF') or obj['EF'].get('F')
fileobj = doc.getobj(fileref.objid) fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream): if not isinstance(fileobj, PDFStream):
raise PDFValueError( error_msg = 'unable to process PDF: reference for %r is not a ' \
'unable to process PDF: reference for %r is not a PDFStream' % 'PDFStream' % filename
(filename)) raise PDFValueError(error_msg)
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE: if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
raise PDFValueError( raise PDFValueError(
'unable to process PDF: reference for %r ' 'unable to process PDF: reference for %r '
'is not an EmbeddedFile' % (filename)) 'is not an EmbeddedFile' % (filename))
path = os.path.join(extractdir, filename) path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
if os.path.exists(path): if os.path.exists(path):
raise IOError('file exists: %r' % path) raise IOError('file exists: %r' % path)
print('extracting: %r' % path) print('extracting: %r' % path)
os.makedirs(os.path.dirname(path), exist_ok=True)
out = open(path, 'wb') out = open(path, 'wb')
out.write(fileobj.get_data()) out.write(fileobj.get_data())
out.close() out.close()
return return
fp = open(fname, 'rb') with open(fname, 'rb') as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
extracted_objids = set()
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xref.get_objids(): for objid in xref.get_objids():
obj = doc.getobj(objid) obj = doc.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC: if objid not in extracted_objids and isinstance(obj, dict) \
extract1(obj) and obj.get('Type') is LITERAL_FILESPEC:
fp.close() extracted_objids.add(objid)
extract1(objid, obj)
return return