Fall back on the backwards-compatible key (F) for embedded file URLs when the Unicode URL (UF) does not exist (#338)
* Fix getting filename when extracting embedded files
* Add test for PDF that contains an embedded PDF, and fix additional errors in looping over multiple xrefs
* Add line to CHANGELOG

pull/348/head^2
parent
0b1741b9bf
commit
2f7f5d2667
|
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
||||||
|
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
||||||
|
|
||||||
## [20200104] - 2019-01-04
|
## [20200104] - 2019-01-04
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@ -1,4 +1,5 @@
|
||||||
from tempfile import NamedTemporaryFile
|
from shutil import rmtree
|
||||||
|
from tempfile import NamedTemporaryFile, mkdtemp
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from tools import dumppdf
|
from tools import dumppdf
|
||||||
|
@ -36,3 +37,15 @@ class TestDumpPDF():
|
||||||
|
|
||||||
def test_6(self):
|
def test_6(self):
|
||||||
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
||||||
|
|
||||||
|
def test_embedded_font_filename(self):
|
||||||
|
"""If UF font file name does not exist, then F should be used
|
||||||
|
|
||||||
|
Related issue: https://github.com/pdfminer/pdfminer.six/issues/152
|
||||||
|
"""
|
||||||
|
output_dir = mkdtemp()
|
||||||
|
try:
|
||||||
|
run('contrib/issue-00152-embedded-pdf.pdf',
|
||||||
|
'--extract-embedded %s' % output_dir)
|
||||||
|
finally:
|
||||||
|
rmtree(output_dir)
|
||||||
|
|
|
@ -17,7 +17,7 @@ def run(sample_path, options=None):
|
||||||
pdf2txt.main(s.split(' ')[1:])
|
pdf2txt.main(s.split(' ')[1:])
|
||||||
|
|
||||||
|
|
||||||
class TestDumpPDF():
|
class TestPdf2Txt():
|
||||||
def test_jo(self):
|
def test_jo(self):
|
||||||
run('jo.pdf')
|
run('jo.pdf')
|
||||||
|
|
||||||
|
|
|
@ -173,36 +173,39 @@ LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
||||||
|
|
||||||
def extractembedded(outfp, fname, objids, pagenos, password='',
|
def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, extractdir=None):
|
dumpall=False, codec=None, extractdir=None):
|
||||||
def extract1(obj):
|
def extract1(objid, obj):
|
||||||
filename = os.path.basename(obj['UF'] or obj['F'])
|
filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
|
||||||
fileref = obj['EF']['F']
|
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
|
||||||
fileobj = doc.getobj(fileref.objid)
|
fileobj = doc.getobj(fileref.objid)
|
||||||
if not isinstance(fileobj, PDFStream):
|
if not isinstance(fileobj, PDFStream):
|
||||||
raise PDFValueError(
|
error_msg = 'unable to process PDF: reference for %r is not a ' \
|
||||||
'unable to process PDF: reference for %r is not a PDFStream' %
|
'PDFStream' % filename
|
||||||
(filename))
|
raise PDFValueError(error_msg)
|
||||||
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
|
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
|
||||||
raise PDFValueError(
|
raise PDFValueError(
|
||||||
'unable to process PDF: reference for %r '
|
'unable to process PDF: reference for %r '
|
||||||
'is not an EmbeddedFile' % (filename))
|
'is not an EmbeddedFile' % (filename))
|
||||||
path = os.path.join(extractdir, filename)
|
path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
raise IOError('file exists: %r' % path)
|
raise IOError('file exists: %r' % path)
|
||||||
print('extracting: %r' % path)
|
print('extracting: %r' % path)
|
||||||
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||||
out = open(path, 'wb')
|
out = open(path, 'wb')
|
||||||
out.write(fileobj.get_data())
|
out.write(fileobj.get_data())
|
||||||
out.close()
|
out.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
fp = open(fname, 'rb')
|
with open(fname, 'rb') as fp:
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
for xref in doc.xrefs:
|
extracted_objids = set()
|
||||||
for objid in xref.get_objids():
|
for xref in doc.xrefs:
|
||||||
obj = doc.getobj(objid)
|
for objid in xref.get_objids():
|
||||||
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
|
obj = doc.getobj(objid)
|
||||||
extract1(obj)
|
if objid not in extracted_objids and isinstance(obj, dict) \
|
||||||
fp.close()
|
and obj.get('Type') is LITERAL_FILESPEC:
|
||||||
|
extracted_objids.add(objid)
|
||||||
|
extract1(objid, obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue