Fall back on the backwards-compatible key (F) for embedded file URLs when the Unicode URL (UF) does not exist (#338)
* Fix getting the filename when extracting embedded files * Add test for a PDF that contains an embedded PDF, and fix additional errors in looping over multiple xrefs * Add line to CHANGELOG
pull/348/head^2
parent
0b1741b9bf
commit
2f7f5d2667
|
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
### Fixed
|
||||
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
||||
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
||||
|
||||
## [20200104] - 2020-01-04
|
||||
|
||||
|
|
Binary file not shown.
|
@ -1,4 +1,5 @@
|
|||
from tempfile import NamedTemporaryFile
|
||||
from shutil import rmtree
|
||||
from tempfile import NamedTemporaryFile, mkdtemp
|
||||
|
||||
from helpers import absolute_sample_path
|
||||
from tools import dumppdf
|
||||
|
@ -36,3 +37,15 @@ class TestDumpPDF():
|
|||
|
||||
def test_6(self):
|
||||
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
||||
|
||||
def test_embedded_font_filename(self):
|
||||
"""If UF font file name does not exist, then F should be used
|
||||
|
||||
Related issue: https://github.com/pdfminer/pdfminer.six/issues/152
|
||||
"""
|
||||
output_dir = mkdtemp()
|
||||
try:
|
||||
run('contrib/issue-00152-embedded-pdf.pdf',
|
||||
'--extract-embedded %s' % output_dir)
|
||||
finally:
|
||||
rmtree(output_dir)
|
||||
|
|
|
@ -17,7 +17,7 @@ def run(sample_path, options=None):
|
|||
pdf2txt.main(s.split(' ')[1:])
|
||||
|
||||
|
||||
class TestDumpPDF():
|
||||
class TestPdf2Txt():
|
||||
def test_jo(self):
|
||||
run('jo.pdf')
|
||||
|
||||
|
|
|
@ -173,36 +173,39 @@ LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
|||
|
||||
def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||
dumpall=False, codec=None, extractdir=None):
|
||||
def extract1(obj):
|
||||
filename = os.path.basename(obj['UF'] or obj['F'])
|
||||
fileref = obj['EF']['F']
|
||||
def extract1(objid, obj):
|
||||
filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
|
||||
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
|
||||
fileobj = doc.getobj(fileref.objid)
|
||||
if not isinstance(fileobj, PDFStream):
|
||||
raise PDFValueError(
|
||||
'unable to process PDF: reference for %r is not a PDFStream' %
|
||||
(filename))
|
||||
error_msg = 'unable to process PDF: reference for %r is not a ' \
|
||||
'PDFStream' % filename
|
||||
raise PDFValueError(error_msg)
|
||||
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
|
||||
raise PDFValueError(
|
||||
'unable to process PDF: reference for %r '
|
||||
'is not an EmbeddedFile' % (filename))
|
||||
path = os.path.join(extractdir, filename)
|
||||
path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
|
||||
if os.path.exists(path):
|
||||
raise IOError('file exists: %r' % path)
|
||||
print('extracting: %r' % path)
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
out = open(path, 'wb')
|
||||
out.write(fileobj.get_data())
|
||||
out.close()
|
||||
return
|
||||
|
||||
fp = open(fname, 'rb')
|
||||
with open(fname, 'rb') as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser, password)
|
||||
extracted_objids = set()
|
||||
for xref in doc.xrefs:
|
||||
for objid in xref.get_objids():
|
||||
obj = doc.getobj(objid)
|
||||
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
|
||||
extract1(obj)
|
||||
fp.close()
|
||||
if objid not in extracted_objids and isinstance(obj, dict) \
|
||||
and obj.get('Type') is LITERAL_FILESPEC:
|
||||
extracted_objids.add(objid)
|
||||
extract1(objid, obj)
|
||||
return
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue