dumppdf: support for extracting embedded files using the -E option
parent
c7709045e9
commit
925845b172
|
@ -352,6 +352,9 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<dt> <code>-T</code>
|
<dt> <code>-T</code>
|
||||||
<dd> Shows the table of contents.
|
<dd> Shows the table of contents.
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-E <em>directory</em></code>
|
||||||
|
<dd> Extracts embedded files from the PDF into the given directory.
|
||||||
|
<p>
|
||||||
<dt> <code>-P <em>password</em></code>
|
<dt> <code>-P <em>password</em></code>
|
||||||
<dd> Provides the user password to access PDF contents.
|
<dd> Provides the user password to access PDF contents.
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
# options:
|
# options:
|
||||||
# -i objid : object id
|
# -i objid : object id
|
||||||
#
|
#
|
||||||
import sys, re
|
import sys, os, re
|
||||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
||||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||||
|
@ -152,6 +152,46 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# extractembedded
|
||||||
|
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None):
    """Extract the embedded files of a PDF below the current directory.

    Every object listed in the document's xrefs is inspected; each
    dictionary whose Type is 'Filespec' and which carries an embedded
    file stream (EF / F) is written to disk under the name given by its
    'UF' entry, falling back to 'F'.

    Files are written relative to the current working directory; the
    caller is expected to chdir into the target directory first.
    Existing files are never overwritten.

    Parameters:
      fname    -- path of the PDF file to read.
      password -- password passed to PDFDocument.initialize().
      outfp, objids, pagenos, dumpall, codec -- unused; accepted so the
        function has the same signature as dumppdf and dumpoutline and
        can be called the same way.

    Raises:
      Exception -- if a file specification has no usable name, if its
        target is not an EmbeddedFile stream, or if the name would
        resolve to a location outside the current directory.
      OSError   -- if the output file already exists or cannot be
        created.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)

        # Prefix every output path must start with.  The separator is
        # appended only when missing so that a root working directory
        # does not turn into '//'.
        base = os.path.normpath(os.getcwd())
        if not base.endswith(os.sep):
            base += os.sep

        # O_EXCL: don't overwrite anything.  O_BINARY only exists on
        # platforms that distinguish text and binary descriptors.
        flags = (os.O_WRONLY | os.O_CREAT | os.O_EXCL |
                 getattr(os, 'O_BINARY', 0))

        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, dict):
                    continue
                objtype = obj.get('Type', '')
                if not (isinstance(objtype, PSLiteral) and
                        objtype.name == 'Filespec'):
                    continue

                # A file specification without an EF / F entry refers to
                # an external file; there is nothing embedded to extract.
                embedded = resolve1(obj.get('EF'))
                if not isinstance(embedded, dict) or 'F' not in embedded:
                    continue

                # 'UF' is optional, so look both keys up without raising.
                filename = obj.get('UF') or obj.get('F')
                if not filename:
                    raise Exception("unable to process PDF: file specification %r has no file name" % (objid,))

                fileref = embedded['F']
                fileobj = doc.getobj(fileref.objid)
                if not isinstance(fileobj, PDFStream):
                    raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
                # The stream's Type entry is optional; reject the stream
                # only when a Type is present and is something else.
                streamtype = fileobj.get('Type')
                if streamtype is not None and not (
                        isinstance(streamtype, PSLiteral) and
                        streamtype.name == 'EmbeddedFile'):
                    raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))

                print("extracting %s" % (filename,))
                # The name comes from the PDF itself, so refuse anything
                # that resolves outside the current directory.
                absfilename = os.path.normpath(os.path.abspath(filename))
                if not absfilename.startswith(base):
                    raise Exception("filename %s is trying to escape to parent directories.." % (filename))

                dirname = os.path.dirname(absfilename)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)

                fd = os.open(absfilename, flags)
                out = os.fdopen(fd, 'wb')
                try:
                    out.write(fileobj.get_data())
                finally:
                    out.close()
    finally:
        # Stream data is read from fp, so it is closed only after all
        # embedded files have been written.
        fp.close()
    return
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None):
|
dumpall=False, codec=None):
|
||||||
|
@ -188,10 +228,10 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
|
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
|
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -203,6 +243,7 @@ def main(argv):
|
||||||
dumpall = False
|
dumpall = False
|
||||||
proc = dumppdf
|
proc = dumppdf
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
|
extractdir = None
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||||
|
@ -213,11 +254,16 @@ def main(argv):
|
||||||
elif k == '-b': codec = 'binary'
|
elif k == '-b': codec = 'binary'
|
||||||
elif k == '-t': codec = 'text'
|
elif k == '-t': codec = 'text'
|
||||||
elif k == '-T': proc = dumpoutline
|
elif k == '-T': proc = dumpoutline
|
||||||
|
elif k == '-E': extractdir = v
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
#
|
#
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
#
|
#
|
||||||
|
if extractdir:
|
||||||
|
proc = extractembedded
|
||||||
|
os.chdir(extractdir)
|
||||||
|
#
|
||||||
for fname in args:
|
for fname in args:
|
||||||
proc(outfp, fname, objids, pagenos, password=password,
|
proc(outfp, fname, objids, pagenos, password=password,
|
||||||
dumpall=dumpall, codec=codec)
|
dumpall=dumpall, codec=codec)
|
||||||
|
|
Loading…
Reference in New Issue