dumppdf: support for extracting embedded files using the -E option

pull/1/head
eug 2013-01-20 13:23:58 +10:00
parent c7709045e9
commit 925845b172
2 changed files with 52 additions and 3 deletions

View File

@ -352,6 +352,9 @@ no stream header is displayed for the ease of saving it to a file.
<dt> <code>-T</code> <dt> <code>-T</code>
<dd> Shows the table of contents. <dd> Shows the table of contents.
<p> <p>
<dt> <code>-E <em>directory</em></code>
<dd> Extracts embedded files from the PDF into the given directory.
<p>
<dt> <code>-P <em>password</em></code> <dt> <code>-P <em>password</em></code>
<dd> Provides the user password to access PDF contents. <dd> Provides the user password to access PDF contents.
<p> <p>

View File

@ -6,7 +6,7 @@
# options: # options:
# -i objid : object id # -i objid : object id
# #
import sys, re import sys, os, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
@ -152,6 +152,46 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
fp.close() fp.close()
return return
# extractembedded
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None):
    """Extract the embedded files of the PDF *fname* into the current directory.

    Every object listed in the document's xrefs is inspected; each
    dictionary whose Type is /Filespec and which carries an embedded file
    stream (EF -> F) is written to disk under the name given by its UF
    entry, falling back to F.

    outfp, objids, pagenos, dumpall and codec are not used; they are
    accepted so that the signature matches the other proc functions
    called from main().

    Raises Exception when a file specification has no usable name, when
    its stream is not a PDFStream of Type /EmbeddedFile, or when the name
    would resolve outside the current working directory.  Raises OSError
    when the target file already exists (nothing is ever overwritten).
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # os.path.join(x, '') appends exactly one platform separator and
        # leaves a root directory untouched; a hard-coded '/' would turn
        # '/' into '//' and never match on platforms that use another
        # separator.
        basedir = os.path.join(os.path.normpath(os.getcwd()), '')
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, dict):
                    continue
                objtype = obj.get('Type', '')
                if not (isinstance(objtype, PSLiteral) and
                        objtype.name == 'Filespec'):
                    continue
                # A file specification without EF/F only points at an
                # external file, so there is nothing to extract.
                embedded = resolve1(obj.get('EF'))
                if not isinstance(embedded, dict) or 'F' not in embedded:
                    continue
                # UF is optional: fall back to F without a KeyError.
                filename = resolve1(obj.get('UF')) or resolve1(obj.get('F'))
                if not filename:
                    raise Exception("unable to process PDF: file specification %r has no file name" % (objid,))
                fileref = embedded['F']
                fileobj = doc.getobj(fileref.objid)
                if not isinstance(fileobj, PDFStream):
                    raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
                if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
                    raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))
                # Same output as the print statement, but also valid
                # syntax for a print function.
                print("extracting %s" % (filename,))
                absfilename = os.path.normpath(os.path.abspath(filename))
                if not absfilename.startswith(basedir):
                    raise Exception("filename %s is trying to escape to parent directories.." % (filename))
                dirname = os.path.dirname(absfilename)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)
                # don't overwrite anything: O_EXCL makes os.open fail if
                # the target already exists.  O_BINARY exists only on
                # Windows; request it there so the descriptor is never
                # opened in text mode.
                flags = (os.O_WRONLY | os.O_CREAT | os.O_EXCL |
                         getattr(os, 'O_BINARY', 0))
                fd = os.open(absfilename, flags)
                f = os.fdopen(fd, 'wb')
                try:
                    f.write(fileobj.get_data())
                finally:
                    f.close()
    finally:
        # Release the input file even when extraction fails part-way.
        fp.close()
    return
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None):
@ -188,10 +228,10 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0] print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:') (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -203,6 +243,7 @@ def main(argv):
dumpall = False dumpall = False
proc = dumppdf proc = dumppdf
outfp = sys.stdout outfp = sys.stdout
extractdir = None
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
@ -213,11 +254,16 @@ def main(argv):
elif k == '-b': codec = 'binary' elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text' elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline elif k == '-T': proc = dumpoutline
elif k == '-E': extractdir = v
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
# #
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
# #
if extractdir:
proc = extractembedded
os.chdir(extractdir)
#
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec) dumpall=dumpall, codec=codec)