Fixed some style issues.

pull/1/head
Yusuke Shinyama 2013-10-19 08:41:01 +09:00
parent 28cb424f8f
commit 32844507ea
1 changed file with 38 additions and 42 deletions

View File

@ -6,11 +6,11 @@
# options: # options:
# -i objid : object id # -i objid : object id
# #
import sys, os, re import sys, os.path, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -108,7 +108,7 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='', def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
@ -155,48 +155,46 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
return return
# extractembedded # extractembedded
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
def extractembedded(outfp, fname, objids, pagenos, password='', def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
doc = PDFDocument() def extract1(obj):
filename = os.path.basename(obj['UF'] or obj['F'])
fileref = obj['EF']['F']
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
raise PDFValueError(
'unable to process PDF: reference for %r is not a PDFStream' %
(filename))
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
raise PDFValueError(
'unable to process PDF: reference for %r is not an EmbeddedFile' %
(filename))
path = os.path.join(extractdir, filename)
if os.path.exists(path):
raise IOError('file exists: %r' % path)
print >>sys.stderr, 'extracting: %r' % path
out = file(path, 'wb')
out.write(fileobj.get_data())
out.close()
return
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
parser.set_document(doc) doc = PDFDocument(parser)
doc.set_parser(parser)
doc.initialize(password) doc.initialize(password)
cwd = os.path.normpath(os.getcwd()) + '/'
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xref.get_objids(): for objid in xref.get_objids():
obj = doc.getobj(objid) obj = doc.getobj(objid)
if isinstance(obj, dict): if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
objtype = obj.get('Type', '') extract1(obj)
if isinstance(objtype, PSLiteral) and objtype.name == 'Filespec': return
filename = obj['UF'] or obj['F']
fileref = obj['EF']['F']
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))
print "extracting", filename
absfilename = os.path.normpath(os.path.abspath(filename))
if not absfilename.startswith(cwd):
raise Exception("filename %s is trying to escape to parent directories.." % (filename))
dirname = os.path.dirname(absfilename)
if not os.path.isdir(dirname):
os.makedirs(dirname)
# don't overwrite anything
fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
f = os.fdopen(fd, 'wb')
f.write(fileobj.get_data())
f.close()
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
@ -246,6 +244,7 @@ def main(argv):
extractdir = None extractdir = None
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-o': outfp = file(v, 'wb')
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
@ -254,19 +253,16 @@ def main(argv):
elif k == '-b': codec = 'binary' elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text' elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline elif k == '-T': proc = dumpoutline
elif k == '-E': extractdir = v elif k == '-E':
elif k == '-o': outfp = file(v, 'wb') extractdir = v
proc = extractembedded
# #
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
# #
if extractdir:
proc = extractembedded
os.chdir(extractdir)
#
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec) dumpall=dumpall, codec=codec, extractdir=extractdir)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))