Fixed some style issues.

pull/1/head
Yusuke Shinyama 2013-10-19 08:41:01 +09:00
parent 28cb424f8f
commit 32844507ea
1 changed file with 38 additions and 42 deletions

View File

@@ -6,11 +6,11 @@
# options: # options:
# -i objid : object id # -i objid : object id
# #
import sys, os, re import sys, os.path, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@@ -108,7 +108,7 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='', def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
@@ -155,48 +155,46 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
return return
# extractembedded # extractembedded
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
def extractembedded(outfp, fname, objids, pagenos, password='', def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
doc = PDFDocument() def extract1(obj):
fp = file(fname, 'rb') filename = os.path.basename(obj['UF'] or obj['F'])
parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password)
cwd = os.path.normpath(os.getcwd()) + '/'
for xref in doc.xrefs:
for objid in xref.get_objids():
obj = doc.getobj(objid)
if isinstance(obj, dict):
objtype = obj.get('Type', '')
if isinstance(objtype, PSLiteral) and objtype.name == 'Filespec':
filename = obj['UF'] or obj['F']
fileref = obj['EF']['F'] fileref = obj['EF']['F']
fileobj = doc.getobj(fileref.objid) fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream): if not isinstance(fileobj, PDFStream):
raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename)) raise PDFValueError(
if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile': 'unable to process PDF: reference for %r is not a PDFStream' %
raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename)) (filename))
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
raise PDFValueError(
'unable to process PDF: reference for %r is not an EmbeddedFile' %
(filename))
path = os.path.join(extractdir, filename)
if os.path.exists(path):
raise IOError('file exists: %r' % path)
print >>sys.stderr, 'extracting: %r' % path
out = file(path, 'wb')
out.write(fileobj.get_data())
out.close()
return
print "extracting", filename fp = file(fname, 'rb')
absfilename = os.path.normpath(os.path.abspath(filename)) parser = PDFParser(fp)
if not absfilename.startswith(cwd): doc = PDFDocument(parser)
raise Exception("filename %s is trying to escape to parent directories.." % (filename)) doc.initialize(password)
dirname = os.path.dirname(absfilename) for xref in doc.xrefs:
if not os.path.isdir(dirname): for objid in xref.get_objids():
os.makedirs(dirname) obj = doc.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
# don't overwrite anything extract1(obj)
fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL) return
f = os.fdopen(fd, 'wb')
f.write(fileobj.get_data())
f.close()
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
@@ -246,6 +244,7 @@ def main(argv):
extractdir = None extractdir = None
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-o': outfp = file(v, 'wb')
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
@@ -254,19 +253,16 @@ def main(argv):
elif k == '-b': codec = 'binary' elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text' elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline elif k == '-T': proc = dumpoutline
elif k == '-E': extractdir = v elif k == '-E':
elif k == '-o': outfp = file(v, 'wb') extractdir = v
proc = extractembedded
# #
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
# #
if extractdir:
proc = extractembedded
os.chdir(extractdir)
#
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec) dumpall=dumpall, codec=codec, extractdir=extractdir)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))