Fixed some style issues.

pull/1/head
Yusuke Shinyama 2013-10-19 08:41:01 +09:00
parent 28cb424f8f
commit 32844507ea
1 changed file with 38 additions and 42 deletions

View File

@@ -6,11 +6,11 @@
# options:
# -i objid : object id
#
import sys, os, re
from pdfminer.psparser import PSKeyword, PSLiteral
import sys, os.path, re
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.pdfpage import PDFPage
@@ -108,7 +108,7 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None):
dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
@@ -155,48 +155,46 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
return
# extractembedded
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None):
doc = PDFDocument()
dumpall=False, codec=None, extractdir=None):
def extract1(obj):
filename = os.path.basename(obj['UF'] or obj['F'])
fileref = obj['EF']['F']
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
raise PDFValueError(
'unable to process PDF: reference for %r is not a PDFStream' %
(filename))
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
raise PDFValueError(
'unable to process PDF: reference for %r is not an EmbeddedFile' %
(filename))
path = os.path.join(extractdir, filename)
if os.path.exists(path):
raise IOError('file exists: %r' % path)
print >>sys.stderr, 'extracting: %r' % path
out = file(path, 'wb')
out.write(fileobj.get_data())
out.close()
return
fp = file(fname, 'rb')
parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc = PDFDocument(parser)
doc.initialize(password)
cwd = os.path.normpath(os.getcwd()) + '/'
for xref in doc.xrefs:
for objid in xref.get_objids():
obj = doc.getobj(objid)
if isinstance(obj, dict):
objtype = obj.get('Type', '')
if isinstance(objtype, PSLiteral) and objtype.name == 'Filespec':
filename = obj['UF'] or obj['F']
fileref = obj['EF']['F']
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))
print "extracting", filename
absfilename = os.path.normpath(os.path.abspath(filename))
if not absfilename.startswith(cwd):
raise Exception("filename %s is trying to escape to parent directories.." % (filename))
dirname = os.path.dirname(absfilename)
if not os.path.isdir(dirname):
os.makedirs(dirname)
# don't overwrite anything
fd = os.open(absfilename, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
f = os.fdopen(fd, 'wb')
f.write(fileobj.get_data())
f.close()
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
extract1(obj)
return
# dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None):
dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
@@ -246,6 +244,7 @@ def main(argv):
extractdir = None
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-o': outfp = file(v, 'wb')
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v
@@ -254,19 +253,16 @@ def main(argv):
elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline
elif k == '-E': extractdir = v
elif k == '-o': outfp = file(v, 'wb')
elif k == '-E':
extractdir = v
proc = extractembedded
#
PDFDocument.debug = debug
PDFParser.debug = debug
#
if extractdir:
proc = extractembedded
os.chdir(extractdir)
#
for fname in args:
proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec)
dumpall=dumpall, codec=codec, extractdir=extractdir)
return
if __name__ == '__main__': sys.exit(main(sys.argv))