webapp fixed
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@283 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
5d98a27d9c
commit
866f2bbb75
|
@ -5,4 +5,4 @@ RM=rm -f
|
||||||
all:
|
all:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-$(RM) *.pyc *.pyo
|
-$(RM) *.pyc *.pyo *.cgic *.cgio
|
||||||
|
|
|
@ -77,21 +77,23 @@ class WebApp(object):
|
||||||
|
|
||||||
TITLE = 'pdf2html demo'
|
TITLE = 'pdf2html demo'
|
||||||
MAXFILESIZE = 10000000 # set to zero if unlimited.
|
MAXFILESIZE = 10000000 # set to zero if unlimited.
|
||||||
MAXPAGES = 10 # set to zero if unlimited.
|
MAXPAGES = 100 # set to zero if unlimited.
|
||||||
|
|
||||||
def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
|
def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
|
||||||
codec='utf-8', apppath='/'):
|
codec='utf-8', apppath='/'):
|
||||||
self.infp = infp
|
self.infp = infp
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
|
self.environ = environ
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.apppath = apppath
|
self.apppath = apppath
|
||||||
self.remote_addr = environ.get('REMOTE_ADDR')
|
self.remote_addr = self.environ.get('REMOTE_ADDR')
|
||||||
self.path_info = environ.get('PATH_INFO')
|
self.path_info = self.environ.get('PATH_INFO')
|
||||||
self.method = environ.get('REQUEST_METHOD', 'GET').upper()
|
self.method = self.environ.get('REQUEST_METHOD', 'GET').upper()
|
||||||
self.server = environ.get('SERVER_SOFTWARE', '')
|
self.server = self.environ.get('SERVER_SOFTWARE', '')
|
||||||
self.tmpdir = environ.get('TEMP', './var/')
|
self.tmpdir = self.environ.get('TEMP', './var/')
|
||||||
self.content_type = 'text/html; charset=%s' % codec
|
self.content_type = 'text/html; charset=%s' % codec
|
||||||
self.logger = logging.getLogger()
|
self.logger = logging.getLogger()
|
||||||
|
logging.basicConfig(level=10,stream=sys.stderr)
|
||||||
return
|
return
|
||||||
|
|
||||||
def put(self, *args):
|
def put(self, *args):
|
||||||
|
@ -102,7 +104,7 @@ class WebApp(object):
|
||||||
self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
|
self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def http_200(self):
|
def response_200(self):
|
||||||
if self.server.startswith('cgi-httpd'):
|
if self.server.startswith('cgi-httpd'):
|
||||||
# required for cgi-httpd
|
# required for cgi-httpd
|
||||||
self.outfp.write('HTTP/1.0 200 OK\r\n')
|
self.outfp.write('HTTP/1.0 200 OK\r\n')
|
||||||
|
@ -110,7 +112,7 @@ class WebApp(object):
|
||||||
self.outfp.write('Connection: close\r\n\r\n')
|
self.outfp.write('Connection: close\r\n\r\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def http_404(self):
|
def response_404(self):
|
||||||
if self.server.startswith('cgi-httpd'):
|
if self.server.startswith('cgi-httpd'):
|
||||||
# required for cgi-httpd
|
# required for cgi-httpd
|
||||||
self.outfp.write('HTTP/1.0 404 Not Found\r\n')
|
self.outfp.write('HTTP/1.0 404 Not Found\r\n')
|
||||||
|
@ -119,7 +121,7 @@ class WebApp(object):
|
||||||
self.outfp.write('<html><body>page does not exist</body></body>\n')
|
self.outfp.write('<html><body>page does not exist</body></body>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def http_301(self, url):
|
def response_301(self, url):
|
||||||
if self.server.startswith('cgi-httpd'):
|
if self.server.startswith('cgi-httpd'):
|
||||||
# required for cgi-httpd
|
# required for cgi-httpd
|
||||||
self.outfp.write('HTTP/1.0 301 Moved\r\n')
|
self.outfp.write('HTTP/1.0 301 Moved\r\n')
|
||||||
|
@ -146,53 +148,52 @@ class WebApp(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
|
self.run = self.response_404
|
||||||
|
status = 404
|
||||||
if not os.path.isdir(self.tmpdir):
|
if not os.path.isdir(self.tmpdir):
|
||||||
self.logger.error('no tmpdir')
|
self.logger.error('no tmpdir')
|
||||||
status = 304
|
status = 304
|
||||||
elif self.path_info != self.apppath:
|
elif self.path_info == self.apppath:
|
||||||
status = 404
|
self.run = self.convert
|
||||||
else:
|
|
||||||
status = 200
|
status = 200
|
||||||
self._status = status
|
|
||||||
return status
|
return status
|
||||||
|
|
||||||
def run(self):
|
def convert(self):
|
||||||
form = cgi.FieldStorage(self.infp)
|
self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
|
||||||
if self._status != 200:
|
|
||||||
self.http_404()
|
|
||||||
return
|
|
||||||
if (self.method != 'POST' or
|
if (self.method != 'POST' or
|
||||||
'c' not in form or
|
'c' not in self.form or
|
||||||
'f' not in form):
|
'f' not in self.form):
|
||||||
|
self.response_200()
|
||||||
self.coverpage()
|
self.coverpage()
|
||||||
return
|
return
|
||||||
item = form['f']
|
item = self.form['f']
|
||||||
if not (item.file and item.filename):
|
if not (item.file and item.filename):
|
||||||
|
self.response_200()
|
||||||
self.coverpage()
|
self.coverpage()
|
||||||
return
|
return
|
||||||
cmd = form.getvalue('c')
|
cmd = self.form.getvalue('c')
|
||||||
html = (cmd == 'Convert to HTML')
|
html = (cmd == 'Convert to HTML')
|
||||||
pagenos = []
|
pagenos = []
|
||||||
if 'p' in form:
|
if 'p' in self.form:
|
||||||
for m in re.finditer(r'\d+', form.getvalue('p')):
|
for m in re.finditer(r'\d+', self.form.getvalue('p')):
|
||||||
try:
|
try:
|
||||||
pagenos.append(int(m.group(0)))
|
pagenos.append(int(m.group(0)))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
self.logger.info('received: host=%s, name=%r, pagenos=%r' %
|
|
||||||
(self.remote_addr, item.filename, pagenos))
|
|
||||||
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
||||||
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (time.time(), h))
|
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (time.time(), h))
|
||||||
|
self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r' %
|
||||||
|
(self.remote_addr, item.filename, pagenos, tmppath))
|
||||||
try:
|
try:
|
||||||
if not html:
|
if not html:
|
||||||
self.content_type = 'text/plain; charset=%s' % self.codec
|
self.content_type = 'text/plain; charset=%s' % self.codec
|
||||||
self.http_200()
|
self.response_200()
|
||||||
try:
|
try:
|
||||||
convert(item.file, sys.stdout, tmppath, pagenos=pagenos, codec=self.codec,
|
convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec,
|
||||||
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
||||||
self.logger.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
|
self.logger.error('convert: %r: path=%r: %s' % (e, traceback.format_exc()))
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
os.remove(tmppath)
|
os.remove(tmppath)
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
##
|
||||||
|
## WebApp class runner
|
||||||
|
##
|
||||||
|
## usage:
|
||||||
|
## $ runapp.py pdf2html.cgi
|
||||||
|
##
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import urllib
|
||||||
|
from httplib import responses
|
||||||
|
from BaseHTTPServer import HTTPServer
|
||||||
|
from SimpleHTTPServer import SimpleHTTPRequestHandler
|
||||||
|
|
||||||
|
## WebAppHandler
|
||||||
|
##
|
||||||
|
class WebAppHandler(SimpleHTTPRequestHandler):
    """Request handler that runs an application class inside the server.

    For each request a CGI-style environment dict is built from the
    request line and headers, APP_CLASS is instantiated with the
    connection's input/output streams and that dict, and the
    application's setup() and run() methods are called in turn.
    """

    # Application class instantiated for every request.  It stays None
    # until main() assigns the class loaded from the user's module.
    APP_CLASS = None

    def do_POST(self):
        """Serve a POST request by delegating to run_cgi()."""
        return self.run_cgi()

    def send_head(self):
        """Override the inherited send_head to delegate to run_cgi()."""
        return self.run_cgi()

    def run_cgi(self):
        """Build the CGI environment and run the application.

        The status code returned by the application's setup() is sent
        as the HTTP status line before the application's run() is
        called.  Returns None.
        """
        # Separate the query string at the last '?', if there is one.
        (head, qmark, tail) = self.path.rpartition('?')
        if qmark:
            (rest, query) = (head, tail)
        else:
            (rest, query) = (self.path, '')

        # Whatever precedes the first '/' is the script name (an empty
        # string when the path starts with '/'); the remainder,
        # including that '/', is used as PATH_INFO.
        (script, slash, remainder) = rest.partition('/')
        rest = slash + remainder
        scriptname = '/' + script
        # The translated script path is computed but not read afterwards.
        scriptfile = self.translate_path(scriptname)

        # Variables that are always present.
        env = {
            'SERVER_SOFTWARE': self.version_string(),
            'SERVER_NAME': self.server.server_name,
            'GATEWAY_INTERFACE': 'CGI/1.1',
            'SERVER_PROTOCOL': self.protocol_version,
            'SERVER_PORT': str(self.server.server_port),
            'REQUEST_METHOD': self.command,
        }
        unquoted = urllib.unquote(rest)
        env['PATH_INFO'] = unquoted
        env['PATH_TRANSLATED'] = self.translate_path(unquoted)
        env['SCRIPT_NAME'] = scriptname
        if query:
            env['QUERY_STRING'] = query

        # REMOTE_HOST is only set when address_string() yields something
        # other than the raw client address.
        host = self.address_string()
        peer_addr = self.client_address[0]
        if host != peer_addr:
            env['REMOTE_HOST'] = host
        env['REMOTE_ADDR'] = peer_addr

        # Prefer the explicit Content-Type header; otherwise use the
        # header object's own type attribute.
        content_type = self.headers.typeheader
        if content_type is None:
            content_type = self.headers.type
        env['CONTENT_TYPE'] = content_type

        content_length = self.headers.getheader('content-length')
        if content_length:
            env['CONTENT_LENGTH'] = content_length

        # Collect every Accept value.  Lines that begin with whitespace
        # are taken whole (stripped); other lines lose their first
        # seven characters (the length of "Accept:") and are split on
        # commas.
        accept = []
        for raw in self.headers.getallmatchingheaders('accept'):
            if raw[:1] in "\t\n\r ":
                accept.append(raw.strip())
            else:
                accept.extend(raw[7:].split(','))
        env['HTTP_ACCEPT'] = ','.join(accept)

        user_agent = self.headers.getheader('user-agent')
        if user_agent:
            env['HTTP_USER_AGENT'] = user_agent

        cookies = [c for c in self.headers.getheaders('cookie') if c]
        if cookies:
            env['HTTP_COOKIE'] = ', '.join(cookies)

        # Optional variables that were not set above default to "".
        for key in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH',
                    'HTTP_USER_AGENT', 'HTTP_COOKIE'):
            if key not in env:
                env[key] = ""

        app = self.APP_CLASS(infp=self.rfile, outfp=self.wfile, environ=env)
        status = app.setup()
        self.send_response(status, responses[status])
        app.run()
|
||||||
|
|
||||||
|
# main
|
||||||
|
def main(argv):
    """Load an application class from a source file and serve it over HTTP.

    argv is the full command line: the program name, then the options
    -h host, -p port and -n name, then the path of the Python source
    file to load.  Returns 100 after printing the usage line when the
    options cannot be parsed or no path is given; otherwise serves
    requests until serve_forever() stops, and returns None.
    """
    import getopt
    import imp

    def usage():
        print('usage: %s [-h host] [-p port] [-n name] module.class' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'h:p:n:')
    except getopt.GetoptError:
        return usage()

    # Defaults, keyed by the option flag that overrides each one.
    settings = {'-h': '', '-p': 8080, '-n': 'WebApp'}
    for (flag, value) in opts:
        if flag == '-p':
            value = int(value)
        settings[flag] = value
    host = settings['-h']
    port = settings['-p']
    name = settings['-n']

    if not args:
        return usage()
    path = args[0]

    # Load the file as a module named 'app' and install the class called
    # `name` as the handler's application class.
    module = imp.load_source('app', path)
    WebAppHandler.APP_CLASS = getattr(module, name)

    print('Listening %s:%d...' % (host, port))
    httpd = HTTPServer((host, port), WebAppHandler)
    httpd.serve_forever()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue