From 866f2bbb75df159c0f142e9f9082a86f7848cf0b Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 25 Dec 2010 08:41:35 +0000 Subject: [PATCH] webapp fixed git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@283 1aa58f4a-7d42-0410-adbc-911cccaed67c --- tools/Makefile | 2 +- tools/pdf2html.cgi | 59 +++++++++++------------ tools/runapp.py | 113 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 30 deletions(-) create mode 100755 tools/runapp.py diff --git a/tools/Makefile b/tools/Makefile index 1f232e0..ed51702 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -5,4 +5,4 @@ RM=rm -f all: clean: - -$(RM) *.pyc *.pyo + -$(RM) *.pyc *.pyo *.cgic *.cgio diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi index bb21221..54f8feb 100755 --- a/tools/pdf2html.cgi +++ b/tools/pdf2html.cgi @@ -77,21 +77,23 @@ class WebApp(object): TITLE = 'pdf2html demo' MAXFILESIZE = 10000000 # set to zero if unlimited. - MAXPAGES = 10 # set to zero if unlimited. + MAXPAGES = 100 # set to zero if unlimited. 
def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ, codec='utf-8', apppath='/'): self.infp = infp self.outfp = outfp + self.environ = environ self.codec = codec self.apppath = apppath - self.remote_addr = environ.get('REMOTE_ADDR') - self.path_info = environ.get('PATH_INFO') - self.method = environ.get('REQUEST_METHOD', 'GET').upper() - self.server = environ.get('SERVER_SOFTWARE', '') - self.tmpdir = environ.get('TEMP', './var/') + self.remote_addr = self.environ.get('REMOTE_ADDR') + self.path_info = self.environ.get('PATH_INFO') + self.method = self.environ.get('REQUEST_METHOD', 'GET').upper() + self.server = self.environ.get('SERVER_SOFTWARE', '') + self.tmpdir = self.environ.get('TEMP', './var/') self.content_type = 'text/html; charset=%s' % codec self.logger = logging.getLogger() + logging.basicConfig(level=10,stream=sys.stderr) return def put(self, *args): @@ -102,7 +104,7 @@ class WebApp(object): self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace')) return - def http_200(self): + def response_200(self): if self.server.startswith('cgi-httpd'): # required for cgi-httpd self.outfp.write('HTTP/1.0 200 OK\r\n') @@ -110,7 +112,7 @@ class WebApp(object): self.outfp.write('Connection: close\r\n\r\n') return - def http_404(self): + def response_404(self): if self.server.startswith('cgi-httpd'): # required for cgi-httpd self.outfp.write('HTTP/1.0 404 Not Found\r\n') @@ -119,7 +121,7 @@ class WebApp(object): self.outfp.write('page does not exist\n') return - def http_301(self, url): + def response_301(self, url): if self.server.startswith('cgi-httpd'): # required for cgi-httpd self.outfp.write('HTTP/1.0 301 Moved\r\n') @@ -146,53 +148,52 @@ class WebApp(object): return def setup(self): + self.run = self.response_404 + status = 404 if not os.path.isdir(self.tmpdir): self.logger.error('no tmpdir') status = 304 - elif self.path_info != self.apppath: - status = 404 - else: + elif self.path_info == self.apppath: + self.run = self.convert status = 
200 - self._status = status return status - def run(self): - form = cgi.FieldStorage(self.infp) - if self._status != 200: - self.http_404() - return + def convert(self): + self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ) if (self.method != 'POST' or - 'c' not in form or - 'f' not in form): + 'c' not in self.form or + 'f' not in self.form): + self.response_200() self.coverpage() return - item = form['f'] + item = self.form['f'] if not (item.file and item.filename): + self.response_200() self.coverpage() return - cmd = form.getvalue('c') + cmd = self.form.getvalue('c') html = (cmd == 'Convert to HTML') pagenos = [] - if 'p' in form: - for m in re.finditer(r'\d+', form.getvalue('p')): + if 'p' in self.form: + for m in re.finditer(r'\d+', self.form.getvalue('p')): try: pagenos.append(int(m.group(0))) except ValueError: pass - self.logger.info('received: host=%s, name=%r, pagenos=%r' % - (self.remote_addr, item.filename, pagenos)) h = abs(hash((random.random(), self.remote_addr, item.filename))) tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (time.time(), h)) + self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r' % + (self.remote_addr, item.filename, pagenos, tmppath)) try: if not html: self.content_type = 'text/plain; charset=%s' % self.codec - self.http_200() + self.response_200() try: - convert(item.file, sys.stdout, tmppath, pagenos=pagenos, codec=self.codec, + convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) except Exception, e: self.put('

Sorry, an error has occured: %s' % q(repr(e))) - self.logger.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc())) + self.logger.error('convert: %r: %s' % (e, traceback.format_exc())) finally: try: os.remove(tmppath) diff --git a/tools/runapp.py b/tools/runapp.py new file mode 100755 index 0000000..3e187d0 --- /dev/null +++ b/tools/runapp.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python2 +## +## WebApp class runner +## +## usage: +## $ runapp.py pdf2html.cgi +## + +import sys +import urllib +from httplib import responses +from BaseHTTPServer import HTTPServer +from SimpleHTTPServer import SimpleHTTPRequestHandler + +## WebAppHandler +## +class WebAppHandler(SimpleHTTPRequestHandler): + + APP_CLASS = None + + def do_POST(self): + return self.run_cgi() + + def send_head(self): + return self.run_cgi() + + def run_cgi(self): + rest = self.path + i = rest.rfind('?') + if i >= 0: + rest, query = rest[:i], rest[i+1:] + else: + query = '' + i = rest.find('/') + if i >= 0: + script, rest = rest[:i], rest[i:] + else: + script, rest = rest, '' + scriptname = '/' + script + scriptfile = self.translate_path(scriptname) + env = {} + env['SERVER_SOFTWARE'] = self.version_string() + env['SERVER_NAME'] = self.server.server_name + env['GATEWAY_INTERFACE'] = 'CGI/1.1' + env['SERVER_PROTOCOL'] = self.protocol_version + env['SERVER_PORT'] = str(self.server.server_port) + env['REQUEST_METHOD'] = self.command + uqrest = urllib.unquote(rest) + env['PATH_INFO'] = uqrest + env['PATH_TRANSLATED'] = self.translate_path(uqrest) + env['SCRIPT_NAME'] = scriptname + if query: + env['QUERY_STRING'] = query + host = self.address_string() + if host != self.client_address[0]: + env['REMOTE_HOST'] = host + env['REMOTE_ADDR'] = self.client_address[0] + if self.headers.typeheader is None: + env['CONTENT_TYPE'] = self.headers.type + else: + env['CONTENT_TYPE'] = self.headers.typeheader + length = self.headers.getheader('content-length') + if length: + env['CONTENT_LENGTH'] = length 
+ accept = [] + for line in self.headers.getallmatchingheaders('accept'): + if line[:1] in "\t\n\r ": + accept.append(line.strip()) + else: + accept = accept + line[7:].split(',') + env['HTTP_ACCEPT'] = ','.join(accept) + ua = self.headers.getheader('user-agent') + if ua: + env['HTTP_USER_AGENT'] = ua + co = filter(None, self.headers.getheaders('cookie')) + if co: + env['HTTP_COOKIE'] = ', '.join(co) + for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH', + 'HTTP_USER_AGENT', 'HTTP_COOKIE'): + env.setdefault(k, "") + app = self.APP_CLASS(infp=self.rfile, outfp=self.wfile, environ=env) + status = app.setup() + self.send_response(status, responses[status]) + app.run() + return + +# main +def main(argv): + import getopt, imp + def usage(): + print 'usage: %s [-h host] [-p port] [-n name] module.class' % argv[0] + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'h:p:n:') + except getopt.GetoptError: + return usage() + host = '' + port = 8080 + name = 'WebApp' + for (k, v) in opts: + if k == '-h': host = v + elif k == '-p': port = int(v) + elif k == '-n': name = v + if not args: return usage() + path = args.pop(0) + module = imp.load_source('app', path) + WebAppHandler.APP_CLASS = getattr(module, name) + print 'Listening %s:%d...' % (host,port) + httpd = HTTPServer((host,port), WebAppHandler) + httpd.serve_forever() + return + +if __name__ == '__main__': sys.exit(main(sys.argv))