pdf2html.cgi code cleanup

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-05-29 11:51:15 +00:00
parent fe3bdbfce0
commit f9c9357547
1 changed files with 31 additions and 41 deletions

View File

@ -15,10 +15,10 @@
# $ cp pdfminer/tools/pdf2html.cgi $CGIDIR # $ cp pdfminer/tools/pdf2html.cgi $CGIDIR
# #
import sys import sys, os, os.path, re, time
import cgi, logging, traceback, random
# comment out at this at runtime. # comment out at this at runtime.
#import cgitb; cgitb.enable() #import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback
import pdfminer import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.converter import HTMLConverter, TextConverter
@ -71,26 +71,27 @@ def convert(outfp, infp, path, codec='utf-8',
return return
## PDF2HTMLApp ## WebApp
## ##
class PDF2HTMLApp(object): class WebApp(object):
APPURL = '/convert' TITLE = 'pdf2html demo'
MAXFILESIZE = 5000000 APPPATH = '/' # absolute URL path to this application.
MAXPAGES = 10 MAXFILESIZE = 5000000 # set to zero if unlimited.
MAXPAGES = 10 # set to zero if unlimited.
def __init__(self, outfp=sys.stdout, codec='utf-8'): def __init__(self, infp=sys.stdin, outfp=sys.stdout, codec='utf-8'):
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.remote_addr = os.environ.get('REMOTE_ADDR') self.remote_addr = os.environ.get('REMOTE_ADDR')
self.path_info = os.environ.get('PATH_INFO') self.path_info = os.environ.get('PATH_INFO')
self.method = os.environ.get('REQUEST_METHOD', 'GET') self.method = os.environ.get('REQUEST_METHOD', 'GET').upper()
self.server = os.environ.get('SERVER_SOFTWARE', '') self.server = os.environ.get('SERVER_SOFTWARE', '')
self.logpath = os.environ.get('LOG_PATH', './var/log') self.logpath = os.environ.get('LOG_PATH', './var/log')
self.tmpdir = os.environ.get('TEMP', './var/') self.tmpdir = os.environ.get('TEMP', './var/')
self.debug = os.environ.get('DEBUG')
self.content_type = 'text/html; charset=%s' % codec self.content_type = 'text/html; charset=%s' % codec
self.cur_time = time.time() self.cur_time = time.time()
self.form = cgi.FieldStorage(infp)
return return
def put(self, *args): def put(self, *args):
@ -127,9 +128,9 @@ class PDF2HTMLApp(object):
def coverpage(self): def coverpage(self):
self.put( self.put(
'<html><head><title>pdf2html demo</title></head><body>\n', '<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
'<h1>pdf2html demo</h1><hr>\n', '<h1>%s</h1><hr>\n' % q(self.TITLE),
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL), '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPPATH),
'<p>Upload PDF File: <input name="f" type="file" value="">\n', '<p>Upload PDF File: <input name="f" type="file" value="">\n',
'&nbsp; Page numbers (comma-separated):\n', '&nbsp; Page numbers (comma-separated):\n',
'<input name="p" type="text" size="10" value="">\n', '<input name="p" type="text" size="10" value="">\n',
@ -145,41 +146,30 @@ class PDF2HTMLApp(object):
return return
def run(self, argv): def run(self, argv):
if self.debug: logging.basicConfig(level=logging.INFO,
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
else:
logging.basicConfig(level=logging.ERROR,
format='%(asctime)s %(levelname)s %(message)s', format='%(asctime)s %(levelname)s %(message)s',
filename=self.logpath, filemode='a') filename=self.logpath, filemode='a')
if self.path_info == '/': if self.path_info != self.APPPATH:
self.http_200()
self.coverpage()
return
if self.path_info != self.APPURL:
logging.error('invalid path: %r' % self.path_info)
self.http_404() self.http_404()
return return
if not os.path.isdir(self.tmpdir): if not os.path.isdir(self.tmpdir):
logging.error('no tmpdir') logging.error('no tmpdir')
self.bummer('error') self.bummer('error')
return return
form = cgi.FieldStorage() if (self.method != 'POST' or
if 'f' not in form: 'c' not in self.form or
self.http_301('/') 'f' not in self.form):
self.coverpage()
return return
if 'c' not in form: item = self.form['f']
self.http_301('/')
return
item = form['f']
if not (item.file and item.filename): if not (item.file and item.filename):
self.http_301('/') self.coverpage()
return return
cmd = form.getvalue('c') cmd = self.form.getvalue('c')
html = (cmd == 'Convert to HTML') html = (cmd == 'Convert to HTML')
pagenos = [] pagenos = []
if 'p' in form: if 'p' in self.form:
for m in re.finditer(r'\d+', form.getvalue('p')): for m in re.finditer(r'\d+', self.form.getvalue('p')):
try: try:
pagenos.append(int(m.group(0))) pagenos.append(int(m.group(0)))
except ValueError: except ValueError:
@ -188,11 +178,11 @@ class PDF2HTMLApp(object):
(self.remote_addr, item.filename, pagenos)) (self.remote_addr, item.filename, pagenos))
h = abs(hash((random.random(), self.remote_addr, item.filename))) h = abs(hash((random.random(), self.remote_addr, item.filename)))
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h)) tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))
try:
try: try:
if not html: if not html:
self.content_type = 'text/plain; charset=%s' % self.codec self.content_type = 'text/plain; charset=%s' % self.codec
self.http_200() self.http_200()
try:
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec, convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
except Exception, e: except Exception, e:
@ -207,4 +197,4 @@ class PDF2HTMLApp(object):
# main # main
if __name__ == '__main__': sys.exit(PDF2HTMLApp().run(sys.argv)) if __name__ == '__main__': sys.exit(WebApp().run(sys.argv))