webapp fixed

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@83 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-04-02 14:24:57 +00:00
parent 548cdf9443
commit 162c5f0bfa
1 changed files with 24 additions and 12 deletions

View File

@ -12,8 +12,6 @@
# $ mkdir CGIDIR # $ mkdir CGIDIR
# $ mkdir CGIDIR/var # $ mkdir CGIDIR/var
# $ cp -a pdfminer/pdflib CGIDIR # $ cp -a pdfminer/pdflib CGIDIR
# $ cp -a pdfminer/tools CGIDIR
# $ cp -a pdfminer/CDBCMap CGIDIR
# $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi # $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
# #
@ -21,6 +19,7 @@ import sys
# comment out at runtime. # comment out at runtime.
import cgitb; cgitb.enable() import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback import os, os.path, re, cgi, time, random, codecs, logging, traceback
import pdflib.pdf2txt
# quote HTML metacharacters # quote HTML metacharacters
@ -36,11 +35,11 @@ def url(base, **kw):
r.append('%s=%s' % (k, v)) r.append('%s=%s' % (k, v))
return base+'&'.join(r) return base+'&'.join(r)
## convert(outfp, infp, path, codec='utf-8', maxpages=10, pagenos=None) ## convert
## ##
class FileSizeExceeded(ValueError): pass class FileSizeExceeded(ValueError): pass
def convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None): def convert(outfp, infp, path, codec='utf-8', maxpages=10,
from tools.pdf2txt import CMapDB, PDFResourceManager, HTMLConverter, convert maxfilesize=5000000, pagenos=None, html=True):
# save the input file. # save the input file.
src = file(path, 'wb') src = file(path, 'wb')
nbytes = 0 nbytes = 0
@ -55,10 +54,13 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000,
infp.close() infp.close()
# perform conversion and # perform conversion and
# send the results over the network. # send the results over the network.
CMapDB.initialize('.', './CDBCMap') pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap')
rsrc = PDFResourceManager() rsrc = pdflib.pdf2txt.PDFResourceManager()
device = HTMLConverter(rsrc, outfp, codec=codec) if html:
convert(rsrc, device, path, pagenos, maxpages=maxpages) device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec)
else:
device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec)
pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages)
return return
@ -127,7 +129,9 @@ class PDF2HTMLApp(object):
'&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n', '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
'<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES, '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE, 'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
'<p><input type="submit" value="Convert to HTML"> <input type="reset" value="Reset">\n', '<p><input type="submit" name="c" value="Convert to HTML">\n',
'<input type="submit" name="c" value="Convert to TEXT">\n',
'<input type="reset" value="Reset">\n',
'</form><hr>\n', '</form><hr>\n',
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n', '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
'</body></html>\n', '</body></html>\n',
@ -148,10 +152,15 @@ class PDF2HTMLApp(object):
if 'f' not in self.form: if 'f' not in self.form:
self.http_301('/') self.http_301('/')
return return
if 'c' not in self.form:
self.http_301('/')
return
item = self.form['f'] item = self.form['f']
if not (item.file and item.filename): if not (item.file and item.filename):
self.http_301('/') self.http_301('/')
return return
cmd = self.form.getvalue('c')
html = (cmd == 'Convert to HTML')
pagenos = [] pagenos = []
if 'p' in self.form: if 'p' in self.form:
for m in re.finditer(r'\d+', self.form.getvalue('p')): for m in re.finditer(r'\d+', self.form.getvalue('p')):
@ -164,8 +173,11 @@ class PDF2HTMLApp(object):
tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h)) tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
try: try:
try: try:
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, if not html:
codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE) self.content_type = 'text/plain; charset=%s' % self.codec
self.http_200()
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
except Exception, e: except Exception, e:
self.put('<p>Sorry, an error has occured: %s' % q(repr(e))) self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc())) logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))