182 lines
5.7 KiB
Python
Executable File
182 lines
5.7 KiB
Python
Executable File
#!/usr/bin/python
|
|
#
|
|
# pdf2html.cgi - Gateway for converting PDF into HTML.
|
|
#
|
|
# Security consideration for public access:
|
|
#
|
|
# Limit the process size and/or running time.
|
|
# The process should be chrooted.
|
|
# A disk quota should be imposed on the user running this script.
|
|
#
|
|
# Setup:
|
|
# $ mkdir CGIDIR
|
|
# $ mkdir CGIDIR/var
|
|
# $ cp -a pdfminer/pdflib CGIDIR
|
|
# $ cp -a pdfminer/tools CGIDIR
|
|
# $ cp -a pdfminer/CDBCMap CGIDIR
|
|
# $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
|
|
#
|
|
|
|
import sys
|
|
# comment out the following line for production use.
|
|
import cgitb; cgitb.enable()
|
|
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
|
|
|
|
|
# quote HTML metacharacters
def q(x):
    """Return x with the HTML metacharacters &, >, < and " replaced by entities.

    '&' is handled first so that the ampersands introduced by the other
    replacements are not escaped a second time.  Single quotes are left
    untouched.
    """
    return (x.replace('&', '&amp;')
             .replace('>', '&gt;')
             .replace('<', '&lt;')
             .replace('"', '&quot;'))
|
|
|
|
# encode parameters as a URL

# Encoder used to turn parameter values into UTF-8 octets before escaping.
encoder = codecs.getencoder('utf-8')

# Matches every character that must be percent-escaped.  The hyphen is placed
# last in the class so that it is a literal '-' and not a range operator
# (the former '.-=' matched everything from '.' to '=', including '/' and ':').
Q = re.compile(r'[^a-zA-Z0-9_.=-]')

def url(base, **kw):
    """Return base followed by the keyword arguments encoded as 'k=v&k=v'.

    Each value is HTML-quoted with q(), encoded as UTF-8 and then every
    octet other than letters, digits, '_', '.', '=' and '-' is replaced by
    its %XX escape.  Keys are emitted verbatim, in sorted order so that the
    result is deterministic.  base is used as is; no '?' is inserted.
    """
    # NOTE(review): values are HTML-quoted *before* percent-encoding, so the
    # receiver decodes '<' as the literal text '&lt;' -- confirm this is intended.
    r = []
    for (k, v) in sorted(kw.items()):
        # Decode the UTF-8 octets as latin-1 to get exactly one character per
        # octet; this lets the same text pattern work on Python 2 and 3.
        octets = encoder(q(v), 'replace')[0].decode('latin-1')
        # After substitution only ASCII remains, so str() is lossless.
        v = str(Q.sub(lambda m: '%%%02X' % ord(m.group(0)), octets))
        r.append('%s=%s' % (k, v))
    return base + '&'.join(r)
|
|
|
|
##  convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None)
##
class FileSizeExceeded(ValueError):
    """Raised by convert() when the uploaded data is larger than maxfilesize."""
    pass

def convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None):
    """Save an uploaded PDF to path and write its HTML rendition to outfp.

    outfp       -- output stream handed to HTMLConverter.
    infp        -- readable file-like object holding the upload; it is read
                   in 4096-byte chunks and always closed before returning.
    path        -- file name the upload is saved under.  The file is not
                   removed here, whether or not the conversion succeeds.
    codec       -- output encoding passed to HTMLConverter.
    maxpages    -- page limit passed to the pdf2txt converter.
    maxfilesize -- maximum accepted upload size in bytes; a false value
                   (0 or None) disables the check.
    pagenos     -- page numbers passed to the pdf2txt converter.

    Raises FileSizeExceeded (carrying maxfilesize) as soon as more than
    maxfilesize bytes have been read.
    """
    # save the input file.
    src = open(path, 'wb')
    try:
        nbytes = 0
        while True:
            data = infp.read(4096)
            if not data:
                break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    finally:
        # Close both streams even when the upload is rejected or I/O fails.
        src.close()
        infp.close()
    # Import only after the upload has been accepted, and under an alias so
    # that the imported function does not shadow this one.
    from tools.pdf2txt import CMapDB, PDFResourceManager, HTMLConverter
    from tools.pdf2txt import convert as pdf_convert
    # perform conversion and
    # send the results over the network.
    CMapDB.initialize('.', './CDBCMap')
    rsrc = PDFResourceManager()
    device = HTMLConverter(rsrc, outfp, codec=codec)
    pdf_convert(rsrc, device, path, pagenos, maxpages=maxpages)
    return
|
|
|
|
|
|
## PDF2HTMLApp
|
|
##
|
|
class PDF2HTMLApp(object):
    """CGI application that shows an upload form and converts PDFs to HTML.

    PATH_INFO '/' serves the upload form; PATH_INFO equal to APPURL accepts
    a multipart POST with the file in field 'f' and optional page numbers in
    field 'p', and writes the converted HTML to the output stream.
    """

    APPURL = '/convert'      # PATH_INFO that triggers a conversion
    TMPDIR = './var/'        # directory in which uploads are stored temporarily
    LOGPATH = './var/log'    # default log file
    MAXFILESIZE = 5000000    # upload size limit in bytes
    MAXPAGES = 10            # page limit passed to the converter

    def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
        """Configure logging and capture the CGI request.

        outfp    -- stream the HTTP response is written to.
        logpath  -- file that log records are appended to.
        loglevel -- logging threshold.
        codec    -- encoding announced in Content-type and used for output.
        """
        self.outfp = outfp
        self.codec = codec
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                            level=loglevel, filename=logpath, filemode='a')
        self.remote_addr = os.environ.get('REMOTE_ADDR')
        self.path_info = os.environ.get('PATH_INFO')
        self.method = os.environ.get('REQUEST_METHOD', 'GET')
        self.server = os.environ.get('SERVER_SOFTWARE', '')
        self.content_type = 'text/html; charset=%s' % codec
        self.cur_time = time.time()
        self.form = cgi.FieldStorage()
        return

    def put(self, *args):
        """Write each argument to the output stream.

        Byte strings are written as they are; unicode strings are encoded
        with self.codec (unencodable characters become character
        references).  Arguments of any other type are ignored.
        """
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def http_200(self):
        """Send the response headers of a successful HTML response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def http_404(self):
        """Send a complete 'not found' response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        else:
            # Other servers take the status from the CGI 'Status' header;
            # without it the page would be delivered as a success.
            self.outfp.write('Status: 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def http_301(self, url):
        """Redirect the client to url."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        # NOTE(review): other servers receive only the Location header and
        # presumably choose the redirect status themselves -- confirm.
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def bummer(self, msg):
        """Log msg and send a complete 'internal error' response.

        msg goes to the log only; the client sees a generic page so that
        no server-side details are disclosed.
        """
        logging.error('bummer: %s', msg)
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 500 Internal Server Error\r\n')
        else:
            self.outfp.write('Status: 500 Internal Server Error\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>internal error</body></html>\n')
        return

    def coverpage(self):
        """Write the HTML page containing the upload form."""
        self.put(
            '<html><head><title>pdf2html demo</title></head><body>\n',
            '<h1>pdf2html demo</h1><hr>\n',
            '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            ' Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" value="Convert to HTML"> <input type="reset" value="Reset">\n',
            '</form><hr>\n',
            '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
            '</body></html>\n',
            )
        return

    def run(self, argv):
        """Dispatch the request captured by __init__ and write the response.

        argv is accepted for the command-line entry point but not used.
        Returns None.
        """
        if self.path_info == '/':
            self.http_200()
            self.coverpage()
            return
        if self.path_info != self.APPURL:
            self.http_404()
            return
        if not os.path.isdir(self.TMPDIR):
            self.bummer('temporary directory does not exist: %r' % self.TMPDIR)
            return
        if 'f' not in self.form:
            self.http_301('/')
            return
        item = self.form['f']
        if isinstance(item, list):
            # The field was submitted more than once; use the first upload.
            item = item[0]
        if not (item.file and item.filename):
            self.http_301('/')
            return
        # getfirst() copes with a missing or repeated 'p' field.
        pagenos = [int(m.group(0))
                   for m in re.finditer(r'\d+', self.form.getfirst('p', '') or '')]
        logging.info('process: host=%s, name=%r, pagenos=%r',
                     self.remote_addr, item.filename, pagenos)
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # cur_time is a float; '%x' needs an integer.
        tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (int(self.cur_time), h))
        # The converted document (or the error message below) is the response
        # body, so the headers have to go out first.
        self.http_200()
        try:
            try:
                convert(self.outfp, item.file, tmppath, pagenos=pagenos,
                        codec=self.codec, maxpages=self.MAXPAGES,
                        maxfilesize=self.MAXFILESIZE)
            except Exception:
                # sys.exc_info() keeps this handler valid on old and new Pythons.
                e = sys.exc_info()[1]
                self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
                logging.error('error: %r: path=%r: %s', e, tmppath, traceback.format_exc())
        finally:
            try:
                os.remove(tmppath)
            except OSError:
                # Best effort: the file may never have been created.
                pass
        return
|
|
|
|
|
|
# main
# When executed as a script, serve one request on stdout and use the
# value returned by run() as the process exit status.
if __name__ == '__main__':
    app = PDF2HTMLApp(sys.stdout)
    status = app.run(sys.argv)
    sys.exit(status)
|