#!/usr/bin/python
#
# pdf2html.cgi - Gateway for converting PDF into HTML.
#
# Security consideration for public access:
#
#   Limit the process size and/or running time.
#   The process should be chrooted.
#   The user should be imposed quota.
#
# Setup:
#   $ mkdir CGIDIR
#   $ mkdir CGIDIR/var
#   $ cp -a pdfminer/pdflib CGIDIR
#   $ cp -a pdfminer/tools CGIDIR
#   $ cp -a pdfminer/CDBCMap CGIDIR
#   $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
#

import sys
# comment out at runtime.
import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback


# quote HTML metacharacters
def q(x):
    """Return x with the HTML metacharacters & > < " replaced by entities."""
    # '&' is replaced first so the entities produced by the later
    # replacements are not escaped a second time.
    return (x.replace('&', '&amp;')
             .replace('>', '&gt;')
             .replace('<', '&lt;')
             .replace('"', '&quot;'))


# encode parameters as a URL
# The hyphen is placed last in the class so it is a literal character;
# written as '.-=' it formed a range that let '/', ':', ';' and '<' through.
Q = re.compile(r'[^a-zA-Z0-9_.=-]')

# NOTE(review): url() referenced `encoder` without this file ever defining
# it; UTF-8 is assumed here -- confirm against the intended charset.
encoder = codecs.getencoder('utf-8')


def url(base, **kw):
    """Append the keyword arguments to base as '&'-joined key=value pairs.

    Each value is HTML-quoted with q(), encoded with `encoder`, and every
    character outside the set accepted by Q is replaced by %XX.  Keys are
    inserted as given, and no separator is added between base and the first
    pair.  The substitution applies a text pattern to the encoder's result,
    so it relies on Python 2 byte strings.
    """
    r = []
    for (k, v) in kw.items():
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)),
                  encoder(q(v), 'replace')[0])
        r.append('%s=%s' % (k, v))
    return base + '&'.join(r)


##  convert(outfp, infp, path, codec='utf-8', maxpages=10, pagenos=None)
##
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than maxfilesize."""
    pass


def convert(outfp, infp, path, codec='utf-8', maxpages=10,
            maxfilesize=5000000, pagenos=None):
    """Copy infp to the file at path, then write its HTML rendering to outfp.

    outfp       -- file-like object handed to HTMLConverter as its output.
    infp        -- readable file-like object holding the input; it is
                   closed before this function returns or raises.
    path        -- where the copy of the input is stored.
    codec       -- passed to HTMLConverter as codec.
    maxpages    -- passed to the pdf2txt convert() call as maxpages.
    maxfilesize -- byte limit for the input; a false value disables it.
    pagenos     -- passed through to the pdf2txt convert() call.

    Raises FileSizeExceeded when more than maxfilesize bytes are read.
    The file at path is not removed here; that is left to the caller.
    """
    # The library function is bound under another name so that it does not
    # shadow this function's own name inside its body.
    from tools.pdf2txt import CMapDB, PDFResourceManager, HTMLConverter
    from tools.pdf2txt import convert as pdf2txt_convert
    # save the input file.
    nbytes = 0
    src = open(path, 'wb')
    try:
        while 1:
            data = infp.read(4096)
            if not data:
                break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    finally:
        # Close both handles on the error path as well, so an oversized
        # upload does not leave the temporary file open.
        src.close()
        infp.close()
    # perform conversion and
    # send the results over the network.
    CMapDB.initialize('.', './CDBCMap')
    rsrc = PDFResourceManager()
    device = HTMLConverter(rsrc, outfp, codec=codec)
    pdf2txt_convert(rsrc, device, path, pagenos, maxpages=maxpages)
    return


##  PDF2HTMLApp
##
class PDF2HTMLApp(object):
    """CGI application: serves an upload form and converts the uploaded PDF."""

    APPURL = '/convert'        # PATH_INFO that triggers a conversion
    TMPDIR = './var/'          # directory for the temporary copy of the upload
    LOGPATH = './var/log'      # default log file
    MAXFILESIZE = 5000000      # upload size limit in bytes
    MAXPAGES = 10              # page limit handed to convert()

    def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG,
                 codec='utf-8'):
        """Set up logging and capture the CGI request.

        outfp    -- file-like object the response is written to.
        logpath  -- log file, opened in append mode.
        loglevel -- level passed to logging.basicConfig.
        codec    -- output encoding; also used in the Content-type charset.
        """
        self.outfp = outfp
        self.codec = codec
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                            level=loglevel, filename=logpath, filemode='a')
        self.remote_addr = os.environ.get('REMOTE_ADDR')
        self.path_info = os.environ.get('PATH_INFO')
        self.method = os.environ.get('REQUEST_METHOD', 'GET')
        self.server = os.environ.get('SERVER_SOFTWARE', '')
        self.content_type = 'text/html; charset=%s' % codec
        self.cur_time = time.time()
        self.form = cgi.FieldStorage()
        return

    def put(self, *args):
        """Write each argument to outfp.

        str arguments are written as-is and unicode arguments are encoded
        with self.codec (unencodable characters become character
        references).  Arguments of any other type are skipped.
        """
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def http_200(self):
        """Write the header of a successful HTML response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def http_404(self):
        """Write a complete "not found" response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        else:
            # Other servers take the status from the CGI "Status" header;
            # without it this page would be delivered as a 200.
            self.outfp.write('Status: 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('page does not exist\n')
        return

    def http_301(self, url):
        """Write a redirect response pointing at url."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def bummer(self, msg):
        """Log msg and write a complete "internal server error" response.

        run() calls this when TMPDIR is missing; the method was not defined
        anywhere in the class, so that path died with AttributeError.
        """
        logging.error('bummer: %s', msg)
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 500 Internal Server Error\r\n')
        else:
            self.outfp.write('Status: 500 Internal Server Error\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('%s\n' % q(msg))
        return

    def coverpage(self):
        """Write the upload form page (body only, no HTTP header)."""
        # NOTE(review): the markup in these literals was missing from the
        # file and has been rebuilt around the surviving text.  The field
        # names "f" and "p" are the ones run() reads; the remaining tags
        # and attributes are a best guess -- confirm against the deployed
        # page.
        self.put(
            '<html><head><title>pdf2html demo</title></head><body>\n',
            '<h1>pdf2html demo</h1><hr>\n',
            '<form method="POST" action="%s" enctype="multipart/form-data">\n'
            % q(self.APPURL),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            '&nbsp; Page numbers (comma-separated): '
            '<input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n'
            % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" name="c" value="Convert to HTML">\n',
            '</form><hr>\n',
            '<address>Powered by PDFMiner</address>\n',
            '</body></html>\n',
            )
        return

    def run(self, argv):
        """Dispatch the request on PATH_INFO and write the response.

        '/' shows the upload form and APPURL converts the uploaded file in
        form field "f" (optionally restricted to the page numbers in field
        "p"); any other path gets a 404.  A request to APPURL without a
        usable upload is redirected to '/'.  argv is accepted but not used.
        """
        if self.path_info == '/':
            self.http_200()
            self.coverpage()
            return
        if self.path_info != self.APPURL:
            self.http_404()
            return
        if not os.path.isdir(self.TMPDIR):
            self.bummer('error')
            return
        if 'f' not in self.form:
            self.http_301('/')
            return
        item = self.form['f']
        if not (item.file and item.filename):
            self.http_301('/')
            return
        # getfirst() always yields a single string, even when the field is
        # submitted more than once (getvalue() would return a list then).
        pagenos = [int(m.group(0))
                   for m in re.finditer(r'\d+', self.form.getfirst('p', ''))]
        logging.info('process: host=%s, name=%r, pagenos=%r',
                     self.remote_addr, item.filename, pagenos)
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # The name is built from the time and a hash only, never from the
        # client-supplied filename.  %x needs an integer, hence int().
        tmppath = os.path.join(self.TMPDIR,
                               '%08x%08x.pdf' % (int(self.cur_time), h))
        # The header has to precede both the converter output and the
        # error message below.
        self.http_200()
        try:
            try:
                convert(self.outfp, item.file, tmppath, pagenos=pagenos,
                        codec=self.codec, maxpages=self.MAXPAGES,
                        maxfilesize=self.MAXFILESIZE)
            except Exception as e:
                # Top-level boundary: report to the client and log the
                # traceback instead of letting the script die.
                self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
                logging.error('error: %r: path=%r: %s',
                              e, tmppath, traceback.format_exc())
        finally:
            # Best-effort cleanup: the file may not exist if the failure
            # happened before it was created.
            try:
                os.remove(tmppath)
            except OSError:
                pass
        return


# main
if __name__ == '__main__':
    sys.exit(PDF2HTMLApp(sys.stdout).run(sys.argv))