From 3e5ab3e01b1a4a65e002f9c0fc96c0a2a3b1460d Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
Date: Sat, 6 Sep 2008 04:51:01 +0000
Subject: [PATCH] pdf2html webapp added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 Makefile           |   2 +-
 TODO               |   3 +-
 tools/pdf2html.cgi | 181 +++++++++++++++++++++++++++++++++++++++++++++
 tools/pdf2txt.py   |   4 +-
 4 files changed, 185 insertions(+), 5 deletions(-)
 create mode 100755 tools/pdf2html.cgi

diff --git a/Makefile b/Makefile
index 7afc258..5a0c224 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Makefile for pdfminer
 
 PACKAGE=pdfminer
-VERSION=20080830
+VERSION=20080906
 GNUTAR=tar
 SVN=svn
 PYTHON=python
diff --git a/TODO b/TODO
index f32ff7a..35585e9 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,5 @@
 TODOs:
- - API Documentation.
- - Sample webapp for pdf->html.
+ - Better API Documentation.
  - Error handling for invalid type.
  - Infer text stream by clustering.
 
diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi
new file mode 100755
index 0000000..828ef36
--- /dev/null
+++ b/tools/pdf2html.cgi
@@ -0,0 +1,181 @@
+#!/usr/bin/python
+#
+# pdf2html.cgi - Gateway for converting PDF into HTML.
+#
+# Security consideration for public access:
+#
+#   Limit the process size and/or running time.
+#   The process should be chrooted.
+#   The user should be imposed quota.
+#
+# Setup:
+#   $ mkdir CGIDIR
+#   $ mkdir CGIDIR/var
+#   $ cp -a pdfminer/pdflib CGIDIR
+#   $ cp -a pdfminer/tools CGIDIR
+#   $ cp -a pdfminer/CDBCMap CGIDIR
+#   $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
+#
+
+import sys
+# comment out at runtime.
+import cgitb; cgitb.enable()
+import os, os.path, re, cgi, time, random, codecs, logging, traceback
+
+
+# quote HTML metacharacters
+def q(x):
+  return x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
+
+# encode parameters as a URL
+Q = re.compile(r'[^a-zA-Z0-9_.-=]')
+def url(base, **kw):
+  r = []
+  for (k,v) in kw.iteritems():
+    v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
+    r.append('%s=%s' % (k, v))
+  return base+'&'.join(r)
+
+## convert(outfp, infp, path, codec='utf-8', maxpages=10, pagenos=None)
+##
+class FileSizeExceeded(ValueError): pass
+def convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None):
+  from tools.pdf2txt import CMapDB, PDFResourceManager, HTMLConverter, convert
+  # save the input file.
+  src = file(path, 'wb')
+  nbytes = 0
+  while 1:
+    data = infp.read(4096)
+    nbytes += len(data)
+    if maxfilesize and maxfilesize < nbytes:
+      raise FileSizeExceeded(maxfilesize)
+    if not data: break
+    src.write(data)
+  src.close()
+  infp.close()
+  # perform conversion and
+  # send the results over the network.
+  CMapDB.initialize('.', './CDBCMap')
+  rsrc = PDFResourceManager()
+  device = HTMLConverter(rsrc, outfp, codec=codec)
+  convert(rsrc, device, path, pagenos, maxpages=maxpages)
+  return
+
+
+## PDF2HTMLApp
+##
+class PDF2HTMLApp(object):
+
+  APPURL = '/convert'
+  TMPDIR = './var/'
+  LOGPATH = './var/log'
+  MAXFILESIZE = 5000000
+  MAXPAGES = 10
+
+  def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
+    self.outfp = outfp
+    self.codec = codec
+    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
+                        level=loglevel, filename=logpath, filemode='a')
+    self.remote_addr = os.environ.get('REMOTE_ADDR')
+    self.path_info = os.environ.get('PATH_INFO')
+    self.method = os.environ.get('REQUEST_METHOD', 'GET')
+    self.server = os.environ.get('SERVER_SOFTWARE', '')
+    self.content_type = 'text/html; charset=%s' % codec
+    self.cur_time = time.time()
+    self.form = cgi.FieldStorage()
+    return
+
+  def put(self, *args):
+    for x in args:
+      if isinstance(x, str):
+        self.outfp.write(x)
+      elif isinstance(x, unicode):
+        self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
+    return
+
+  def http_200(self):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 200 OK\r\n')
+    self.outfp.write('Content-type: %s\r\n' % self.content_type)
+    self.outfp.write('Connection: close\r\n\r\n')
+    return
+
+  def http_404(self):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 404 Not Found\r\n')
+    self.outfp.write('Content-type: text/html\r\n')
+    self.outfp.write('Connection: close\r\n\r\n')
+    self.outfp.write('<html><body>page does not exist</body></html>\n')
+    return
+
+  def http_301(self, url):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 301 Moved\r\n')
+    self.outfp.write('Location: %s\r\n\r\n' % url)
+    return
+
+  def coverpage(self):
+    self.put(
+      '<html><head><title>pdf2html demo</title></head><body>\n',
+      '<h1>pdf2html demo</h1><hr>\n',
+      '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
+      '<p>Upload PDF File: <input name="f" type="file" value="">\n',
+      '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
+      '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
+      'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
+      '<p><input type="submit" name="c" value="Convert to HTML"> <input type="reset" value="Reset">\n',
+      '</form><hr>\n',
+      '<p>Powered by PDFMiner\n',
+      '</body></html>\n',
+      )
+    return
+
+  def run(self, argv):
+    if self.path_info == '/':
+      self.http_200()
+      self.coverpage()
+      return
+    if self.path_info != self.APPURL:
+      self.http_404()
+      return
+    if not os.path.isdir(self.TMPDIR):
+      self.bummer('error')
+      return
+    if 'f' not in self.form:
+      self.http_301('/')
+      return
+    item = self.form['f']
+    if not (item.file and item.filename):
+      self.http_301('/')
+      return
+    pagenos = []
+    if 'p' in self.form:
+      for m in re.finditer(r'\d+', self.form.getvalue('p')):
+        try:
+          pagenos.append(int(m.group(0)))
+        except ValueError:
+          pass
+    logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
+    h = abs(hash((random.random(), self.remote_addr, item.filename)))
+    tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
+    try:
+      try:
+        convert(sys.stdout, item.file, tmppath, pagenos=pagenos,
+                codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE)
+      except Exception, e:
+        self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
+        logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
+    finally:
+      try:
+        os.remove(tmppath)
+      except:
+        pass
+    return
+
+
+# main
+if __name__ == '__main__': sys.exit(PDF2HTMLApp(sys.stdout).run(sys.argv))
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 12975b1..b6170c2 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -161,9 +161,9 @@ def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
   try:
     doc.initialize(password)
   except PDFPasswordIncorrect:
-    raise TextExtractionNotAllowed('incorrect password')
+    raise TextExtractionNotAllowed('Incorrect password')
   if not doc.is_extractable:
-    raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
+    raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
   interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
   for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
     if pagenos and (pageno not in pagenos): continue