pdfminer.six/tools/pdf2html.cgi

#!/usr/bin/python2 -O
#
# pdf2html.cgi - Gateway script for converting PDF into HTML.
#
# Security considerations for public access:
#
#   Limit the process size and/or maximum CPU time.
#   The process should be chrooted.
#   A quota should be imposed on the user.
#
# How to Setup:
#   $ mkdir $CGIDIR
#   $ mkdir $CGIDIR/var
#   $ python setup.py install_lib --install-dir=$CGIDIR
#   $ cp pdfminer/tools/pdf2html.cgi $CGIDIR
#

import sys, os, os.path, re, time
import cgi, logging, traceback, random
# comment this out at runtime.
#import cgitb; cgitb.enable()
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams


# quote HTML metacharacters
def q(x):
    """Return x with the HTML metacharacters &, >, < and " replaced by entities."""
    # '&' has to be handled first so that the entities inserted by the
    # later replacements are not themselves re-escaped.
    for (char, entity) in (('&', '&amp;'),
                           ('>', '&gt;'),
                           ('<', '&lt;'),
                           ('"', '&quot;')):
        x = x.replace(char, entity)
    return x

# encode parameters as a URL
# Matches every character that must be percent-encoded.  The '-' is placed
# last in the class so that it is a literal hyphen and not a range operator.
Q = re.compile(r'[^a-zA-Z0-9_.=-]')
def url(base, **kw):
    """Return base followed by the keyword arguments encoded as URL parameters.

    Each value is converted to bytes (text is encoded as UTF-8) and every
    byte outside [a-zA-Z0-9_.=-] is written as %XX.  The pairs are joined
    with '&' and appended to base as-is, so base is expected to already end
    with '?' (or '&').  Keys are inserted unmodified.

    The encoded values contain no HTML metacharacters, so they are not
    HTML-quoted here; quoting before percent-encoding would double-encode.
    """
    def escape(value):
        if not isinstance(value, bytes):
            value = value.encode('utf-8', 'replace')
        chars = []
        # bytearray yields integer byte values on both Python 2.6+ and 3.
        for c in bytearray(value):
            ch = chr(c)
            if Q.match(ch):
                ch = '%%%02X' % c
            chars.append(ch)
        return ''.join(chars)
    r = []
    for (k, v) in kw.items():
        r.append('%s=%s' % (k, escape(v)))
    return base + '&'.join(r)


##  convert
##
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than maxfilesize bytes."""
    pass

def convert(outfp, infp, path, codec='utf-8',
            maxpages=0, maxfilesize=0, pagenos=None,
            html=True):
    """Save an uploaded PDF to path, then convert it and write the result to outfp.

    Parameters:
      outfp       -- file-like object that receives the converted output.
      infp        -- file-like object the PDF data is read from; it is always
                     closed before this function returns or raises.
      path        -- filesystem path where the PDF data is stored.  The file
                     is not removed here; that is left to the caller.
      codec       -- output encoding handed to the converter.
      maxpages    -- passed through to process_pdf().
      maxfilesize -- maximum number of input bytes; zero means unlimited.
      pagenos     -- passed through to process_pdf().
      html        -- use HTMLConverter when true, TextConverter otherwise.

    Raises FileSizeExceeded when more than maxfilesize bytes are read from
    infp.  Exceptions raised during the conversion itself are propagated.
    """
    # save the input file.
    try:
        src = open(path, 'wb')
        try:
            nbytes = 0
            while 1:
                data = infp.read(4096)
                if not data: break
                nbytes += len(data)
                if maxfilesize and maxfilesize < nbytes:
                    raise FileSizeExceeded(maxfilesize)
                src.write(data)
        finally:
            # close the file even when the size limit is hit or a read fails.
            src.close()
    finally:
        infp.close()
    # perform conversion and
    # send the results over the network.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    else:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    # NOTE(review): the converter device is never closed here; confirm whether
    # HTMLConverter/TextConverter need close() to emit trailing output.
    fp = open(path, 'rb')
    try:
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages)
    finally:
        fp.close()
    return


##  WebApp
##
class WebApp(object):
    """CGI application that converts an uploaded PDF file into HTML or text.

    The request is described by the CGI environment variables read in
    __init__() and by the form data parsed from infp; the response is
    written to outfp.  run() is the entry point.
    """

    TITLE = 'pdf2html demo'
    APPPATH = '/'        # absolute URL path to this application. 
    MAXFILESIZE = 5000000              # set to zero if unlimited.
    MAXPAGES = 10                      # set to zero if unlimited.

    def __init__(self, infp=sys.stdin, outfp=sys.stdout, codec='utf-8'):
        """Read the CGI environment and parse the submitted form from infp.

        infp  -- file object the request body is read from.
        outfp -- file object the response is written to.
        codec -- encoding used for the response and given to the converter.
        """
        self.outfp = outfp
        self.codec = codec
        self.remote_addr = os.environ.get('REMOTE_ADDR')
        self.path_info = os.environ.get('PATH_INFO')
        self.method = os.environ.get('REQUEST_METHOD', 'GET').upper()
        self.server = os.environ.get('SERVER_SOFTWARE', '')
        self.logpath = os.environ.get('LOG_PATH', './var/log')
        self.tmpdir = os.environ.get('TEMP', './var/')
        self.content_type = 'text/html; charset=%s' % codec
        self.cur_time = time.time()
        self.form = cgi.FieldStorage(infp)
        return

    def put(self, *args):
        """Write each argument to the output; unicode strings are encoded
        with self.codec, other non-str arguments are silently skipped."""
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def _status(self, status):
        """Announce a non-200 response status such as '404 Not Found'."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 %s\r\n' % status)
        else:
            # other servers take the status from the CGI "Status" header.
            self.outfp.write('Status: %s\r\n' % status)
        return

    def http_200(self):
        """Write the headers of a successful response using self.content_type."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def http_404(self):
        """Write a complete "page does not exist" response."""
        self._status('404 Not Found')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def http_301(self, url):
        """Write a redirect response pointing at url."""
        self._status('301 Moved')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def bummer(self, msg):
        """Write a complete error response whose body is the HTML-quoted msg."""
        self._status('500 Internal Server Error')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.put('<html><body>%s</body></html>\n' % q(msg))
        return

    def coverpage(self):
        """Write the body of the upload form page (headers are not included)."""
        self.put(
          '<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
          '<h1>%s</h1><hr>\n' % q(self.TITLE),
          '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPPATH),
          '<p>Upload PDF File: <input name="f" type="file" value="">\n',
          '&nbsp; Page numbers (comma-separated):\n',
          '<input name="p" type="text" size="10" value="">\n',
          '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
          'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
          '<p><input type="submit" name="c" value="Convert to HTML">\n',
          '<input type="submit" name="c" value="Convert to TEXT">\n',
          '<input type="reset" value="Reset">\n',
          '</form><hr>\n',
          '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
          '</body></html>\n',
          )
        return

    def run(self, argv):
        """Handle one request and write the response to self.outfp.

        argv is accepted for the command-line entry point but is not used.
        Anything other than a POST carrying both the 'c' (command) and 'f'
        (file) fields gets the upload form.  Otherwise the uploaded file is
        stored under self.tmpdir, converted, and removed again.
        """
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s',
                            filename=self.logpath, filemode='a')
        if self.path_info != self.APPPATH:
            self.http_404()
            return
        if not os.path.isdir(self.tmpdir):
            logging.error('no tmpdir')
            self.bummer('error')
            return
        if (self.method != 'POST' or
            'c' not in self.form or
            'f' not in self.form):
            # the form page needs response headers too.
            self.http_200()
            self.coverpage()
            return
        item = self.form['f']
        if isinstance(item, list):
            # the field was submitted more than once; use the first one.
            item = item[0]
        if not (item.file and item.filename):
            self.http_200()
            self.coverpage()
            return
        cmd = self.form.getfirst('c')
        html = (cmd == 'Convert to HTML')
        # NOTE(review): the numbers are handed to convert() unchanged; confirm
        # whether process_pdf() expects 0-based or 1-based page numbers.
        pagenos = [ int(m.group(0)) for m in
                    re.finditer(r'\d+', self.form.getfirst('p', '')) ]
        logging.info('received: host=%s, name=%r, pagenos=%r',
                     self.remote_addr, item.filename, pagenos)
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # cur_time is a float; '%x' needs an integer.
        tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (int(self.cur_time), h))
        try:
            if not html:
                self.content_type = 'text/plain; charset=%s' % self.codec
            self.http_200()
            try:
                # write to the same stream the headers were written to.
                convert(self.outfp, item.file, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception:
                # sys.exc_info() avoids version-specific "except ..., e" syntax.
                e = sys.exc_info()[1]
                self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
                logging.error('convert: %r: path=%r: %s', e, tmppath, traceback.format_exc())
        finally:
            try:
                os.remove(tmppath)
            except OSError:
                # the file may never have been created.
                pass
        return


# main
if __name__ == '__main__':
    # run the web application and use its return value as the exit status.
    sys.exit(WebApp().run(sys.argv))
stay with python2 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@264 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-10-19 09:57:01 +00:00			`#!/usr/bin/python2 -O`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`#`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`# pdf2html.cgi - Gateway script for converting PDF into HTML.`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`#`
			`# Security consideration for public access:`
			`#`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`# Limit the process size and/or maximum cpu time.`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`# The process should be chrooted.`
			`# The user should be imposed quota.`
			`#`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`# How to Setup:`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`# $ mkdir $CGIDIR`
			`# $ mkdir $CGIDIR/var`
			`# $ python setup.py install_lib --install-dir=$CGIDIR`
			`# $ cp pdfminer/tools/pdf2html.cgi $CGIDIR`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`#`

pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`import sys, os, os.path, re, time`
			`import cgi, logging, traceback, random`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`# comment out at this at runtime.`
			`#import cgitb; cgitb.enable()`
			`import pdfminer`
layout analysis improved. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-07-21 07:55:19 +00:00			`from pdfminer.pdfinterp import PDFResourceManager, process_pdf`
			`from pdfminer.converter import HTMLConverter, TextConverter`
			`from pdfminer.layout import LAParams`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00

			`# quote HTML metacharacters`
			`def q(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
			`# encode parameters as a URL`
			`Q = re.compile(r'[^a-zA-Z0-9_.-=]')`
			`def url(base, **kw):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`r = []`
			`for (k,v) in kw.iteritems():`
			`v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])`
			`r.append('%s=%s' % (k, v))`
			`return base+'&'.join(r)`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
layout analysis improved. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-07-21 07:55:19 +00:00
webapp fixed git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@83 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-04-02 14:24:57 +00:00			`## convert`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`##`
			`class FileSizeExceeded(ValueError): pass`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`def convert(outfp, infp, path, codec='utf-8',`
			`maxpages=0, maxfilesize=0, pagenos=None,`
			`html=True):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`# save the input file.`
			`src = file(path, 'wb')`
			`nbytes = 0`
			`while 1:`
			`data = infp.read(4096)`
			`nbytes += len(data)`
			`if maxfilesize and maxfilesize < nbytes:`
			`raise FileSizeExceeded(maxfilesize)`
			`if not data: break`
			`src.write(data)`
			`src.close()`
			`infp.close()`
			`# perform conversion and`
			`# send the results over the network.`
trivial change git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@211 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-04-24 13:31:03 +00:00			`rsrcmgr = PDFResourceManager()`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`laparams = LAParams()`
			`if html:`
trivial change git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@211 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-04-24 13:31:03 +00:00			`device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`else:`
trivial change git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@211 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-04-24 13:31:03 +00:00			`device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`fp = file(path, 'rb')`
trivial change git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@211 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-04-24 13:31:03 +00:00			`process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`fp.close()`
			`return`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00

pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`## WebApp`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`##`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`class WebApp(object):`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`TITLE = 'pdf2html demo'`
			`APPPATH = '/' # absolute URL path to this application.`
			`MAXFILESIZE = 5000000 # set to zero if unlimited.`
			`MAXPAGES = 10 # set to zero if unlimited.`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`def __init__(self, infp=sys.stdin, outfp=sys.stdout, codec='utf-8'):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.outfp = outfp`
			`self.codec = codec`
			`self.remote_addr = os.environ.get('REMOTE_ADDR')`
			`self.path_info = os.environ.get('PATH_INFO')`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`self.method = os.environ.get('REQUEST_METHOD', 'GET').upper()`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.server = os.environ.get('SERVER_SOFTWARE', '')`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`self.logpath = os.environ.get('LOG_PATH', './var/log')`
			`self.tmpdir = os.environ.get('TEMP', './var/')`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.content_type = 'text/html; charset=%s' % codec`
			`self.cur_time = time.time()`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`self.form = cgi.FieldStorage(infp)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def put(self, *args):`
			`for x in args:`
			`if isinstance(x, str):`
			`self.outfp.write(x)`
			`elif isinstance(x, unicode):`
			`self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))`
			`return`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def http_200(self):`
			`if self.server.startswith('cgi-httpd'):`
			`# required for cgi-httpd`
			`self.outfp.write('HTTP/1.0 200 OK\r\n')`
			`self.outfp.write('Content-type: %s\r\n' % self.content_type)`
			`self.outfp.write('Connection: close\r\n\r\n')`
			`return`

			`def http_404(self):`
			`if self.server.startswith('cgi-httpd'):`
			`# required for cgi-httpd`
			`self.outfp.write('HTTP/1.0 404 Not Found\r\n')`
			`self.outfp.write('Content-type: text/html\r\n')`
			`self.outfp.write('Connection: close\r\n\r\n')`
			`self.outfp.write('<html><body>page does not exist</body></body>\n')`
			`return`

			`def http_301(self, url):`
			`if self.server.startswith('cgi-httpd'):`
			`# required for cgi-httpd`
			`self.outfp.write('HTTP/1.0 301 Moved\r\n')`
			`self.outfp.write('Location: %s\r\n\r\n' % url)`
			`return`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def coverpage(self):`
			`self.put(`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`'<html><head><title>%s</title></head><body>\n' % q(self.TITLE),`
			`'<h1>%s</h1><hr>\n' % q(self.TITLE),`
			`'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPPATH),`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'<p>Upload PDF File: <input name="f" type="file" value="">\n',`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`'  Page numbers (comma-separated):\n',`
			`'<input name="p" type="text" size="10" value="">\n',`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,`
			`'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,`
			`'<p><input type="submit" name="c" value="Convert to HTML">\n',`
			`'<input type="submit" name="c" value="Convert to TEXT">\n',`
			`'<input type="reset" value="Reset">\n',`
			`'</form><hr>\n',`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'</body></html>\n',`
			`)`
			`return`

			`def run(self, argv):`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`logging.basicConfig(level=logging.INFO,`
			`format='%(asctime)s %(levelname)s %(message)s',`
			`filename=self.logpath, filemode='a')`
			`if self.path_info != self.APPPATH:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.http_404()`
			`return`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`if not os.path.isdir(self.tmpdir):`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`logging.error('no tmpdir')`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.bummer('error')`
			`return`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`if (self.method != 'POST' or`
			`'c' not in self.form or`
			`'f' not in self.form):`
			`self.coverpage()`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`item = self.form['f']`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`if not (item.file and item.filename):`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`self.coverpage()`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`cmd = self.form.getvalue('c')`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`html = (cmd == 'Convert to HTML')`
			`pagenos = []`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`if 'p' in self.form:`
			`for m in re.finditer(r'\d+', self.form.getvalue('p')):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`try:`
			`pagenos.append(int(m.group(0)))`
			`except ValueError:`
			`pass`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`logging.info('received: host=%s, name=%r, pagenos=%r' %`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`(self.remote_addr, item.filename, pagenos))`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`h = abs(hash((random.random(), self.remote_addr, item.filename)))`
pdf2html.cgi git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@169 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 14:15:25 +00:00			`tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00			`try:`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`if not html:`
			`self.content_type = 'text/plain; charset=%s' % self.codec`
			`self.http_200()`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`try:`
			`convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,`
			`maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)`
			`except Exception, e:`
			`self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))`
trivial grammar errors git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-10 07:18:05 +00:00			`logging.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`finally:`
			`try:`
			`os.remove(tmppath)`
			`except:`
			`pass`
			`return`
pdf2html webapp added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-09-06 04:51:01 +00:00

			`# main`
pdf2html.cgi code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@218 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-05-29 11:51:15 +00:00			`if __name__ == '__main__': sys.exit(WebApp().run(sys.argv))`