From 3e5ab3e01b1a4a65e002f9c0fc96c0a2a3b1460d Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 6 Sep 2008 04:51:01 +0000
Subject: [PATCH] pdf2html webapp added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@52 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 Makefile           |   2 +-
 TODO               |   3 +-
 tools/pdf2html.cgi | 181 +++++++++++++++++++++++++++++++++++++++++++++
 tools/pdf2txt.py   |   4 +-
 4 files changed, 185 insertions(+), 5 deletions(-)
 create mode 100755 tools/pdf2html.cgi

diff --git a/Makefile b/Makefile
index 7afc258..5a0c224 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Makefile for pdfminer
 
 PACKAGE=pdfminer
-VERSION=20080830
+VERSION=20080906
 GNUTAR=tar
 SVN=svn
 PYTHON=python
diff --git a/TODO b/TODO
index f32ff7a..35585e9 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,5 @@
 TODOs:
-  - API Documentation.
-  - Sample webapp for pdf->html.
+  - Better API Documentation.
   - Error handling for invalid type.
   - Infer text stream by clustering.
 
diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi
new file mode 100755
index 0000000..828ef36
--- /dev/null
+++ b/tools/pdf2html.cgi
@@ -0,0 +1,181 @@
+#!/usr/bin/python
+#
+# pdf2html.cgi - Gateway for converting PDF into HTML.
+#
+# Security considerations for public access:
+#
+#   Limit the process size and/or running time.
+#   The process should be chrooted.
+#   A disk quota should be imposed on the user.
+#
+# Setup:
+#   $ mkdir CGIDIR
+#   $ mkdir CGIDIR/var
+#   $ cp -a pdfminer/pdflib CGIDIR
+#   $ cp -a pdfminer/tools CGIDIR
+#   $ cp -a pdfminer/CDBCMap CGIDIR
+#   $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi 
+#
+
+import sys
+# cgitb prints tracebacks to the client; comment out the next line in production.
+import cgitb; cgitb.enable()
+import os, os.path, re, cgi, time, random, codecs, logging, traceback
+
+
+# quote HTML metacharacters: & (first, so entities are not re-escaped), >, < and "
+def q(x):
+  return x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
+
+# encode keyword parameters as a URL query string appended to base.
+# NB: '-' must stay last in the class; '.-=' was a range that let '/', ':', ';', '<' through.
+Q = re.compile(r'[^a-zA-Z0-9_.=-]')
+def url(base, **kw):
+  encoder = codecs.getencoder('utf-8')  # was an undefined global (NameError).
+  def quote(v):
+    return Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
+  return base+'&'.join( '%s=%s' % (k, quote(v)) for (k,v) in kw.iteritems() )
+
+##  convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None)
+##
+class FileSizeExceeded(ValueError): pass
+def convert(outfp, infp, path, codec='utf-8', maxpages=10, maxfilesize=5000000, pagenos=None):
+  from tools.pdf2txt import CMapDB, PDFResourceManager, HTMLConverter, convert  # local 'convert' is pdf2txt's.
+  # save the input file to path; both files are closed even if the upload is rejected.
+  src = file(path, 'wb')
+  try:
+    nbytes = 0
+    while 1:
+      data = infp.read(4096)
+      if not data: break
+      nbytes += len(data)
+      if maxfilesize and maxfilesize < nbytes:
+        raise FileSizeExceeded(maxfilesize)
+      src.write(data)
+  finally:
+    src.close()
+    infp.close()
+  # perform conversion and send the results over the network (outfp).
+  CMapDB.initialize('.', './CDBCMap')
+  rsrc = PDFResourceManager()
+  device = HTMLConverter(rsrc, outfp, codec=codec)
+  convert(rsrc, device, path, pagenos, maxpages=maxpages)
+
+
+##  PDF2HTMLApp
+##
+class PDF2HTMLApp(object):
+  # Handles one CGI request: '/' serves the upload form and APPURL
+  # converts the uploaded PDF into HTML.
+  APPURL = '/convert'
+  TMPDIR = './var/'
+  LOGPATH = './var/log'
+  MAXFILESIZE = 5000000
+  MAXPAGES = 10
+  
+  def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
+    # outfp: stream the response is written to; codec: charset of the HTML output.
+    self.outfp = outfp
+    self.codec = codec
+    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', 
+                        level=loglevel, filename=logpath, filemode='a')
+    self.remote_addr = os.environ.get('REMOTE_ADDR')
+    self.path_info = os.environ.get('PATH_INFO')
+    self.method = os.environ.get('REQUEST_METHOD', 'GET')
+    self.server = os.environ.get('SERVER_SOFTWARE', '')
+    self.content_type = 'text/html; charset=%s' % codec
+    self.cur_time = time.time()
+    self.form = cgi.FieldStorage()
+
+  def put(self, *args):
+    # write str as is and unicode encoded with self.codec; other types are ignored.
+    for x in args:
+      if isinstance(x, str):
+        self.outfp.write(x)
+      elif isinstance(x, unicode):
+        self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
+
+  def http_200(self):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 200 OK\r\n')
+    self.outfp.write('Content-type: %s\r\n' % self.content_type)
+    self.outfp.write('Connection: close\r\n\r\n')
+  
+  def http_404(self):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 404 Not Found\r\n')
+    self.outfp.write('Content-type: text/html\r\n')
+    self.outfp.write('Connection: close\r\n\r\n')
+    self.outfp.write('<html><body>page does not exist</body></html>\n')
+  
+  def http_301(self, url):
+    if self.server.startswith('cgi-httpd'):
+      # required for cgi-httpd
+      self.outfp.write('HTTP/1.0 301 Moved\r\n')
+    self.outfp.write('Location: %s\r\n\r\n' % url)
+
+  def bummer(self, msg):
+    # report a server-side problem to the log and to the client.
+    logging.error('bummer: %s' % msg)
+    self.http_200()
+    self.put('<html><body><p>Sorry, an error has occurred: %s</body></html>\n' % q(msg))
+
+  def coverpage(self):
+    self.put(
+      '<html><head><title>pdf2html demo</title></head><body>\n',
+      '<h1>pdf2html demo</h1><hr>\n',
+      '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
+      '<p>Upload PDF File: <input name="f" type="file" value="">\n',
+      '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
+      '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
+      'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
+      '<p><input type="submit" value="Convert to HTML"> <input type="reset" value="Reset">\n',
+      '</form><hr>\n',
+      '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
+      '</body></html>\n',
+      )
+
+  def run(self, argv):
+    # dispatch on PATH_INFO; argv is currently unused.
+    if self.path_info == '/':
+      self.http_200()
+      self.coverpage()
+      return
+    if self.path_info != self.APPURL:
+      self.http_404()
+      return
+    if not os.path.isdir(self.TMPDIR):
+      self.bummer('temporary directory is missing')
+      return
+    if 'f' not in self.form:
+      self.http_301('/')
+      return
+    item = self.form['f']
+    if not (item.file and item.filename):
+      self.http_301('/')
+      return
+    # a \d+ match always parses as an int, so no ValueError handling is needed.
+    pagenos = [ int(m.group(0)) for m in re.finditer(r'\d+', self.form.getvalue('p', '')) ]
+    logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
+    # per-request file name for the temporary copy of the upload.
+    h = abs(hash((random.random(), self.remote_addr, item.filename)))
+    tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
+    self.http_200()  # the response header must precede any converter output.
+    try:
+      try:
+        convert(self.outfp, item.file, tmppath, pagenos=pagenos,
+                codec=self.codec, maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE)
+      except Exception, e:
+        self.put('<p>Sorry, an error has occurred: %s' % q(repr(e)))
+        logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
+    finally:
+      try:
+        os.remove(tmppath)
+      except OSError:
+        pass  # the temporary file may never have been created.
+
+
+# main: run() returns None, so sys.exit() reports success (status 0).
+if __name__ == '__main__': sys.exit(PDF2HTMLApp(sys.stdout).run(sys.argv))
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 12975b1..b6170c2 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -161,9 +161,9 @@ def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
   try:
     doc.initialize(password)
   except PDFPasswordIncorrect:
-    raise TextExtractionNotAllowed('incorrect password')
+    raise TextExtractionNotAllowed('Incorrect password')
   if not doc.is_extractable:
-    raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
+    raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
   interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
   for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
     if pagenos and (pageno not in pagenos): continue