diff --git a/docs/index.html b/docs/index.html index 88c9995..9057053 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
text
: TEXT format. (Default)
-html
: HTML format. Not recommended for extraction purpose because the markup is very messy.
+html
: HTML format. Not recommended for extraction purposes because the markup is messy.
xml
: XML format. Provides the most information available.
tag
: "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
@@ -269,7 +269,7 @@ By default, it extracts all the pages in a document.
dumppdf.py
dumps the internal contents of a PDF file
-in pseudo-XML format. This program is primarily for debugging purpose,
+in pseudo-XML format. This program is primarily for debugging purposes,
but it's also possible to extract some meaningful contents
(such as images).
diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi
index a890e16..e0d61b8 100755
--- a/tools/pdf2html.cgi
+++ b/tools/pdf2html.cgi
@@ -1,14 +1,14 @@
-#!/usr/bin/python
+#!/usr/bin/python -O
#
-# pdf2html.cgi - Gateway for converting PDF into HTML.
+# pdf2html.cgi - Gateway script for converting PDF into HTML.
#
# Security consideration for public access:
#
-# Limit the process size and/or running time.
+# Limit the process size and/or maximum cpu time.
# The process should be chrooted.
# The user should be imposed quota.
#
-# Setup:
+# How to set up:
# $ mkdir $CGIDIR
# $ mkdir $CGIDIR/var
# $ python setup.py install_lib --install-dir=$CGIDIR
@@ -16,9 +16,10 @@
#
import sys
-# comment out at runtime.
-import cgitb; cgitb.enable()
+# comment this out at runtime.
+#import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback
+import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
@@ -138,26 +139,29 @@ class PDF2HTMLApp(object):
'\n',
'\n',
'
Powered by PDFMiner\n', + '
Powered by PDFMiner-%s\n' % pdfminer.__version__, '