trivial grammar errors

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-10 07:18:05 +00:00
parent 665f2bd710
commit a9d7a00ccd
2 changed files with 18 additions and 14 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Mon Jan 4 21:44:43 JST 2010 Last Modified: Mon Jan 4 23:23:00 JST 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -197,7 +197,7 @@ By default, it extracts texts from all the pages.
<dd> Specifies the output format. The following formats are currently supported. <dd> Specifies the output format. The following formats are currently supported.
<ul> <ul>
<li> <code>text</code> : TEXT format. (Default) <li> <code>text</code> : TEXT format. (Default)
<li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy. <li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is messy.
<li> <code>xml</code> : XML format. Provides the most information available. <li> <code>xml</code> : XML format. Provides the most information available.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
@ -269,7 +269,7 @@ By default, it extracts all the pages in a document.
<h3>dumppdf.py</h3> <h3>dumppdf.py</h3>
<p> <p>
<code>dumppdf.py</code> dumps the internal contents of a PDF file <code>dumppdf.py</code> dumps the internal contents of a PDF file
in pseudo-XML format. This program is primarily for debugging purpose, in pseudo-XML format. This program is primarily for debugging purposes,
but it's also possible to extract some meaningful contents but it's also possible to extract some meaningful contents
(such as images). (such as images).

View File

@ -1,14 +1,14 @@
#!/usr/bin/python #!/usr/bin/python -O
# #
# pdf2html.cgi - Gateway for converting PDF into HTML. # pdf2html.cgi - Gateway script for converting PDF into HTML.
# #
# Security consideration for public access: # Security consideration for public access:
# #
# Limit the process size and/or running time. # Limit the process size and/or maximum cpu time.
# The process should be chrooted. # The process should be chrooted.
# A quota should be imposed on the user. # A quota should be imposed on the user.
# #
# Setup: # How to Setup:
# $ mkdir $CGIDIR # $ mkdir $CGIDIR
# $ mkdir $CGIDIR/var # $ mkdir $CGIDIR/var
# $ python setup.py install_lib --install-dir=$CGIDIR # $ python setup.py install_lib --install-dir=$CGIDIR
@ -16,9 +16,10 @@
# #
import sys import sys
# comment out at runtime. # comment this out at runtime.
import cgitb; cgitb.enable() #import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback import os, os.path, re, cgi, time, random, codecs, logging, traceback
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
@ -138,26 +139,29 @@ class PDF2HTMLApp(object):
'<input type="submit" name="c" value="Convert to TEXT">\n', '<input type="submit" name="c" value="Convert to TEXT">\n',
'<input type="reset" value="Reset">\n', '<input type="reset" value="Reset">\n',
'</form><hr>\n', '</form><hr>\n',
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n', '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
'</body></html>\n', '</body></html>\n',
) )
return return
def run(self, argv): def run(self, argv):
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
if self.debug: if self.debug:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
else: else:
logging.basicConfig(level=logging.ERROR, logging.basicConfig(level=logging.ERROR,
format='%(asctime)s %(levelname)s %(message)s',
filename=self.logpath, filemode='a') filename=self.logpath, filemode='a')
if self.path_info == '/': if self.path_info == '/':
self.http_200() self.http_200()
self.coverpage() self.coverpage()
return return
if self.path_info != self.APPURL: if self.path_info != self.APPURL:
logging.error('invalid path: %r' % self.path_info)
self.http_404() self.http_404()
return return
if not os.path.isdir(self.tmpdir): if not os.path.isdir(self.tmpdir):
logging.error('no tmpdir')
self.bummer('error') self.bummer('error')
return return
form = cgi.FieldStorage() form = cgi.FieldStorage()
@ -180,7 +184,7 @@ class PDF2HTMLApp(object):
pagenos.append(int(m.group(0))) pagenos.append(int(m.group(0)))
except ValueError: except ValueError:
pass pass
logging.info('process: host=%s, name=%r, pagenos=%r' % logging.info('received: host=%s, name=%r, pagenos=%r' %
(self.remote_addr, item.filename, pagenos)) (self.remote_addr, item.filename, pagenos))
h = abs(hash((random.random(), self.remote_addr, item.filename))) h = abs(hash((random.random(), self.remote_addr, item.filename)))
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h)) tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))
@ -193,7 +197,7 @@ class PDF2HTMLApp(object):
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
except Exception, e: except Exception, e:
self.put('<p>Sorry, an error has occured: %s' % q(repr(e))) self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc())) logging.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
finally: finally:
try: try:
os.remove(tmppath) os.remove(tmppath)