trivial grammar errors
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@173 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
665f2bd710
commit
a9d7a00ccd
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Mon Jan 4 21:44:43 JST 2010
|
Last Modified: Mon Jan 4 23:23:00 JST 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -197,7 +197,7 @@ By default, it extracts texts from all the pages.
|
||||||
<dd> Specifies the output format. The following formats are currently supported.
|
<dd> Specifies the output format. The following formats are currently supported.
|
||||||
<ul>
|
<ul>
|
||||||
<li> <code>text</code> : TEXT format. (Default)
|
<li> <code>text</code> : TEXT format. (Default)
|
||||||
<li> <code>html</code> : HTML format. Not recommended for extraction purpose because the markup is very messy.
|
<li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is messy.
|
||||||
<li> <code>xml</code> : XML format. Provides the most information available.
|
<li> <code>xml</code> : XML format. Provides the most information available.
|
||||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||||
|
@ -269,7 +269,7 @@ By default, it extracts all the pages in a document.
|
||||||
<h3>dumppdf.py</h3>
|
<h3>dumppdf.py</h3>
|
||||||
<p>
|
<p>
|
||||||
<code>dumppdf.py</code> dumps the internal contents of a PDF file
|
<code>dumppdf.py</code> dumps the internal contents of a PDF file
|
||||||
in pseudo-XML format. This program is primarily for debugging purpose,
|
in pseudo-XML format. This program is primarily for debugging purposes,
|
||||||
but it's also possible to extract some meaningful contents
|
but it's also possible to extract some meaningful contents
|
||||||
(such as images).
|
(such as images).
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python -O
|
||||||
#
|
#
|
||||||
# pdf2html.cgi - Gateway for converting PDF into HTML.
|
# pdf2html.cgi - Gateway script for converting PDF into HTML.
|
||||||
#
|
#
|
||||||
# Security consideration for public access:
|
# Security consideration for public access:
|
||||||
#
|
#
|
||||||
# Limit the process size and/or running time.
|
# Limit the process size and/or maximum cpu time.
|
||||||
# The process should be chrooted.
|
# The process should be chrooted.
|
||||||
# A quota should be imposed on the user.
|
# A quota should be imposed on the user.
|
||||||
#
|
#
|
||||||
# Setup:
|
# How to Setup:
|
||||||
# $ mkdir $CGIDIR
|
# $ mkdir $CGIDIR
|
||||||
# $ mkdir $CGIDIR/var
|
# $ mkdir $CGIDIR/var
|
||||||
# $ python setup.py install_lib --install-dir=$CGIDIR
|
# $ python setup.py install_lib --install-dir=$CGIDIR
|
||||||
|
@ -16,9 +16,10 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
# comment out at runtime.
|
# comment this out at runtime.
|
||||||
import cgitb; cgitb.enable()
|
#import cgitb; cgitb.enable()
|
||||||
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
||||||
|
import pdfminer
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
||||||
from pdfminer.converter import HTMLConverter, TextConverter
|
from pdfminer.converter import HTMLConverter, TextConverter
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
@ -138,26 +139,29 @@ class PDF2HTMLApp(object):
|
||||||
'<input type="submit" name="c" value="Convert to TEXT">\n',
|
'<input type="submit" name="c" value="Convert to TEXT">\n',
|
||||||
'<input type="reset" value="Reset">\n',
|
'<input type="reset" value="Reset">\n',
|
||||||
'</form><hr>\n',
|
'</form><hr>\n',
|
||||||
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
|
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
|
||||||
'</body></html>\n',
|
'</body></html>\n',
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self, argv):
|
def run(self, argv):
|
||||||
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG,
|
||||||
|
format='%(asctime)s %(levelname)s %(message)s')
|
||||||
else:
|
else:
|
||||||
logging.basicConfig(level=logging.ERROR,
|
logging.basicConfig(level=logging.ERROR,
|
||||||
|
format='%(asctime)s %(levelname)s %(message)s',
|
||||||
filename=self.logpath, filemode='a')
|
filename=self.logpath, filemode='a')
|
||||||
if self.path_info == '/':
|
if self.path_info == '/':
|
||||||
self.http_200()
|
self.http_200()
|
||||||
self.coverpage()
|
self.coverpage()
|
||||||
return
|
return
|
||||||
if self.path_info != self.APPURL:
|
if self.path_info != self.APPURL:
|
||||||
|
logging.error('invalid path: %r' % self.path_info)
|
||||||
self.http_404()
|
self.http_404()
|
||||||
return
|
return
|
||||||
if not os.path.isdir(self.tmpdir):
|
if not os.path.isdir(self.tmpdir):
|
||||||
|
logging.error('no tmpdir')
|
||||||
self.bummer('error')
|
self.bummer('error')
|
||||||
return
|
return
|
||||||
form = cgi.FieldStorage()
|
form = cgi.FieldStorage()
|
||||||
|
@ -180,7 +184,7 @@ class PDF2HTMLApp(object):
|
||||||
pagenos.append(int(m.group(0)))
|
pagenos.append(int(m.group(0)))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
logging.info('process: host=%s, name=%r, pagenos=%r' %
|
logging.info('received: host=%s, name=%r, pagenos=%r' %
|
||||||
(self.remote_addr, item.filename, pagenos))
|
(self.remote_addr, item.filename, pagenos))
|
||||||
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
||||||
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))
|
tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (self.cur_time, h))
|
||||||
|
@ -193,7 +197,7 @@ class PDF2HTMLApp(object):
|
||||||
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
||||||
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
|
logging.error('convert: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
os.remove(tmppath)
|
os.remove(tmppath)
|
||||||
|
|
Loading…
Reference in New Issue