Remove webapp and other (un)helpful application references: django, cgi, and pyinstaller. (#320)
Fixes #314. Fixes #105.
pull/315/head
parent
1c4a4167ed
commit
d88d6020a2
|
@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
### Fixed
|
||||
- Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
|
||||
|
||||
### Removed
|
||||
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
|
||||
|
||||
## [20191020] - 2019-10-20
|
||||
|
||||
### Deprecated
|
||||
|
|
|
@ -64,7 +64,7 @@ PDF parser that can be used for other purposes than text analysis.
|
|||
<li> CJK languages and vertical writing scripts support.
|
||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||
<li> Basic encryption (RC4) support.
|
||||
<li> PDF to HTML conversion (with a sample converter web app).
|
||||
<li> PDF to HTML conversion.
|
||||
<li> Outline (TOC) extraction.
|
||||
<li> Tagged contents extraction.
|
||||
<li> Reconstruct the original layout by grouping text chunks.
|
||||
|
|
|
@ -1,8 +1 @@
|
|||
# Module-level default for the STRICT flag.
STRICT = False

# Optionally let a Django project override the default through its
# PDF_MINER_IS_STRICT setting; when that setting is absent the value above
# is kept.
try:
    from django.conf import settings
    STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', STRICT)
except Exception:
    # in case it's not a django project
    # (deliberately broad: any failure while importing or reading the
    # settings leaves STRICT at its default)
    pass
|
||||
|
|
|
@ -1,215 +0,0 @@
|
|||
#!/usr/bin/env python -O
|
||||
#
|
||||
# pdf2html.cgi - Gateway script for converting PDF into HTML.
|
||||
#
|
||||
# Security consideration for public access:
|
||||
#
|
||||
# Limit the process size and/or maximum cpu time.
|
||||
# The process should be chrooted.
|
||||
# The user should be imposed quota.
|
||||
#
|
||||
# How to Setup:
|
||||
# $ mkdir $CGIDIR
|
||||
# $ mkdir $CGIDIR/var
|
||||
# $ python setup.py install_lib --install-dir=$CGIDIR
|
||||
# $ cp pdfminer/tools/pdf2html.cgi $CGIDIR
|
||||
#
|
||||
|
||||
import sys, os, os.path, re, time
|
||||
import cgi, logging, traceback, random
|
||||
# keep the following line commented out at runtime.
|
||||
#import cgitb; cgitb.enable()
|
||||
import pdfminer
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.converter import HTMLConverter, TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
# quote HTML metacharacters
|
||||
def q(x):
    """Return *x* with HTML metacharacters replaced by character entities.

    The ampersand, greater-than, less-than and double-quote characters are
    replaced by their named entities (amp, gt, lt, quot) so that the result
    can be embedded safely in HTML text or in a double-quoted attribute.
    """
    # The ampersand must be handled first; otherwise the ampersands
    # introduced by the later replacements would be escaped a second time.
    for (char, name) in (('&', 'amp'), ('>', 'gt'), ('<', 'lt'), ('"', 'quot')):
        x = x.replace(char, '&' + name + ';')
    return x
|
||||
|
||||
# encode parameters as a URL
|
||||
# Matches any single character that must be percent-encoded.  The hyphen is
# escaped so that it is a literal '-' instead of forming the accidental
# range '.'-'=' (which would also have let '/', ':', ';' and '<' through).
Q = re.compile(r'[^a-zA-Z0-9_.\-=]')

def url(base, **kw):
    """Return *base* followed by the keyword arguments as URL parameters.

    Each value is first HTML-quoted with q() and then percent-encoded; the
    resulting ``key=value`` pairs are joined with '&' and appended to *base*
    as-is (no separator is inserted between *base* and the first pair).
    """
    r = []
    for (k, v) in six.iteritems(kw):
        # NOTE(review): the original called an ``encoder`` that is defined
        # nowhere in this file; UTF-8 is assumed here - confirm.
        data = bytearray(q(v).encode('utf-8', 'replace'))
        # Work byte by byte so that multi-byte characters become a sequence
        # of %XX escapes; iterating a bytearray yields ints on Python 2 and 3.
        v = ''.join('%%%02X' % b if Q.match(chr(b)) else chr(b)
                    for b in data)
        r.append('%s=%s' % (k, v))
    return base+'&'.join(r)
|
||||
|
||||
|
||||
## convert
|
||||
##
|
||||
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than ``maxfilesize``."""


def convert(infp, outfp, path, codec='utf-8',
            maxpages=0, maxfilesize=0, pagenos=None,
            html=True):
    """Save the PDF read from *infp* to *path* and write its conversion to *outfp*.

    :param infp: binary file-like object to read the PDF from; it is always
        closed before this function returns or raises.
    :param outfp: file-like object handed to the converter for its output.
    :param path: location where the input is stored before conversion.
    :param codec: output encoding passed to the converter.
    :param maxpages: passed to ``PDFPage.get_pages`` as ``maxpages``.
    :param maxfilesize: maximum number of input bytes accepted; a false
        value (0) disables the check.
    :param pagenos: passed to ``PDFPage.get_pages`` as the page selection.
    :param html: when true use ``HTMLConverter`` (``layoutmode='exact'``),
        otherwise ``TextConverter``.
    :raises FileSizeExceeded: if more than *maxfilesize* bytes are read.
    """
    # save the input file.
    try:
        # ``with`` guarantees the temporary file is closed even when the
        # size limit is exceeded or reading fails.
        with open(path, 'wb') as src:
            nbytes = 0
            while True:
                data = infp.read(4096)
                nbytes += len(data)
                # Checked before writing, so an oversized chunk never
                # reaches the disk.
                if maxfilesize and maxfilesize < nbytes:
                    raise FileSizeExceeded(maxfilesize)
                if not data:
                    break
                src.write(data)
    finally:
        infp.close()
    # perform conversion and
    # send the results over the network.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               layoutmode='exact')
    else:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages):
            interpreter.process_page(page)
    # Closed only after a successful run, as before, so a failed conversion
    # does not emit the converter's closing output ahead of an error message.
    device.close()
    return
|
||||
|
||||
|
||||
## WebApp
|
||||
##
|
||||
class WebApp(object):
    """CGI-style application that converts an uploaded PDF to HTML or text.

    The application reads the request from ``infp``, writes the response to
    ``outfp`` and takes its request metadata (``REMOTE_ADDR``, ``PATH_INFO``,
    ``REQUEST_METHOD``, ``SERVER_SOFTWARE``, ``TEMP``) from ``environ``.
    Call setup() first; it binds ``self.run`` to the handler for the request.
    """

    TITLE = 'pdf2html demo'
    MAXFILESIZE = 10000000  # set to zero if unlimited.
    MAXPAGES = 100  # set to zero if unlimited.

    # Text type on both Python 2 (``unicode``) and Python 3 (``str``); used
    # instead of the ``unicode`` builtin, which does not exist on Python 3.
    _TEXT_TYPE = type(u'')

    def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
                 codec='utf-8', apppath='/'):
        """Store the streams and extract the request metadata from *environ*.

        :param infp: stream the request body is read from.
        :param outfp: stream the response is written to.
        :param environ: mapping holding the CGI variables.
        :param codec: charset announced in the Content-type header and used
            for the converted output.
        :param apppath: value of ``PATH_INFO`` that this application serves.
        """
        self.infp = infp
        self.outfp = outfp
        self.environ = environ
        self.codec = codec
        self.apppath = apppath
        self.remote_addr = self.environ.get('REMOTE_ADDR')
        self.path_info = self.environ.get('PATH_INFO')
        self.method = self.environ.get('REQUEST_METHOD', 'GET').upper()
        self.server = self.environ.get('SERVER_SOFTWARE', '')
        self.tmpdir = self.environ.get('TEMP', './var/')
        self.content_type = 'text/html; charset=%s' % codec
        self.logger = logging.getLogger()
        return

    def put(self, *args):
        """Write each text argument to the output stream.

        Arguments that are neither ``str`` nor the text type are ignored.
        """
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, self._TEXT_TYPE):
                # Only reachable on Python 2, where ``str`` is the byte type
                # and text has to be encoded before it is written.
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def response_200(self):
        """Write the headers of a successful response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def response_404(self):
        """Write a complete "page does not exist" response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def response_301(self, url):
        """Write a redirect response pointing at *url*."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def coverpage(self):
        """Write the HTML upload form."""
        self.put(
            '<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
            '<h1>%s</h1><hr>\n' % q(self.TITLE),
            '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.apppath),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            ' Page numbers (comma-separated):\n',
            '<input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" name="c" value="Convert to HTML">\n',
            '<input type="submit" name="c" value="Convert to TEXT">\n',
            '<input type="reset" value="Reset">\n',
            '</form><hr>\n',
            '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
            '</body></html>\n',
        )
        return

    def setup(self):
        """Select the handler for this request and return an HTTP status code.

        ``self.run`` is bound to convert() when ``PATH_INFO`` equals the
        application path and the temporary directory exists; otherwise it
        stays bound to response_404().
        """
        self.run = self.response_404
        status = 404
        if not os.path.isdir(self.tmpdir):
            self.logger.error('no tmpdir')
            # NOTE(review): 304 ("Not Modified") looks unintended for a
            # missing temp directory; kept unchanged because callers receive
            # this value - confirm whether a 5xx status was meant.
            status = 304
        elif self.path_info == self.apppath:
            self.run = self.convert
            status = 200
        return status

    def convert(self):
        """Handle a request: show the upload form or convert the uploaded PDF.

        The form is shown unless the request is a POST carrying both the
        ``c`` (command) and ``f`` (file) fields with an actual file.  The
        upload is stored under the temporary directory for the duration of
        the conversion and removed afterwards.
        """
        form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
        if (self.method != 'POST' or
                'c' not in form or
                'f' not in form):
            self.response_200()
            self.coverpage()
            return
        item = form['f']
        if not (item.file and item.filename):
            self.response_200()
            self.coverpage()
            return
        cmd = form.getvalue('c')
        html = (cmd == 'Convert to HTML')
        pagenos = []
        if 'p' in form:
            pagenos = [int(m.group(0))
                       for m in re.finditer(r'\d+', form.getvalue('p'))]
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # time.time() is a float; '%x' only accepts integers on Python 3.
        tmppath = os.path.join(self.tmpdir,
                               '%08x%08x.pdf' % (int(time.time()), h))
        self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r' %
                         (self.remote_addr, item.filename, pagenos, tmppath))
        try:
            if not html:
                self.content_type = 'text/plain; charset=%s' % self.codec
            self.response_200()
            try:
                convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception as e:
                # Top-level boundary: report the failure to the client and
                # log the traceback instead of crashing the request.
                self.put('<p>Sorry, an error has occurred: %s' % q(repr(e)))
                self.logger.error('convert: %r: path=%r: %s',
                                  e, tmppath, traceback.format_exc())
        finally:
            try:
                os.remove(tmppath)
            except OSError:
                # Best effort: the file may never have been created.
                pass
        return
|
||||
|
||||
|
||||
# main
|
||||
# Script entry point: serve a single request using the default streams and
# environment (sys.stdin, sys.stdout, os.environ).
if __name__ == '__main__':
    app = WebApp()
    # setup() binds app.run to the handler chosen for this request.
    app.setup()
    # Every handler ends with a bare ``return``, so this passes None to
    # sys.exit().
    sys.exit(app.run())
|
|
@ -1,30 +0,0 @@
|
|||
# -*- mode: python -*-
# PyInstaller spec that bundles pdf2txt.py into an executable named 'pdf2txt'.
# Analysis, PYZ, EXE and SPECPATH are names PyInstaller provides when it
# executes a spec file.

block_cipher = None


a = Analysis(['pdf2txt.py'],
             # SPECPATH is the directory containing this spec file; using it
             # instead of an absolute path from one developer's machine
             # keeps the build reproducible on any checkout.
             pathex=[SPECPATH],
             binaries=[],
             datas=[],
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=['django','matplotlib','PIL','numpy','qt5'],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)

pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          name='pdf2txt',
          debug=False,
          strip=False,
          upx=True,
          runtime_tmpdir=None,
          console=True )
|
|
@ -1,29 +0,0 @@
|
|||
# -*- mode: python -*-
# PyInstaller spec that bundles pdfdiff.py into an executable named 'pdfdiff'.
# Analysis, PYZ, EXE and SPECPATH are names PyInstaller provides when it
# executes a spec file.

block_cipher = None


a = Analysis(['pdfdiff.py'],
             # SPECPATH is the directory containing this spec file; using it
             # instead of an absolute path from one developer's machine
             # keeps the build reproducible on any checkout.
             pathex=[SPECPATH],
             binaries=[],
             datas=[],
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=['django','matplotlib','PIL','numpy','qt5'],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          name='pdfdiff',
          debug=False,
          strip=False,
          upx=True,
          runtime_tmpdir=None,
          console=True )
|
114
tools/runapp.py
114
tools/runapp.py
|
@ -1,114 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
##
|
||||
## WebApp class runner
|
||||
##
|
||||
## usage:
|
||||
## $ runapp.py pdf2html.cgi
|
||||
##
|
||||
|
||||
import sys
|
||||
import urllib
|
||||
from six.moves.http_client import responses
|
||||
from six.moves.BaseHTTPServer import HTTPServer
|
||||
from six.moves.SimpleHTTPServer import SimpleHTTPRequestHandler
|
||||
|
||||
## WebAppHandler
|
||||
##
|
||||
class WebAppHandler(SimpleHTTPRequestHandler):
    """HTTP request handler that dispatches every request to ``APP_CLASS``.

    ``APP_CLASS`` must be set to a class that accepts ``infp``, ``outfp``
    and ``environ`` keyword arguments and provides ``setup()`` (returning an
    HTTP status code) and ``run()``.
    """

    APP_CLASS = None

    def do_POST(self):
        """Handle a POST request through the application."""
        return self.run_cgi()

    def send_head(self):
        """Route GET/HEAD handling through the application as well."""
        return self.run_cgi()

    def _header_values(self, name):
        """Return all values of request header *name* as a list."""
        get_all = getattr(self.headers, 'get_all', None)
        if get_all is not None:
            # Python 3 message objects return None when the header is absent.
            return get_all(name) or []
        # Python 2 message objects expose getheaders() instead.
        return self.headers.getheaders(name)

    def run_cgi(self):
        """Build a CGI-style environment for the request and run the app.

        The environment is passed to ``APP_CLASS`` together with the
        connection's read and write streams; the status returned by the
        application's ``setup()`` is sent before ``run()`` is called.
        """
        # ``unquote`` lives in different modules on Python 2 and Python 3.
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote
        rest = self.path
        i = rest.rfind('?')
        if i >= 0:
            rest, query = rest[:i], rest[i+1:]
        else:
            query = ''
        i = rest.find('/')
        if i >= 0:
            script, rest = rest[:i], rest[i:]
        else:
            script, rest = rest, ''
        scriptname = '/' + script
        scriptfile = self.translate_path(scriptname)
        env = {}
        env['SERVER_SOFTWARE'] = self.version_string()
        env['SERVER_NAME'] = self.server.server_name
        env['GATEWAY_INTERFACE'] = 'CGI/1.1'
        env['SERVER_PROTOCOL'] = self.protocol_version
        env['SERVER_PORT'] = str(self.server.server_port)
        env['REQUEST_METHOD'] = self.command
        uqrest = unquote(rest)
        env['PATH_INFO'] = uqrest
        env['PATH_TRANSLATED'] = self.translate_path(uqrest)
        env['SCRIPT_NAME'] = scriptname
        if query:
            env['QUERY_STRING'] = query
        host = self.address_string()
        if host != self.client_address[0]:
            env['REMOTE_HOST'] = host
        env['REMOTE_ADDR'] = self.client_address[0]
        # Fall back to 'text/plain' when the request has no Content-Type
        # (presumably what the Python 2 ``headers.type`` attribute supplied
        # - TODO confirm).
        content_type = self.headers.get('content-type')
        if content_type is None:
            content_type = 'text/plain'
        env['CONTENT_TYPE'] = content_type
        length = self.headers.get('content-length')
        if length:
            env['CONTENT_LENGTH'] = length
        accept = []
        for value in self._header_values('accept'):
            accept.extend(item.strip() for item in value.split(','))
        env['HTTP_ACCEPT'] = ','.join(accept)
        ua = self.headers.get('user-agent')
        if ua:
            env['HTTP_USER_AGENT'] = ua
        # A list (not a lazy filter object) so the emptiness test works on
        # Python 3 too.
        co = [c for c in self._header_values('cookie') if c]
        if co:
            env['HTTP_COOKIE'] = ', '.join(co)
        # Make sure the optional variables are always present.
        for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH',
                  'HTTP_USER_AGENT', 'HTTP_COOKIE'):
            env.setdefault(k, "")
        app = self.APP_CLASS(infp=self.rfile, outfp=self.wfile, environ=env)
        status = app.setup()
        self.send_response(status, responses[status])
        app.run()
        return
|
||||
|
||||
# main
|
||||
def main(argv):
    """Load a WebApp class from a source file and serve it over HTTP.

    Usage: ``argv[0] [-h host] [-p port] [-n name] path``.  The file at
    *path* is loaded as module ``app``, the attribute called *name*
    (default ``WebApp``) becomes ``WebAppHandler.APP_CLASS``, and an
    ``HTTPServer`` is started on *host*:*port* (default port 8080).

    :param argv: full argument vector, including the program name.
    :returns: 100 after printing the usage message for bad arguments;
        otherwise the call blocks in ``serve_forever()``.
    """
    import getopt

    def usage():
        print ('usage: %s [-h host] [-p port] [-n name] module.class' % argv[0])
        return 100

    def load_source(modname, path):
        # The ``imp`` module was removed in Python 3.12, so prefer importlib
        # and fall back to ``imp`` only where importlib.util is missing
        # (Python 2).
        try:
            import importlib.machinery
            import importlib.util
        except ImportError:
            import imp
            return imp.load_source(modname, path)
        # An explicit SourceFileLoader is required because the file may not
        # have a ``.py`` extension (e.g. ``pdf2html.cgi``).
        loader = importlib.machinery.SourceFileLoader(modname, path)
        spec = importlib.util.spec_from_loader(modname, loader)
        module = importlib.util.module_from_spec(spec)
        # Register the module under its name, as imp.load_source() did.
        sys.modules[modname] = module
        loader.exec_module(module)
        return module

    try:
        (opts, args) = getopt.getopt(argv[1:], 'h:p:n:')
    except getopt.GetoptError:
        return usage()
    host = ''
    port = 8080
    name = 'WebApp'
    for (k, v) in opts:
        if k == '-h':
            host = v
        elif k == '-p':
            try:
                port = int(v)
            except ValueError:
                # A non-numeric port is a usage error, not a crash.
                return usage()
        elif k == '-n':
            name = v
    if not args:
        return usage()
    path = args.pop(0)
    module = load_source('app', path)
    WebAppHandler.APP_CLASS = getattr(module, name)
    print ('Listening %s:%d...' % (host,port))
    httpd = HTTPServer((host,port), WebAppHandler)
    httpd.serve_forever()
    return
|
||||
|
||||
# Script entry point: main()'s return value (100 after a usage error)
# becomes the process exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue