Remove webapp and other (un)helpful application references: django, cgi, and pyinstaller. (#320)

Fixes #314 
Fixes #105
pull/315/head
Pieter Marsman 2019-10-26 19:16:37 +02:00 committed by GitHub
parent 1c4a4167ed
commit d88d6020a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 4 additions and 396 deletions

View File

@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318)) - Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
### Removed
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
## [20191020] - 2019-10-20 ## [20191020] - 2019-10-20
### Deprecated ### Deprecated

View File

@ -64,7 +64,7 @@ PDF parser that can be used for other purposes than text analysis.
<li> CJK languages and vertical writing scripts support. <li> CJK languages and vertical writing scripts support.
<li> Various font types (Type1, TrueType, Type3, and CID) support. <li> Various font types (Type1, TrueType, Type3, and CID) support.
<li> Basic encryption (RC4) support. <li> Basic encryption (RC4) support.
<li> PDF to HTML conversion (with a sample converter web app). <li> PDF to HTML conversion.
<li> Outline (TOC) extraction. <li> Outline (TOC) extraction.
<li> Tagged contents extraction. <li> Tagged contents extraction.
<li> Reconstruct the original layout by grouping text chunks. <li> Reconstruct the original layout by grouping text chunks.

View File

@ -1,8 +1 @@
STRICT = False STRICT = False
# Optional Django integration: when django.conf.settings can be imported,
# a PDF_MINER_IS_STRICT attribute on it replaces the default STRICT value;
# otherwise the default assigned above is kept.
try:
    from django.conf import settings
    # getattr() falls back to the current STRICT value when the setting is absent.
    STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', STRICT)
except Exception:
    # in case it's not a django project
    # NOTE(review): catches Exception rather than ImportError, presumably
    # because reading an unconfigured settings object can also fail -- confirm.
    pass

View File

@ -1,215 +0,0 @@
#!/usr/bin/env python -O
#
# pdf2html.cgi - Gateway script for converting PDF into HTML.
#
# Security consideration for public access:
#
# Limit the process size and/or maximum cpu time.
# The process should be chrooted.
# The user should be imposed quota.
#
# How to Setup:
# $ mkdir $CGIDIR
# $ mkdir $CGIDIR/var
# $ python setup.py install_lib --install-dir=$CGIDIR
# $ cp pdfminer/tools/pdf2html.cgi $CGIDIR
#
import sys, os, os.path, re, time
import cgi, logging, traceback, random
# comment this out at runtime.
#import cgitb; cgitb.enable()
import pdfminer
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import six #Python 2+3 compatibility
# quote HTML metacharacters
def q(x):
    """Return *x* with the characters & > < " replaced by HTML entities."""
    # '&' has to be handled first so that the ampersands introduced by the
    # later replacements are not escaped a second time.
    substitutions = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
    )
    quoted = x
    for (char, entity) in substitutions:
        quoted = quoted.replace(char, entity)
    return quoted
# encode parameters as a URL
# Matches every character that has to be percent-encoded. The '-' is placed
# last so it is a literal hyphen; written as '.-=' it formed a range that let
# '/', ':', ';' and '<' through unencoded and encoded '-' instead.
Q = re.compile(r'[^a-zA-Z0-9_.=-]')
def url(base, **kw):
    """Return *base* followed by the keyword arguments as 'key=value' pairs.

    Each value is HTML-quoted with q(), encoded as UTF-8 and then
    percent-encoded byte by byte; keys are emitted unchanged. The pairs are
    joined with '&' and appended directly to *base*.
    """
    pairs = []
    for (k, v) in kw.items():
        raw = q(v)
        if not isinstance(raw, bytes):
            raw = raw.encode('utf-8', 'replace')
        # Decoding as latin-1 maps every byte to exactly one character, so
        # ord() below yields the byte value and '%02X' is always two digits.
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), raw.decode('latin-1'))
        pairs.append('%s=%s' % (k, v))
    return base + '&'.join(pairs)
## convert
##
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than the allowed size."""
def convert(infp, outfp, path, codec='utf-8',
            maxpages=0, maxfilesize=0, pagenos=None,
            html=True):
    """Store the uploaded data at *path*, then convert it and write to *outfp*.

    infp: readable binary file object with the uploaded data; it is closed
        once it has been copied completely.
    outfp: output stream handed to the converter.
    path: name of the file the upload is saved to and parsed from.
    codec: passed to the converter as its ``codec`` argument.
    maxpages: passed to PDFPage.get_pages() as ``maxpages``.
    maxfilesize: largest accepted upload in bytes; 0 disables the check.
    pagenos: passed to PDFPage.get_pages() as the page number selection.
    html: use HTMLConverter when true, TextConverter otherwise.

    Raises FileSizeExceeded when more than *maxfilesize* bytes are read.
    """
    # save the input file.
    nbytes = 0
    # 'with' guarantees the copy is closed even when the size limit trips.
    with open(path, 'wb') as src:
        while True:
            data = infp.read(4096)
            if not data:
                break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    infp.close()
    # perform conversion and
    # send the results over the network.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               layoutmode='exact')
    else:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The saved copy is closed even if parsing a page raises.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages):
            interpreter.process_page(page)
    # As before, the device is only closed after a successful run, so a
    # failed conversion does not emit the converter's closing output.
    device.close()
    return
## WebApp
##
class WebApp(object):
    """CGI-style application that converts an uploaded PDF to HTML or text.

    The request is described by *environ* (CGI variables), the request body
    is read from *infp* and the response is written to *outfp*. Call
    setup() first; it selects the handler and stores it as ``self.run``.
    """

    TITLE = 'pdf2html demo'
    MAXFILESIZE = 10000000  # set to zero if unlimited.
    MAXPAGES = 100  # set to zero if unlimited.

    def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
                 codec='utf-8', apppath='/'):
        """Capture the request streams and the CGI variables that are used.

        infp: stream the request body is read from.
        outfp: stream the response is written to.
        environ: mapping with the CGI environment variables.
        codec: character encoding used for the response.
        apppath: PATH_INFO value that is routed to the converter.
        """
        self.infp = infp
        self.outfp = outfp
        self.environ = environ
        self.codec = codec
        self.apppath = apppath
        self.remote_addr = self.environ.get('REMOTE_ADDR')
        self.path_info = self.environ.get('PATH_INFO')
        self.method = self.environ.get('REQUEST_METHOD', 'GET').upper()
        self.server = self.environ.get('SERVER_SOFTWARE', '')
        self.tmpdir = self.environ.get('TEMP', './var/')
        self.content_type = 'text/html; charset=%s' % codec
        self.logger = logging.getLogger()

    def put(self, *args):
        """Write each argument to the output stream.

        Native ``str`` values are written unchanged. On Python 2,
        ``unicode`` values are encoded with the response codec, using
        character references for unencodable characters. Values of any
        other type are skipped.
        """
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # the name ``unicode`` (undefined on Python 3) is never looked up.
        text_type = type(u'')
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, text_type):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))

    def response_200(self):
        """Write the headers of a successful response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')

    def response_404(self):
        """Write a complete "page does not exist" response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')

    def response_301(self, url):
        """Write a redirect response pointing at *url*."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        self.outfp.write('Location: %s\r\n\r\n' % url)

    def coverpage(self):
        """Write the upload form page."""
        self.put(
            '<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
            '<h1>%s</h1><hr>\n' % q(self.TITLE),
            '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.apppath),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            '&nbsp; Page numbers (comma-separated):\n',
            '<input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" name="c" value="Convert to HTML">\n',
            '<input type="submit" name="c" value="Convert to TEXT">\n',
            '<input type="reset" value="Reset">\n',
            '</form><hr>\n',
            '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
            '</body></html>\n',
        )

    def setup(self):
        """Choose the handler for this request and return its status code.

        ``self.run`` is set to convert() when the temporary directory exists
        and PATH_INFO equals the application path, otherwise it stays
        response_404().
        """
        self.run = self.response_404
        status = 404
        if not os.path.isdir(self.tmpdir):
            self.logger.error('no tmpdir')
            # NOTE(review): 304 is kept for compatibility with existing
            # callers; a 5xx code may have been intended -- confirm.
            status = 304
        elif self.path_info == self.apppath:
            self.run = self.convert
            status = 200
        return status

    def convert(self):
        """Handle a request: show the form or convert the uploaded file.

        The form is shown unless the request is a POST carrying both the
        'c' (command) and 'f' (file) fields with an actual file. Otherwise
        the upload is converted through a temporary file in the temporary
        directory, which is removed afterwards.
        """
        form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
        if (self.method != 'POST' or
                'c' not in form or
                'f' not in form):
            self.response_200()
            self.coverpage()
            return
        item = form['f']
        if not (item.file and item.filename):
            self.response_200()
            self.coverpage()
            return
        cmd = form.getvalue('c')
        html = (cmd == 'Convert to HTML')
        pagenos = []
        if 'p' in form:
            for m in re.finditer(r'\d+', form.getvalue('p')):
                try:
                    pagenos.append(int(m.group(0)))
                except ValueError:
                    pass
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # '%x' needs an integer on Python 3, so the timestamp is truncated.
        tmppath = os.path.join(self.tmpdir,
                               '%08x%08x.pdf' % (int(time.time()), h))
        self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r' %
                         (self.remote_addr, item.filename, pagenos, tmppath))
        try:
            if not html:
                self.content_type = 'text/plain; charset=%s' % self.codec
            self.response_200()
            try:
                convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception as e:
                self.put('<p>Sorry, an error has occurred: %s' % q(repr(e)))
                # The message has three placeholders; the path was missing
                # from the arguments, which made the handler itself fail.
                self.logger.error('convert: %r: path=%r: %s' %
                                  (e, tmppath, traceback.format_exc()))
        finally:
            # Best-effort cleanup: the file may never have been created.
            try:
                os.remove(tmppath)
            except OSError:
                pass
        return
# main
if __name__ == '__main__':
    # Executed as a CGI script: build the application from the process
    # environment and standard streams, let setup() pick the handler, then
    # use the handler's return value as the exit status.
    cgi_app = WebApp()
    cgi_app.setup()
    exit_status = cgi_app.run()
    sys.exit(exit_status)

View File

@ -1,30 +0,0 @@
# -*- mode: python -*-
# Build description that bundles pdf2txt.py into an executable named
# 'pdf2txt'. Analysis, PYZ and EXE are not defined or imported in this file;
# presumably they are supplied by PyInstaller when it runs the spec -- confirm.
block_cipher = None
# NOTE(review): pathex is an absolute path on the original author's Windows
# machine and will not exist elsewhere.
a = Analysis(['pdf2txt.py'],
pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
binaries=[],
datas=[],
hiddenimports=[],
hookspath=[],
runtime_hooks=[],
# packages listed here are passed as exclusions to the analysis
excludes=['django','matplotlib','PIL','numpy','qt5'],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher)
# Scripts, binaries, zip files and data collected by the analysis are all
# passed to the executable target.
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
name='pdf2txt',
debug=False,
strip=False,
upx=True,
runtime_tmpdir=None,
console=True )

View File

@ -1,29 +0,0 @@
# -*- mode: python -*-
# Build description that bundles pdfdiff.py into an executable named
# 'pdfdiff'. Analysis, PYZ and EXE are not defined or imported in this file;
# presumably they are supplied by PyInstaller when it runs the spec -- confirm.
block_cipher = None
# NOTE(review): pathex is an absolute path on the original author's Windows
# machine and will not exist elsewhere.
a = Analysis(['pdfdiff.py'],
pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
binaries=[],
datas=[],
hiddenimports=[],
hookspath=[],
runtime_hooks=[],
# packages listed here are passed as exclusions to the analysis
excludes=['django','matplotlib','PIL','numpy','qt5'],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher)
# Scripts, binaries, zip files and data collected by the analysis are all
# passed to the executable target.
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
name='pdfdiff',
debug=False,
strip=False,
upx=True,
runtime_tmpdir=None,
console=True )

View File

@ -1,114 +0,0 @@
#!/usr/bin/env python
##
## WebApp class runner
##
## usage:
## $ runapp.py pdf2html.cgi
##
import sys
import urllib
from six.moves.http_client import responses
from six.moves.BaseHTTPServer import HTTPServer
from six.moves.SimpleHTTPServer import SimpleHTTPRequestHandler
## WebAppHandler
##
class WebAppHandler(SimpleHTTPRequestHandler):
    """Request handler that passes every request to an application object.

    APP_CLASS must be set to a class accepting ``infp``, ``outfp`` and
    ``environ`` keyword arguments and providing setup() and run().
    """

    # Application class instantiated per request; assigned by main().
    APP_CLASS = None

    def do_POST(self):
        """Handle POST by running the application."""
        return self.run_cgi()

    def send_head(self):
        """Run the application instead of serving a file from disk."""
        return self.run_cgi()

    def run_cgi(self):
        """Build a CGI-style environment from the request and run the app."""
        # Split the request path into the part before the last '?' and the
        # query string after it.
        rest = self.path
        i = rest.rfind('?')
        if i >= 0:
            rest, query = rest[:i], rest[i+1:]
        else:
            query = ''
        # Split at the first '/': everything before it is the script name,
        # the remainder (including the '/') becomes PATH_INFO.
        i = rest.find('/')
        if i >= 0:
            script, rest = rest[:i], rest[i:]
        else:
            script, rest = rest, ''
        scriptname = '/' + script
        # NOTE(review): scriptfile is computed but not used below.
        scriptfile = self.translate_path(scriptname)
        env = {}
        env['SERVER_SOFTWARE'] = self.version_string()
        env['SERVER_NAME'] = self.server.server_name
        env['GATEWAY_INTERFACE'] = 'CGI/1.1'
        env['SERVER_PROTOCOL'] = self.protocol_version
        env['SERVER_PORT'] = str(self.server.server_port)
        env['REQUEST_METHOD'] = self.command
        # NOTE(review): urllib.unquote and the headers attributes used below
        # (typeheader, type, getheader, getheaders) look like the Python 2
        # API -- confirm they are available on the targeted interpreter.
        uqrest = urllib.unquote(rest)
        env['PATH_INFO'] = uqrest
        env['PATH_TRANSLATED'] = self.translate_path(uqrest)
        env['SCRIPT_NAME'] = scriptname
        if query:
            env['QUERY_STRING'] = query
        # REMOTE_HOST is only set when the address string differs from the
        # raw client address.
        host = self.address_string()
        if host != self.client_address[0]:
            env['REMOTE_HOST'] = host
        env['REMOTE_ADDR'] = self.client_address[0]
        if self.headers.typeheader is None:
            env['CONTENT_TYPE'] = self.headers.type
        else:
            env['CONTENT_TYPE'] = self.headers.typeheader
        length = self.headers.getheader('content-length')
        if length:
            env['CONTENT_LENGTH'] = length
        # Collect all Accept values into one comma-separated string. Lines
        # starting with whitespace are taken whole; for other lines the
        # first seven characters (the length of 'Accept:') are skipped.
        accept = []
        for line in self.headers.getallmatchingheaders('accept'):
            if line[:1] in "\t\n\r ":
                accept.append(line.strip())
            else:
                accept = accept + line[7:].split(',')
        env['HTTP_ACCEPT'] = ','.join(accept)
        ua = self.headers.getheader('user-agent')
        if ua:
            env['HTTP_USER_AGENT'] = ua
        co = filter(None, self.headers.getheaders('cookie'))
        if co:
            env['HTTP_COOKIE'] = ', '.join(co)
        # Make sure these variables exist even when the request lacked them.
        for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH',
                  'HTTP_USER_AGENT', 'HTTP_COOKIE'):
            env.setdefault(k, "")
        # The application reads from and writes to the connection directly;
        # the status line is sent here, using the code returned by setup().
        app = self.APP_CLASS(infp=self.rfile, outfp=self.wfile, environ=env)
        status = app.setup()
        self.send_response(status, responses[status])
        app.run()
        return
# main
def main(argv):
    """Serve the application class found in a source file over HTTP.

    argv: command line, ``[prog, [-h host], [-p port], [-n name], path]``.
        *path* is the Python source file to load and *name* (default
        'WebApp') the class inside it that WebAppHandler should run.

    Returns 100 after printing the usage line when the options are invalid,
    the port is not an integer or no path is given. Otherwise it serves
    requests until interrupted.
    """
    import getopt

    def usage():
        print ('usage: %s [-h host] [-p port] [-n name] module.class' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'h:p:n:')
    except getopt.GetoptError:
        return usage()
    host = ''
    port = 8080
    name = 'WebApp'
    for (k, v) in opts:
        if k == '-h':
            host = v
        elif k == '-p':
            # A non-numeric port is a usage error, not a crash.
            try:
                port = int(v)
            except ValueError:
                return usage()
        elif k == '-n':
            name = v
    if not args:
        return usage()
    path = args.pop(0)
    module = _load_source('app', path)
    WebAppHandler.APP_CLASS = getattr(module, name)
    print ('Listening %s:%d...' % (host,port))
    httpd = HTTPServer((host,port), WebAppHandler)
    httpd.serve_forever()
    return


def _load_source(name, path):
    """Load the Python source file at *path* as a module called *name*.

    The file does not need a '.py' extension. importlib is used where it is
    available; interpreters without importlib.machinery fall back to imp.
    """
    try:
        import importlib.machinery
        import importlib.util
    except ImportError:
        import imp
        return imp.load_source(name, path)
    # An explicit SourceFileLoader is needed because the file extension may
    # not be one the default import machinery recognises.
    loader = importlib.machinery.SourceFileLoader(name, path)
    spec = importlib.util.spec_from_loader(name, loader)
    module = importlib.util.module_from_spec(spec)
    # imp.load_source() registered the module in sys.modules; keep doing so.
    sys.modules[name] = module
    loader.exec_module(module)
    return module


if __name__ == '__main__': sys.exit(main(sys.argv))