From d88d6020a25c800391789e3a136a802f8fc2d2a0 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sat, 26 Oct 2019 19:16:37 +0200 Subject: [PATCH] Remove webapp and other (un)helpful application references: django, cgi, and pyinstaller. (#320) Fixes #314 Fixes #105 --- CHANGELOG.md | 3 + docs/index.html | 2 +- pdfminer/settings.py | 7 -- tools/pdf2html.cgi | 215 ------------------------------------------- tools/pdf2txt.spec | 30 ------ tools/pdfdiff.spec | 29 ------ tools/runapp.py | 114 ----------------------- 7 files changed, 4 insertions(+), 396 deletions(-) delete mode 100755 tools/pdf2html.cgi delete mode 100644 tools/pdf2txt.spec delete mode 100644 tools/pdfdiff.spec delete mode 100755 tools/runapp.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d91d035..26bdd5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318)) +### Removed +- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314)) + ## [20191020] - 2019-10-20 ### Deprecated diff --git a/docs/index.html b/docs/index.html index 8037bf8..6b6857c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -64,7 +64,7 @@ PDF parser that can be used for other purposes than text analysis.
  • CJK languages and vertical writing scripts support.
  • Various font types (Type1, TrueType, Type3, and CID) support.
  • Basic encryption (RC4) support. -
  • PDF to HTML conversion (with a sample converter web app). +
  • PDF to HTML conversion.
  • Outline (TOC) extraction.
  • Tagged contents extraction.
  • Reconstruct the original layout by grouping text chunks. diff --git a/pdfminer/settings.py b/pdfminer/settings.py index 2dd99c0..810077a 100644 --- a/pdfminer/settings.py +++ b/pdfminer/settings.py @@ -1,8 +1 @@ STRICT = False - -try: - from django.conf import settings - STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', STRICT) -except Exception: - # in case it's not a django project - pass diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi deleted file mode 100755 index e2ea964..0000000 --- a/tools/pdf2html.cgi +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -O -# -# pdf2html.cgi - Gateway script for converting PDF into HTML. -# -# Security consideration for public access: -# -# Limit the process size and/or maximum cpu time. -# The process should be chrooted. -# The user should be imposed quota. -# -# How to Setup: -# $ mkdir $CGIDIR -# $ mkdir $CGIDIR/var -# $ python setup.py install_lib --install-dir=$CGIDIR -# $ cp pdfminer/tools/pdf2html.cgi $CGIDIR -# - -import sys, os, os.path, re, time -import cgi, logging, traceback, random -# comment out at this at runtime. -#import cgitb; cgitb.enable() -import pdfminer -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfminer.converter import HTMLConverter, TextConverter -from pdfminer.layout import LAParams - -import six #Python 2+3 compatibility - -# quote HTML metacharacters -def q(x): - return x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') - -# encode parameters as a URL -Q = re.compile(r'[^a-zA-Z0-9_.-=]') -def url(base, **kw): - r = [] - for (k,v) in six.iteritems(kw): - v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0]) - r.append('%s=%s' % (k, v)) - return base+'&'.join(r) - - -## convert -## -class FileSizeExceeded(ValueError): pass -def convert(infp, outfp, path, codec='utf-8', - maxpages=0, maxfilesize=0, pagenos=None, - html=True): - # save the input file. - src = open(path, 'wb') - nbytes = 0 - while 1: - data = infp.read(4096) - nbytes += len(data) - if maxfilesize and maxfilesize < nbytes: - raise FileSizeExceeded(maxfilesize) - if not data: break - src.write(data) - src.close() - infp.close() - # perform conversion and - # send the results over the network. - rsrcmgr = PDFResourceManager() - laparams = LAParams() - if html: - device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, - layoutmode='exact') - else: - device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) - fp = open(path, 'rb') - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages): - interpreter.process_page(page) - fp.close() - device.close() - return - - -## WebApp -## -class WebApp(object): - - TITLE = 'pdf2html demo' - MAXFILESIZE = 10000000 # set to zero if unlimited. - MAXPAGES = 100 # set to zero if unlimited. - - def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ, - codec='utf-8', apppath='/'): - self.infp = infp - self.outfp = outfp - self.environ = environ - self.codec = codec - self.apppath = apppath - self.remote_addr = self.environ.get('REMOTE_ADDR') - self.path_info = self.environ.get('PATH_INFO') - self.method = self.environ.get('REQUEST_METHOD', 'GET').upper() - self.server = self.environ.get('SERVER_SOFTWARE', '') - self.tmpdir = self.environ.get('TEMP', './var/') - self.content_type = 'text/html; charset=%s' % codec - self.logger = logging.getLogger() - return - - def put(self, *args): - for x in args: - if isinstance(x, str): - self.outfp.write(x) - elif isinstance(x, unicode): - self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace')) - return - - def response_200(self): - if self.server.startswith('cgi-httpd'): - # required for cgi-httpd - self.outfp.write('HTTP/1.0 200 OK\r\n') - self.outfp.write('Content-type: %s\r\n' % self.content_type) - self.outfp.write('Connection: close\r\n\r\n') - return - - def response_404(self): - if self.server.startswith('cgi-httpd'): - # required for cgi-httpd - self.outfp.write('HTTP/1.0 404 Not Found\r\n') - self.outfp.write('Content-type: text/html\r\n') - self.outfp.write('Connection: close\r\n\r\n') - self.outfp.write('page does not exist\n') - return - - def response_301(self, url): - if self.server.startswith('cgi-httpd'): - # required for cgi-httpd - self.outfp.write('HTTP/1.0 301 Moved\r\n') - self.outfp.write('Location: %s\r\n\r\n' % url) - return - - def coverpage(self): - self.put( - '%s\n' % q(self.TITLE), - '

    %s


    \n' % q(self.TITLE), - '
    \n' % q(self.apppath), - '

    Upload PDF File: \n', - '  Page numbers (comma-separated):\n', - '\n', - '

    (Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES, - 'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE, - '

    \n', - '\n', - '\n', - '


    \n', - '

    Powered by PDFMiner-%s\n' % pdfminer.__version__, - '\n', - ) - return - - def setup(self): - self.run = self.response_404 - status = 404 - if not os.path.isdir(self.tmpdir): - self.logger.error('no tmpdir') - status = 304 - elif self.path_info == self.apppath: - self.run = self.convert - status = 200 - return status - - def convert(self): - form = cgi.FieldStorage(fp=self.infp, environ=self.environ) - if (self.method != 'POST' or - 'c' not in form or - 'f' not in form): - self.response_200() - self.coverpage() - return - item = form['f'] - if not (item.file and item.filename): - self.response_200() - self.coverpage() - return - cmd = form.getvalue('c') - html = (cmd == 'Convert to HTML') - pagenos = [] - if 'p' in form: - for m in re.finditer(r'\d+', form.getvalue('p')): - try: - pagenos.append(int(m.group(0))) - except ValueError: - pass - h = abs(hash((random.random(), self.remote_addr, item.filename))) - tmppath = os.path.join(self.tmpdir, '%08x%08x.pdf' % (time.time(), h)) - self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r' % - (self.remote_addr, item.filename, pagenos, tmppath)) - try: - if not html: - self.content_type = 'text/plain; charset=%s' % self.codec - self.response_200() - try: - convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec, - maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) - except Exception as e: - self.put('

    Sorry, an error has occurred: %s' % q(repr(e))) - self.logger.error('convert: %r: path=%r: %s' % (e, traceback.format_exc())) - finally: - try: - os.remove(tmppath) - except: - pass - return - - -# main -if __name__ == '__main__': - app = WebApp() - app.setup() - sys.exit(app.run()) diff --git a/tools/pdf2txt.spec b/tools/pdf2txt.spec deleted file mode 100644 index c0073e6..0000000 --- a/tools/pdf2txt.spec +++ /dev/null @@ -1,30 +0,0 @@ -# -*- mode: python -*- - -block_cipher = None - - -a = Analysis(['pdf2txt.py'], - pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], - binaries=[], - datas=[], - hiddenimports=[], - hookspath=[], - runtime_hooks=[], - excludes=['django','matplotlib','PIL','numpy','qt5'], - win_no_prefer_redirects=False, - win_private_assemblies=False, - cipher=block_cipher) - -pyz = PYZ(a.pure, a.zipped_data, - cipher=block_cipher) -exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='pdf2txt', - debug=False, - strip=False, - upx=True, - runtime_tmpdir=None, - console=True ) diff --git a/tools/pdfdiff.spec b/tools/pdfdiff.spec deleted file mode 100644 index 6872b32..0000000 --- a/tools/pdfdiff.spec +++ /dev/null @@ -1,29 +0,0 @@ -# -*- mode: python -*- - -block_cipher = None - - -a = Analysis(['pdfdiff.py'], - pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], - binaries=[], - datas=[], - hiddenimports=[], - hookspath=[], - runtime_hooks=[], - excludes=['django','matplotlib','PIL','numpy','qt5'], - win_no_prefer_redirects=False, - win_private_assemblies=False, - cipher=block_cipher) -pyz = PYZ(a.pure, a.zipped_data, - cipher=block_cipher) -exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='pdfdiff', - debug=False, - strip=False, - upx=True, - runtime_tmpdir=None, - console=True ) diff --git a/tools/runapp.py b/tools/runapp.py deleted file mode 100755 index 6b953be..0000000 --- a/tools/runapp.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python - -## -## WebApp class runner -## -## usage: -## $ runapp.py pdf2html.cgi -## - -import sys -import urllib -from six.moves.http_client import responses -from six.moves.BaseHTTPServer import HTTPServer -from six.moves.SimpleHTTPServer import SimpleHTTPRequestHandler - -## WebAppHandler -## -class WebAppHandler(SimpleHTTPRequestHandler): - - APP_CLASS = None - - def do_POST(self): - return self.run_cgi() - - def send_head(self): - return self.run_cgi() - - def run_cgi(self): - rest = self.path - i = rest.rfind('?') - if i >= 0: - rest, query = rest[:i], rest[i+1:] - else: - query = '' - i = rest.find('/') - if i >= 0: - script, rest = rest[:i], rest[i:] - else: - script, rest = rest, '' - scriptname = '/' + script - scriptfile = self.translate_path(scriptname) - env = {} - env['SERVER_SOFTWARE'] = self.version_string() - env['SERVER_NAME'] = self.server.server_name - env['GATEWAY_INTERFACE'] = 'CGI/1.1' - env['SERVER_PROTOCOL'] = self.protocol_version - env['SERVER_PORT'] = str(self.server.server_port) - env['REQUEST_METHOD'] = self.command - uqrest = urllib.unquote(rest) - env['PATH_INFO'] = uqrest - env['PATH_TRANSLATED'] = self.translate_path(uqrest) - env['SCRIPT_NAME'] = scriptname - if query: - env['QUERY_STRING'] = query - host = self.address_string() - if host != self.client_address[0]: - env['REMOTE_HOST'] = host - env['REMOTE_ADDR'] = self.client_address[0] - if self.headers.typeheader is None: - env['CONTENT_TYPE'] = self.headers.type - else: - env['CONTENT_TYPE'] = self.headers.typeheader - length = self.headers.getheader('content-length') - if length: - env['CONTENT_LENGTH'] = length - accept = [] - for line in self.headers.getallmatchingheaders('accept'): - if line[:1] in "\t\n\r ": - accept.append(line.strip()) - else: - accept = accept + line[7:].split(',') - env['HTTP_ACCEPT'] = ','.join(accept) - ua = self.headers.getheader('user-agent') - if ua: - env['HTTP_USER_AGENT'] = ua - co = filter(None, self.headers.getheaders('cookie')) - if co: - env['HTTP_COOKIE'] = ', '.join(co) - for k in ('QUERY_STRING', 'REMOTE_HOST', 'CONTENT_LENGTH', - 'HTTP_USER_AGENT', 'HTTP_COOKIE'): - env.setdefault(k, "") - app = self.APP_CLASS(infp=self.rfile, outfp=self.wfile, environ=env) - status = app.setup() - self.send_response(status, responses[status]) - app.run() - return - -# main -def main(argv): - import getopt, imp - def usage(): - print ('usage: %s [-h host] [-p port] [-n name] module.class' % argv[0]) - return 100 - try: - (opts, args) = getopt.getopt(argv[1:], 'h:p:n:') - except getopt.GetoptError: - return usage() - host = '' - port = 8080 - name = 'WebApp' - for (k, v) in opts: - if k == '-h': host = v - elif k == '-p': port = int(v) - elif k == '-n': name = v - if not args: return usage() - path = args.pop(0) - module = imp.load_source('app', path) - WebAppHandler.APP_CLASS = getattr(module, name) - print ('Listening %s:%d...' % (host,port)) - httpd = HTTPServer((host,port), WebAppHandler) - httpd.serve_forever() - return - -if __name__ == '__main__': sys.exit(main(sys.argv))