Merge branch 'develop' of github.com:pdfminer/pdfminer.six into develop

2019-11-06 21:51:41 +01:00 · 2019-11-06 21:51:41 +01:00 · 027bb62943
parent 548b933a84 ed1b09c6f2
commit 027bb62943
5 changed files with 49 additions and 17 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ## [Unreleased]
 ### Deprecated
 - The argument `_py2_no_more_posargs`, because support for Python 2 will be
 removed in January 2020 ([#328](https://github.com/pdfminer/pdfminer.six/pull/328) and
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 ### Added
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -4,6 +4,7 @@ Functions that encapsulate "usual" use-cases for pdfminer, for use making
 bundled scripts and for using pdfminer as a module for routine tasks.
 """
 import logging
 import six
 import sys
@ -18,11 +19,10 @@ from .image import ImageWriter
 def extract_text_to_fp(inf, outfp,
                    _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                    output_type='text', codec='utf-8', laparams = None,
                    maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
                    layoutmode='normal', output_dir=None, strip_control=False,
-                    debug=False, disable_caching=False, **other):
+                    debug=False, disable_caching=False, **kwargs):
    """
    Parses text from inf-file and writes to outfp file-like object.
    Takes loads of optional arguments but the defaults are somewhat sane.
@ -44,6 +44,16 @@ def extract_text_to_fp(inf, outfp,
    debug: Output more logging data
    disable_caching: Does what it says on the tin
    """
    if '_py2_no_more_posargs' in kwargs is not None:
        raise DeprecationWarning(
            'The `_py2_no_more_posargs will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer .six/issues/194')
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)
    if six.PY2 and sys.stdin.encoding:
        password = password.decode(sys.stdin.encoding)
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -15,6 +15,8 @@ from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
 from pdfminer.psparser import PSKeyword, PSLiteral, LIT
 from pdfminer.utils import isnumber
 logging.basicConfig()
 ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -7,23 +7,30 @@ import argparse
 import logging
 import six
 import sys
 import pdfminer.settings
 pdfminer.settings.STRICT = False
 import pdfminer.high_level
 import pdfminer.layout
 from pdfminer.image import ImageWriter
 logging.basicConfig()
 def extract_text(files=[], outfile='-',
            _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
            no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
            word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
            output_type='text', codec='utf-8', strip_control=False,
            maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
            layoutmode='normal', output_dir=None, debug=False,
-            disable_caching=False, **other):
+            disable_caching=False, **kwargs):
-    if _py2_no_more_posargs is not None:
+    if '_py2_no_more_posargs' in kwargs is not None:
-        raise ValueError("Too many positional arguments passed.")
+        raise DeprecationWarning(
            'The `_py2_no_more_posargs will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer .six/issues/194')
    if not files:
        raise ValueError("Must provide files to work upon!")
--- a/tools/pdfdiff.py
+++ b/tools/pdfdiff.py
@ -11,28 +11,34 @@ pdfminer.settings.STRICT = False
 import pdfminer.high_level
 import pdfminer.layout
-def compare(file1,file2,**args):
+logging.basicConfig()
    if args.get('_py2_no_more_posargs',None) is not None:
        raise ValueError("Too many positional arguments passed.")
 def compare(file1, file2, **kwargs):
    if '_py2_no_more_posargs' in kwargs is not None:
        raise DeprecationWarning(
            'The `_py2_no_more_posargs will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer .six/issues/194')
    # If any LAParams group arguments were passed, create an LAParams object and
    # populate with given args. Otherwise, set it to None.
-    if args.get('laparams',None) is None:
+    if kwargs.get('laparams', None) is None:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
-            paramv = args.get(param, None)
+            paramv = kwargs.get(param, None)
            if paramv is not None:
                laparams[param]=paramv
-        args['laparams']=laparams
+        kwargs['laparams']=laparams
    s1=six.StringIO()
    with open(file1, "rb") as fp:
-        pdfminer.high_level.extract_text_to_fp(fp,s1, **args)
+        pdfminer.high_level.extract_text_to_fp(fp, s1, **kwargs)
    s2=six.StringIO()
    with open(file2, "rb") as fp:
-        pdfminer.high_level.extract_text_to_fp(fp,s2, **args)
+        pdfminer.high_level.extract_text_to_fp(fp, s2, **kwargs)
    import difflib
    s1.seek(0)
@ -41,12 +47,12 @@ def compare(file1,file2,**args):
    import os.path
    try:
-        extension = os.path.splitext(args['outfile'])[1][1:4]
+        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower()=='htm':
            return difflib.HtmlDiff().make_file(s1,s2)
    except KeyError:
        pass
-    return difflib.unified_diff(s1,s2,n=args['context_lines'])
+    return difflib.unified_diff(s1, s2, n=kwargs['context_lines'])
 # main
@ -85,10 +91,12 @@ def main(args=None):
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    A = P.parse_args(args=args)
    if A.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    if A.page_numbers:
        A.page_numbers = set([x-1 for x in A.page_numbers])
    if A.pagenos: