Use logger.warn instead of warnings.warn if warning cannot be prevented by user (#673)

* Use logging.Logger.warning instead of warnings.warn in most cases, following the official Python guidance that warnings.warn is directed at _developers_, not users * (pdfdocument.py) remove declarations of PDFTextExtractionNotAllowedWarning, PDFNoValidXRefWarning * (pdfpage.py) Don't import warnings, don't use PDFTextExtractionNotAllowedWarning * (tools/dumppdf.py) Don't import warnings, don't use PDFNoValidXRefWarning * (tests/test_tools_dumppdf.py) Don't import warnings, check for logging.WARN rather than PDFNoValidXRefWarning * get name right * make flake8 happy * Keep warning classes such that this does not crash code when these warnings are explicitly ignored * Update changelog to include pr ref * Small textual change * Remove patch * No need for testing if the warning is actually raised. The tests in test_tools_dumppdf.py only check whether these PDFs are supported. * Use logger as name for logger * Add docs to legacy warnings * Use logging.Logger.warning for failed decompression * Add reference to docs describing when to use logger and warnings Co-authored-by: Henry S. Thompson <ht@home.hst.name> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2022-01-26 19:41:12 +00:00 · 2022-01-26 19:41:12 +00:00 · dc530f3a6f
parent c4ac514984
commit dc530f3a6f
6 changed files with 27 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
 - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525))
 - Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523))
 - Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
 - Dependency on typing-extensions introduced by [#661](https://github.com/pdfminer/pdfminer.six/pull/661) ([#677](https://github.com/pdfminer/pdfminer.six/pull/677))
 ## [20201018]
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -25,6 +25,10 @@ class PDFNoValidXRef(PDFSyntaxError):
 class PDFNoValidXRefWarning(SyntaxWarning):
    """Legacy warning for missing xref.
    Not used anymore because warnings.warn is replaced by logging.Logger.warning.
    """
    pass
@ -41,10 +45,18 @@ class PDFEncryptionError(PDFException):
 class PDFEncryptionWarning(UserWarning):
    """Legacy warning for failed decryption.
    Not used anymore because warnings.warn is replaced by logging.Logger.warning.
    """
    pass
 class PDFTextExtractionNotAllowedWarning(UserWarning):
    """Legacy warning for PDF that does not allow extraction.
    Not used anymore because warnings.warn is replaced by logging.Logger.warning.
    """
    pass
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -1,7 +1,6 @@
 import logging
 from pdfminer.utils import Rect
 from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
 import warnings
 from . import settings
 from .psparser import LIT
 from .pdftypes import PDFObjectNotFound
@ -11,7 +10,6 @@ from .pdftypes import list_value
 from .pdftypes import dict_value
 from .pdfparser import PDFParser
 from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
 from .pdfdocument import PDFTextExtractionNotAllowedWarning
 log = logging.getLogger(__name__)
@ -155,8 +153,9 @@ class PDFPage:
                warning_msg = 'The PDF %r contains a metadata field '\
                            'indicating that it should not allow '   \
                            'text extraction. Ignoring this field '  \
-                            'and proceeding.' % fp
+                            'and proceeding. Use the check_extractable ' \
-                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
+                            'if you want to raise an error in this case' % fp
                log.warning(warning_msg)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@ -1,5 +1,4 @@
 import zlib
 import warnings
 import logging
 import io
 import sys
@ -21,7 +20,7 @@ if TYPE_CHECKING:
    from .pdfdocument import PDFDocument
-log = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 LITERAL_CRYPT = LIT('Crypt')
@ -205,7 +204,7 @@ def dict_value(x: object) -> Dict[Any, Any]:
    x = resolve1(x)
    if not isinstance(x, dict):
        if settings.STRICT:
-            log.error('PDFTypeError : Dict required: %r', x)
+            logger.error('PDFTypeError : Dict required: %r', x)
            raise PDFTypeError('Dict required: %r' % x)
        return {}
    return x
@ -237,9 +236,7 @@ def decompress_corrupted(data):
    except zlib.error:
        # Let the error propagates if we're not yet in the CRC checksum
        if i < len(data) - 3:
-            # Import here to prevent circualr import
+            logger.warning("Data-loss while decompressing corrupted data")
            from .pdfdocument import PDFEncryptionWarning
            warnings.warn("Data-loss while decompressing corrupted data", PDFEncryptionWarning)
    return result_str
--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@ -1,8 +1,8 @@
-import warnings
+import unittest
 import logging
 from nose.tools import raises
 from helpers import absolute_sample_path
 from tempfilepath import TemporaryFilePath
 from pdfminer.pdfdocument import PDFNoValidXRefWarning
 from tools import dumppdf
@ -18,12 +18,9 @@ def run(filename, options=None):
        dumppdf.main(s.split(' ')[1:])
-class TestDumpPDF():
+class TestDumpPDF(unittest.TestCase):
    def test_simple1(self):
-        """dumppdf.py simple1.pdf raises a warning because it has no xref"""
+        run('simple1.pdf', '-t -a')
        with warnings.catch_warnings(record=True) as ws:
            run('simple1.pdf', '-t -a')
            assert any(w.category == PDFNoValidXRefWarning for w in ws)
    def test_simple2(self):
        run('simple2.pdf', '-t -a')
@ -32,10 +29,7 @@ class TestDumpPDF():
        run('jo.pdf', '-t -a')
    def test_simple3(self):
-        """dumppdf.py simple3.pdf raises a warning because it has no xref"""
+        run('simple3.pdf', '-t -a')
        with warnings.catch_warnings(record=True) as ws:
            run('simple3.pdf', '-t -a')
            assert any(w.category == PDFNoValidXRefWarning for w in ws)
    def test_2(self):
        run('nonfree/dmca.pdf', '-t -a')
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -6,12 +6,10 @@ import re
 import sys
 from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
    Union, cast
 import warnings
 from argparse import ArgumentParser
 import pdfminer
-from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback
    PDFNoValidXRefWarning
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -20,6 +18,7 @@ from pdfminer.psparser import PSKeyword, PSLiteral, LIT
 from pdfminer.utils import isnumber
 logging.basicConfig()
 logger = logging.getLogger(__name__)
 ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
@ -115,7 +114,7 @@ def dumptrailers(
        msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
              'you want to display the content of a fallback xref that ' \
              'contains all objects.'
-        warnings.warn(msg, PDFNoValidXRefWarning)
+        logger.warning(msg)
    return