Use logger.warning instead of warnings.warn if the warning cannot be prevented by the user (#673)

* Use logging.Logger.warning instead of warnings.warn in most cases, following
 the official Python guidance that warnings.warn is directed at _developers_,
 not users

 * (pdfdocument.py) remove declarations of PDFTextExtractionNotAllowedWarning,
			PDFNoValidXRefWarning

 * (pdfpage.py) Don't import warnings, don't use PDFTextExtractionNotAllowedWarning

 * (tools/dumppdf.py) Don't import warnings, don't use PDFNoValidXRefWarning

 * (tests/test_tools_dumppdf.py) Don't import warnings, check for logging.WARN rather
				  than PDFNoValidXRefWarning

* get name right

* make flake8 happy

* Keep the warning classes so that code that explicitly ignores these warnings does not crash

* Update changelog to include pr ref

* Small textual change

* Remove patch

* No need to test whether the warning is actually raised. The tests in test_tools_dumppdf.py only check whether these PDFs are supported.

* Use logger as name for logger

* Add docs to legacy warnings

* Use logging.Logger.warning for failed decompression

* Add reference to docs describing when to use logger and warnings

Co-authored-by: Henry S. Thompson <ht@home.hst.name>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/684/head^2
htInEdin 2022-01-26 19:41:12 +00:00 committed by GitHub
parent c4ac514984
commit dc530f3a6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 27 additions and 25 deletions

View File

@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
- Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525)) - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525))
- Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523)) - Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523))
- Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
- Dependency on typing-extensions introduced by [#661](https://github.com/pdfminer/pdfminer.six/pull/661) ([#677](https://github.com/pdfminer/pdfminer.six/pull/677)) - Dependency on typing-extensions introduced by [#661](https://github.com/pdfminer/pdfminer.six/pull/661) ([#677](https://github.com/pdfminer/pdfminer.six/pull/677))
## [20201018] ## [20201018]

View File

@ -25,6 +25,10 @@ class PDFNoValidXRef(PDFSyntaxError):
class PDFNoValidXRefWarning(SyntaxWarning): class PDFNoValidXRefWarning(SyntaxWarning):
"""Legacy warning for missing xref.
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass pass
@ -41,10 +45,18 @@ class PDFEncryptionError(PDFException):
class PDFEncryptionWarning(UserWarning): class PDFEncryptionWarning(UserWarning):
"""Legacy warning for failed decryption.
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass pass
class PDFTextExtractionNotAllowedWarning(UserWarning): class PDFTextExtractionNotAllowedWarning(UserWarning):
"""Legacy warning for PDF that does not allow extraction.
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass pass

View File

@ -1,7 +1,6 @@
import logging import logging
from pdfminer.utils import Rect from pdfminer.utils import Rect
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
import warnings
from . import settings from . import settings
from .psparser import LIT from .psparser import LIT
from .pdftypes import PDFObjectNotFound from .pdftypes import PDFObjectNotFound
@ -11,7 +10,6 @@ from .pdftypes import list_value
from .pdftypes import dict_value from .pdftypes import dict_value
from .pdfparser import PDFParser from .pdfparser import PDFParser
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFTextExtractionNotAllowedWarning
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -155,8 +153,9 @@ class PDFPage:
warning_msg = 'The PDF %r contains a metadata field '\ warning_msg = 'The PDF %r contains a metadata field '\
'indicating that it should not allow ' \ 'indicating that it should not allow ' \
'text extraction. Ignoring this field ' \ 'text extraction. Ignoring this field ' \
'and proceeding.' % fp 'and proceeding. Use the check_extractable ' \
warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning) 'if you want to raise an error in this case' % fp
log.warning(warning_msg)
# Process each page contained in the document. # Process each page contained in the document.
for (pageno, page) in enumerate(cls.create_pages(doc)): for (pageno, page) in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos): if pagenos and (pageno not in pagenos):

View File

@ -1,5 +1,4 @@
import zlib import zlib
import warnings
import logging import logging
import io import io
import sys import sys
@ -21,7 +20,7 @@ if TYPE_CHECKING:
from .pdfdocument import PDFDocument from .pdfdocument import PDFDocument
log = logging.getLogger(__name__) logger = logging.getLogger(__name__)
LITERAL_CRYPT = LIT('Crypt') LITERAL_CRYPT = LIT('Crypt')
@ -205,7 +204,7 @@ def dict_value(x: object) -> Dict[Any, Any]:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
if settings.STRICT: if settings.STRICT:
log.error('PDFTypeError : Dict required: %r', x) logger.error('PDFTypeError : Dict required: %r', x)
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError('Dict required: %r' % x)
return {} return {}
return x return x
@ -237,9 +236,7 @@ def decompress_corrupted(data):
except zlib.error: except zlib.error:
# Let the error propagates if we're not yet in the CRC checksum # Let the error propagates if we're not yet in the CRC checksum
if i < len(data) - 3: if i < len(data) - 3:
# Import here to prevent circualr import logger.warning("Data-loss while decompressing corrupted data")
from .pdfdocument import PDFEncryptionWarning
warnings.warn("Data-loss while decompressing corrupted data", PDFEncryptionWarning)
return result_str return result_str

View File

@ -1,8 +1,8 @@
import warnings import unittest
import logging
from nose.tools import raises from nose.tools import raises
from helpers import absolute_sample_path from helpers import absolute_sample_path
from tempfilepath import TemporaryFilePath from tempfilepath import TemporaryFilePath
from pdfminer.pdfdocument import PDFNoValidXRefWarning
from tools import dumppdf from tools import dumppdf
@ -18,12 +18,9 @@ def run(filename, options=None):
dumppdf.main(s.split(' ')[1:]) dumppdf.main(s.split(' ')[1:])
class TestDumpPDF(): class TestDumpPDF(unittest.TestCase):
def test_simple1(self): def test_simple1(self):
"""dumppdf.py simple1.pdf raises a warning because it has no xref""" run('simple1.pdf', '-t -a')
with warnings.catch_warnings(record=True) as ws:
run('simple1.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_simple2(self): def test_simple2(self):
run('simple2.pdf', '-t -a') run('simple2.pdf', '-t -a')
@ -32,10 +29,7 @@ class TestDumpPDF():
run('jo.pdf', '-t -a') run('jo.pdf', '-t -a')
def test_simple3(self): def test_simple3(self):
"""dumppdf.py simple3.pdf raises a warning because it has no xref""" run('simple3.pdf', '-t -a')
with warnings.catch_warnings(record=True) as ws:
run('simple3.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_2(self): def test_2(self):
run('nonfree/dmca.pdf', '-t -a') run('nonfree/dmca.pdf', '-t -a')

View File

@ -6,12 +6,10 @@ import re
import sys import sys
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \ from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
Union, cast Union, cast
import warnings
from argparse import ArgumentParser from argparse import ArgumentParser
import pdfminer import pdfminer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \ from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback
PDFNoValidXRefWarning
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -20,6 +18,7 @@ from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.utils import isnumber from pdfminer.utils import isnumber
logging.basicConfig() logging.basicConfig()
logger = logging.getLogger(__name__)
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
@ -115,7 +114,7 @@ def dumptrailers(
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \ msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
'you want to display the content of a fallback xref that ' \ 'you want to display the content of a fallback xref that ' \
'contains all objects.' 'contains all objects.'
warnings.warn(msg, PDFNoValidXRefWarning) logger.warning(msg)
return return