Change 'Text extraction is not allowed' error to a warning (#453)

* Changed error to warning for 'Text extraction is not allowed'
* updated changelog
* fix lint
* made changes suggested in review
* Update CHANGELOG.md
* Add regression test for failing pdf
* Reduce line length to <80

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2020-07-11 19:34:11 +05:30 · 2020-07-11 19:34:11 +05:30 · 6a9269b432
parent 836d312982
commit 6a9269b432
6 changed files with 30 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Changed
 - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
+- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
  
 ## [20200517]

--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
-                                  caching=not disable_caching,
-                                  check_extractable=True):
+                                  caching=not disable_caching):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                maxpages=maxpages,
                password=password,
                caching=caching,
-                check_extractable=True,
        ):
            interpreter.process_page(page)

--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
    pass


-class PDFTextExtractionNotAllowed(PDFEncryptionError):
+class PDFTextExtractionNotAllowedWarning(UserWarning):
+    pass
+
+
+class PDFTextExtractionNotAllowedError(PDFEncryptionError):
    pass


--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -1,4 +1,5 @@
 import logging
+import warnings
 from . import settings
 from .psparser import LIT
 from .pdftypes import PDFObjectNotFound
@ -8,7 +9,8 @@ from .pdftypes import list_value
 from .pdftypes import dict_value
 from .pdfparser import PDFParser
 from .pdfdocument import PDFDocument
-from .pdfdocument import PDFTextExtractionNotAllowed
+from .pdfdocument import PDFTextExtractionNotAllowedWarning
+from .pdfdocument import PDFTextExtractionNotAllowedError


 log = logging.getLogger(__name__)
@ -120,15 +122,23 @@ class PDFPage:
    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
-                  caching=True, check_extractable=True):
+                  caching=True, check_extractable=False):
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
-        # Check if the document allows text extraction. If not, abort.
-        if check_extractable and not doc.is_extractable:
-            error_msg = 'Text extraction is not allowed: %r' % fp
-            raise PDFTextExtractionNotAllowed(error_msg)
+        # Check if the document allows text extraction.
+        # If not, warn the user and proceed.
+        if not doc.is_extractable:
+            if check_extractable:
+                error_msg = 'Text extraction is not allowed: %r' % fp
+                raise PDFTextExtractionNotAllowedError(error_msg)
+            else:
+                warning_msg = 'The PDF %r contains a metadata field '\
+                            'indicating that it should not allow '   \
+                            'text extraction. Ignoring this field '  \
+                            'and proceeding.' % fp
+                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
--- a/samples/contrib/issue-00352-asw-oct96-p41.pdf
+++ b/samples/contrib/issue-00352-asw-oct96-p41.pdf
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -60,12 +60,17 @@ class TestPdf2Txt():
    def test_contrib_2b(self):
        run('contrib/2b.pdf', '-A -t xml')

+    def test_contrib_issue_350(self):
+        """Regression test for
+        https://github.com/pdfminer/pdfminer.six/issues/350"""
+        run('contrib/issue-00352-asw-oct96-p41.pdf')
+
    def test_scancode_patchelf(self):
        """Regression test for # https://github.com/euske/pdfminer/issues/96"""
        run('scancode/patchelf.pdf')

    def test_contrib_hash_two_complement(self):
-        """Check that unsigned integer is added correctly to encryption hash.
+        """Check that unsigned integer is added correctly to encryption hash.

        See https://github.com/pdfminer/pdfminer.six/issues/186
        """