diff --git a/CHANGELOG.md b/CHANGELOG.md index e5483d6..97ccbbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431)) - +- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350)) + ## [20200517] ### Added diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 1686c46..644c6ca 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', page_numbers, maxpages=maxpages, password=password, - caching=not disable_caching, - check_extractable=True): + caching=not disable_caching): page.rotate = (page.rotate + rotation) % 360 interpreter.process_page(page) @@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, maxpages=maxpages, password=password, caching=caching, - check_extractable=True, ): interpreter.process_page(page) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 71852ee..760073c 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass -class PDFTextExtractionNotAllowed(PDFEncryptionError): +class PDFTextExtractionNotAllowedWarning(UserWarning): + pass + + +class PDFTextExtractionNotAllowedError(PDFEncryptionError): pass diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 7a9b622..dc0a424 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,4 +1,5 @@ import logging +import warnings from . 
import settings from .psparser import LIT from .pdftypes import PDFObjectNotFound @@ -8,7 +9,8 @@ from .pdftypes import list_value from .pdftypes import dict_value from .pdfparser import PDFParser from .pdfdocument import PDFDocument -from .pdfdocument import PDFTextExtractionNotAllowed +from .pdfdocument import PDFTextExtractionNotAllowedWarning +from .pdfdocument import PDFTextExtractionNotAllowedError log = logging.getLogger(__name__) @@ -120,15 +122,23 @@ class PDFPage: @classmethod def get_pages(cls, fp, pagenos=None, maxpages=0, password='', - caching=True, check_extractable=True): + caching=True, check_extractable=False): # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. doc = PDFDocument(parser, password=password, caching=caching) - # Check if the document allows text extraction. If not, abort. - if check_extractable and not doc.is_extractable: - error_msg = 'Text extraction is not allowed: %r' % fp - raise PDFTextExtractionNotAllowed(error_msg) + # Check if the document allows text extraction. + # If not, warn the user and proceed. + if not doc.is_extractable: + if check_extractable: + error_msg = 'Text extraction is not allowed: %r' % fp + raise PDFTextExtractionNotAllowedError(error_msg) + else: + warning_msg = 'The PDF %r contains a metadata field '\ + 'indicating that it should not allow ' \ + 'text extraction. Ignoring this field ' \ + 'and proceeding.' % fp + warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning) # Process each page contained in the document. 
for (pageno, page) in enumerate(cls.create_pages(doc)): if pagenos and (pageno not in pagenos): diff --git a/samples/contrib/issue-00352-asw-oct96-p41.pdf b/samples/contrib/issue-00352-asw-oct96-p41.pdf new file mode 100644 index 0000000..f61d383 Binary files /dev/null and b/samples/contrib/issue-00352-asw-oct96-p41.pdf differ diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 459e19b..bb9545d 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -60,12 +60,17 @@ class TestPdf2Txt(): def test_contrib_2b(self): run('contrib/2b.pdf', '-A -t xml') + def test_contrib_issue_350(self): + """Regression test for + https://github.com/pdfminer/pdfminer.six/issues/350""" + run('contrib/issue-00352-asw-oct96-p41.pdf') + def test_scancode_patchelf(self): """Regression test for # https://github.com/euske/pdfminer/issues/96""" run('scancode/patchelf.pdf') def test_contrib_hash_two_complement(self): - """Check that unsigned integer is added correctly to encryption hash. + """Check that unsigned integer is added correctly to encryption hash. See https://github.com/pdfminer/pdfminer.six/issues/186 """