Change 'Text extraction is not allowed' error to warning (#453)

* Changed error to warning for 'Text extraction is not allowed'

* updated changelog

* fix lint

* made changes suggested in review

* Update CHANGELOG.md

* Add regression test for failing pdf

* Reduce line length to <80

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
madhurcodes 2020-07-11 19:34:11 +05:30 committed by GitHub
parent 836d312982
commit 6a9269b432
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 30 additions and 12 deletions

View File

@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed ### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431)) - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
## [20200517] ## [20200517]

View File

@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
page_numbers, page_numbers,
maxpages=maxpages, maxpages=maxpages,
password=password, password=password,
caching=not disable_caching, caching=not disable_caching):
check_extractable=True):
page.rotate = (page.rotate + rotation) % 360 page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page) interpreter.process_page(page)
@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
maxpages=maxpages, maxpages=maxpages,
password=password, password=password,
caching=caching, caching=caching,
check_extractable=True,
): ):
interpreter.process_page(page) interpreter.process_page(page)

View File

@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
pass pass
class PDFTextExtractionNotAllowed(PDFEncryptionError): class PDFTextExtractionNotAllowedWarning(UserWarning):
pass
class PDFTextExtractionNotAllowedError(PDFEncryptionError):
pass pass

View File

@ -1,4 +1,5 @@
import logging import logging
import warnings
from . import settings from . import settings
from .psparser import LIT from .psparser import LIT
from .pdftypes import PDFObjectNotFound from .pdftypes import PDFObjectNotFound
@ -8,7 +9,8 @@ from .pdftypes import list_value
from .pdftypes import dict_value from .pdftypes import dict_value
from .pdfparser import PDFParser from .pdfparser import PDFParser
from .pdfdocument import PDFDocument from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed from .pdfdocument import PDFTextExtractionNotAllowedWarning
from .pdfdocument import PDFTextExtractionNotAllowedError
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -120,15 +122,23 @@ class PDFPage:
@classmethod @classmethod
def get_pages(cls, fp, def get_pages(cls, fp,
pagenos=None, maxpages=0, password='', pagenos=None, maxpages=0, password='',
caching=True, check_extractable=True): caching=True, check_extractable=False):
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(fp) parser = PDFParser(fp)
# Create a PDF document object that stores the document structure. # Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching) doc = PDFDocument(parser, password=password, caching=caching)
# Check if the document allows text extraction. If not, abort. # Check if the document allows text extraction.
if check_extractable and not doc.is_extractable: # If not, warn the user and proceed.
if not doc.is_extractable:
if check_extractable:
error_msg = 'Text extraction is not allowed: %r' % fp error_msg = 'Text extraction is not allowed: %r' % fp
raise PDFTextExtractionNotAllowed(error_msg) raise PDFTextExtractionNotAllowedError(error_msg)
else:
warning_msg = 'The PDF %r contains a metadata field '\
'indicating that it should not allow ' \
'text extraction. Ignoring this field ' \
'and proceeding.' % fp
warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
# Process each page contained in the document. # Process each page contained in the document.
for (pageno, page) in enumerate(cls.create_pages(doc)): for (pageno, page) in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos): if pagenos and (pageno not in pagenos):

Binary file not shown.

View File

@ -60,12 +60,17 @@ class TestPdf2Txt():
def test_contrib_2b(self): def test_contrib_2b(self):
run('contrib/2b.pdf', '-A -t xml') run('contrib/2b.pdf', '-A -t xml')
def test_contrib_issue_350(self):
"""Regression test for
https://github.com/pdfminer/pdfminer.six/issues/350"""
run('contrib/issue-00352-asw-oct96-p41.pdf')
def test_scancode_patchelf(self): def test_scancode_patchelf(self):
"""Regression test for https://github.com/euske/pdfminer/issues/96""" """Regression test for https://github.com/euske/pdfminer/issues/96"""
run('scancode/patchelf.pdf') run('scancode/patchelf.pdf')
def test_contrib_hash_two_complement(self): def test_contrib_hash_two_complement(self):
"""Check that unsigned integer is added correctly to encryption hash. """Check that unsigned integer is added correctly to encryption hash.
See https://github.com/pdfminer/pdfminer.six/issues/186 See https://github.com/pdfminer/pdfminer.six/issues/186
""" """