Change 'Text extraction is not allowed' error to warning (#453)
* Changed error to warning for 'Text extraction is not allowed'
* updated changelog
* fix lint
* made changes suggested in review
* Update CHANGELOG.md
* Add regression test for failing pdf
* Reduce line length to <80
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
836d312982
commit
6a9269b432
|
@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
### Changed
|
||||
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
||||
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
|
||||
|
||||
## [20200517]
|
||||
|
||||
|
|
|
@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
|||
page_numbers,
|
||||
maxpages=maxpages,
|
||||
password=password,
|
||||
caching=not disable_caching,
|
||||
check_extractable=True):
|
||||
caching=not disable_caching):
|
||||
page.rotate = (page.rotate + rotation) % 360
|
||||
interpreter.process_page(page)
|
||||
|
||||
|
@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
maxpages=maxpages,
|
||||
password=password,
|
||||
caching=caching,
|
||||
check_extractable=True,
|
||||
):
|
||||
interpreter.process_page(page)
|
||||
|
||||
|
|
|
@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
|
|||
pass
|
||||
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
class PDFTextExtractionNotAllowedWarning(UserWarning):
|
||||
pass
|
||||
|
||||
|
||||
class PDFTextExtractionNotAllowedError(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import logging
|
||||
import warnings
|
||||
from . import settings
|
||||
from .psparser import LIT
|
||||
from .pdftypes import PDFObjectNotFound
|
||||
|
@ -8,7 +9,8 @@ from .pdftypes import list_value
|
|||
from .pdftypes import dict_value
|
||||
from .pdfparser import PDFParser
|
||||
from .pdfdocument import PDFDocument
|
||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
||||
from .pdfdocument import PDFTextExtractionNotAllowedWarning
|
||||
from .pdfdocument import PDFTextExtractionNotAllowedError
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
@ -120,15 +122,23 @@ class PDFPage:
|
|||
@classmethod
|
||||
def get_pages(cls, fp,
|
||||
pagenos=None, maxpages=0, password='',
|
||||
caching=True, check_extractable=True):
|
||||
caching=True, check_extractable=False):
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
# Create a PDF document object that stores the document structure.
|
||||
doc = PDFDocument(parser, password=password, caching=caching)
|
||||
# Check if the document allows text extraction. If not, abort.
|
||||
if check_extractable and not doc.is_extractable:
|
||||
error_msg = 'Text extraction is not allowed: %r' % fp
|
||||
raise PDFTextExtractionNotAllowed(error_msg)
|
||||
# Check if the document allows text extraction.
|
||||
# If not, raise if check_extractable is set; otherwise warn and proceed.
|
||||
if not doc.is_extractable:
|
||||
if check_extractable:
|
||||
error_msg = 'Text extraction is not allowed: %r' % fp
|
||||
raise PDFTextExtractionNotAllowedError(error_msg)
|
||||
else:
|
||||
warning_msg = 'The PDF %r contains a metadata field '\
|
||||
'indicating that it should not allow ' \
|
||||
'text extraction. Ignoring this field ' \
|
||||
'and proceeding.' % fp
|
||||
warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(cls.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos):
|
||||
|
|
Binary file not shown.
|
@ -60,12 +60,17 @@ class TestPdf2Txt():
|
|||
def test_contrib_2b(self):
|
||||
run('contrib/2b.pdf', '-A -t xml')
|
||||
|
||||
def test_contrib_issue_350(self):
|
||||
"""Regression test for
|
||||
https://github.com/pdfminer/pdfminer.six/issues/350"""
|
||||
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
||||
|
||||
def test_scancode_patchelf(self):
|
||||
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
|
||||
run('scancode/patchelf.pdf')
|
||||
|
||||
def test_contrib_hash_two_complement(self):
|
||||
"""Check that unsigned integer is added correctly to encryption hash.
|
||||
"""Check that unsigned integer is added correctly to encryption hash.
|
||||
|
||||
See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||
"""
|
||||
|
|
Loading…
Reference in New Issue