Change Text extraction is not allowed error to warning (#453)
* Changed error to warning for 'Text extraction is not allowed'
* updated changelog
* fix lint
* made changes suggested in review
* Update CHANGELOG.md
* Add regression test for failing pdf
* Reduce line length to <80

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
836d312982
commit
6a9269b432
|
@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
||||||
|
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
|
||||||
|
|
||||||
## [20200517]
|
## [20200517]
|
||||||
|
|
||||||
|
|
|
@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
page_numbers,
|
page_numbers,
|
||||||
maxpages=maxpages,
|
maxpages=maxpages,
|
||||||
password=password,
|
password=password,
|
||||||
caching=not disable_caching,
|
caching=not disable_caching):
|
||||||
check_extractable=True):
|
|
||||||
page.rotate = (page.rotate + rotation) % 360
|
page.rotate = (page.rotate + rotation) % 360
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
|
||||||
|
@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
maxpages=maxpages,
|
maxpages=maxpages,
|
||||||
password=password,
|
password=password,
|
||||||
caching=caching,
|
caching=caching,
|
||||||
check_extractable=True,
|
|
||||||
):
|
):
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
class PDFTextExtractionNotAllowedWarning(UserWarning):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFTextExtractionNotAllowedError(PDFEncryptionError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
|
import warnings
|
||||||
from . import settings
|
from . import settings
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .pdftypes import PDFObjectNotFound
|
from .pdftypes import PDFObjectNotFound
|
||||||
|
@ -8,7 +9,8 @@ from .pdftypes import list_value
|
||||||
from .pdftypes import dict_value
|
from .pdftypes import dict_value
|
||||||
from .pdfparser import PDFParser
|
from .pdfparser import PDFParser
|
||||||
from .pdfdocument import PDFDocument
|
from .pdfdocument import PDFDocument
|
||||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
from .pdfdocument import PDFTextExtractionNotAllowedWarning
|
||||||
|
from .pdfdocument import PDFTextExtractionNotAllowedError
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
@ -120,15 +122,23 @@ class PDFPage:
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_pages(cls, fp,
|
def get_pages(cls, fp,
|
||||||
pagenos=None, maxpages=0, password='',
|
pagenos=None, maxpages=0, password='',
|
||||||
caching=True, check_extractable=True):
|
caching=True, check_extractable=False):
|
||||||
# Create a PDF parser object associated with the file object.
|
# Create a PDF parser object associated with the file object.
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
# Create a PDF document object that stores the document structure.
|
# Create a PDF document object that stores the document structure.
|
||||||
doc = PDFDocument(parser, password=password, caching=caching)
|
doc = PDFDocument(parser, password=password, caching=caching)
|
||||||
# Check if the document allows text extraction. If not, abort.
|
# Check if the document allows text extraction.
|
||||||
if check_extractable and not doc.is_extractable:
|
# If not, warn the user and proceed.
|
||||||
error_msg = 'Text extraction is not allowed: %r' % fp
|
if not doc.is_extractable:
|
||||||
raise PDFTextExtractionNotAllowed(error_msg)
|
if check_extractable:
|
||||||
|
error_msg = 'Text extraction is not allowed: %r' % fp
|
||||||
|
raise PDFTextExtractionNotAllowedError(error_msg)
|
||||||
|
else:
|
||||||
|
warning_msg = 'The PDF %r contains a metadata field '\
|
||||||
|
'indicating that it should not allow ' \
|
||||||
|
'text extraction. Ignoring this field ' \
|
||||||
|
'and proceeding.' % fp
|
||||||
|
warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
|
||||||
# Process each page contained in the document.
|
# Process each page contained in the document.
|
||||||
for (pageno, page) in enumerate(cls.create_pages(doc)):
|
for (pageno, page) in enumerate(cls.create_pages(doc)):
|
||||||
if pagenos and (pageno not in pagenos):
|
if pagenos and (pageno not in pagenos):
|
||||||
|
|
Binary file not shown.
|
@ -60,12 +60,17 @@ class TestPdf2Txt():
|
||||||
def test_contrib_2b(self):
|
def test_contrib_2b(self):
|
||||||
run('contrib/2b.pdf', '-A -t xml')
|
run('contrib/2b.pdf', '-A -t xml')
|
||||||
|
|
||||||
|
def test_contrib_issue_350(self):
|
||||||
|
"""Regression test for
|
||||||
|
https://github.com/pdfminer/pdfminer.six/issues/350"""
|
||||||
|
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
||||||
|
|
||||||
def test_scancode_patchelf(self):
|
def test_scancode_patchelf(self):
|
||||||
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
|
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
|
||||||
run('scancode/patchelf.pdf')
|
run('scancode/patchelf.pdf')
|
||||||
|
|
||||||
def test_contrib_hash_two_complement(self):
|
def test_contrib_hash_two_complement(self):
|
||||||
"""Check that unsigned integer is added correctly to encryption hash.
|
"""Check that unsigned integer is added correctly to encryption hash.et
|
||||||
|
|
||||||
See https://github.com/pdfminer/pdfminer.six/issues/186
|
See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue