Moved STRICT into pdfminer/settings.py and added ENFORCE_CHECK_EXTRACTABLE

pull/8/head
Chris Hager 2015-11-01 22:24:30 +01:00
parent a46ea52e20
commit b686dd0139
9 changed files with 26 additions and 17 deletions

View File

@ -19,7 +19,7 @@ from .psparser import PSEOF
from .psparser import literal_name
from .psparser import LIT
from .psparser import KWD
from .psparser import STRICT
from .settings import STRICT
from .pdftypes import PDFException
from .pdftypes import PDFTypeError
from .pdftypes import PDFStream

View File

@ -12,7 +12,7 @@ from .psparser import PSStackParser
from .psparser import PSEOF
from .psparser import LIT
from .psparser import KWD
from .psparser import STRICT
from .settings import STRICT
from .psparser import PSLiteral
from .psparser import literal_name
from .pdftypes import PDFException

View File

@ -12,7 +12,7 @@ from .psparser import keyword_name
from .psparser import PSStackParser
from .psparser import LIT
from .psparser import KWD
from .psparser import STRICT
from .settings import STRICT
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef

View File

@ -9,6 +9,7 @@ from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed
from .settings import ENFORCE_CHECK_EXTRACTABLE
import six # Python 2+3 compatibility
@ -120,6 +121,7 @@ class PDFPage(object):
# Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching)
# Check if the document allows text extraction. If not, abort.
if ENFORCE_CHECK_EXTRACTABLE:
if check_extractable and not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document.

View File

@ -5,7 +5,7 @@ from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import KWD
from .psparser import STRICT
from .settings import STRICT
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef

View File

@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode
from .psparser import PSException
from .psparser import PSObject
from .psparser import LIT
from .psparser import STRICT
from .settings import STRICT
from .utils import apply_png_predictor
from .utils import isnumber

View File

@ -5,12 +5,8 @@ import re
import logging
import six # Python 2+3 compatibility
try:
from django.conf import settings
except ImportError:
# in case it's not a django project
settings = None
from .settings import STRICT
def bytesindex(s,i,j=None):
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
@ -21,7 +17,6 @@ def bytesindex(s,i,j=None):
from .utils import choplist
STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True)
## PS Exceptions
##

9
pdfminer/settings.py Normal file
View File

@ -0,0 +1,9 @@
"""Package-wide switches for pdfminer.

STRICT
    Read from the Django setting ``PDF_MINER_IS_STRICT`` when Django is
    importable; defaults to True otherwise.
ENFORCE_CHECK_EXTRACTABLE
    When True, the "is text extraction allowed" check is enforced.

Other modules bind these values with ``from .settings import ...``, so a
caller that wants different values must assign them on this module before
those modules are imported.
"""
try:
    # ``django.conf`` exports the lazy settings object under the name
    # ``settings``; alias it so it cannot be confused with this module.
    from django.conf import settings as django_settings
except ImportError:
    # in case it's not a django project
    django_settings = None

# Get defaults from django settings.
# getattr() on None simply returns the default, so no extra guard is needed
# when Django is absent.
# NOTE(review): if Django is installed but its settings are not configured,
# this attribute access may raise ImproperlyConfigured -- confirm whether
# that case needs to be tolerated.
STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True)
ENFORCE_CHECK_EXTRACTABLE = True

View File

@ -5,6 +5,9 @@ Converts PDF text content (though not images containing text) to plain text, htm
import sys
import logging
import six
import pdfminer.settings
# Relax the library defaults for this script. The overrides are placed before
# the remaining pdfminer imports because those modules copy the values with
# `from .settings import STRICT` / `ENFORCE_CHECK_EXTRACTABLE` at import time.
# NOTE(review): this only takes effect if importing `pdfminer.settings` does
# not already pull in those modules via the package __init__ -- confirm.
pdfminer.settings.STRICT = False
pdfminer.settings.ENFORCE_CHECK_EXTRACTABLE = False
import pdfminer.high_level
import pdfminer.layout