pdfminer/settings.py for STRICT and added ENFORCE_CHECK_EXTRACTABLE
parent
a46ea52e20
commit
b686dd0139
|
@ -19,7 +19,7 @@ from .psparser import PSEOF
|
|||
from .psparser import literal_name
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from .psparser import STRICT
|
||||
from .settings import STRICT
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFTypeError
|
||||
from .pdftypes import PDFStream
|
||||
|
|
|
@ -12,7 +12,7 @@ from .psparser import PSStackParser
|
|||
from .psparser import PSEOF
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from .psparser import STRICT
|
||||
from .settings import STRICT
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import literal_name
|
||||
from .pdftypes import PDFException
|
||||
|
|
|
@ -12,7 +12,7 @@ from .psparser import keyword_name
|
|||
from .psparser import PSStackParser
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from .psparser import STRICT
|
||||
from .settings import STRICT
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import PDFObjRef
|
||||
|
|
|
@ -9,6 +9,7 @@ from .pdftypes import dict_value
|
|||
from .pdfparser import PDFParser
|
||||
from .pdfdocument import PDFDocument
|
||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
||||
from .settings import ENFORCE_CHECK_EXTRACTABLE
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
|
@ -120,8 +121,9 @@ class PDFPage(object):
|
|||
# Create a PDF document object that stores the document structure.
|
||||
doc = PDFDocument(parser, password=password, caching=caching)
|
||||
# Check if the document allows text extraction. If not, abort.
|
||||
if check_extractable and not doc.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
if ENFORCE_CHECK_EXTRACTABLE:
|
||||
if check_extractable and not doc.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos):
|
||||
|
|
|
@ -5,7 +5,7 @@ from .psparser import PSStackParser
|
|||
from .psparser import PSSyntaxError
|
||||
from .psparser import PSEOF
|
||||
from .psparser import KWD
|
||||
from .psparser import STRICT
|
||||
from .settings import STRICT
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import PDFObjRef
|
||||
|
|
|
@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode
|
|||
from .psparser import PSException
|
||||
from .psparser import PSObject
|
||||
from .psparser import LIT
|
||||
from .psparser import STRICT
|
||||
from .settings import STRICT
|
||||
from .utils import apply_png_predictor
|
||||
from .utils import isnumber
|
||||
|
||||
|
|
|
@ -5,12 +5,8 @@ import re
|
|||
import logging
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
try:
|
||||
from django.conf import settings
|
||||
except ImportError:
|
||||
# in case it's not a django project
|
||||
settings = None
|
||||
|
||||
from .settings import STRICT
|
||||
|
||||
def bytesindex(s,i,j=None):
|
||||
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
|
||||
|
@ -21,7 +17,6 @@ def bytesindex(s,i,j=None):
|
|||
|
||||
from .utils import choplist
|
||||
|
||||
STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True)
|
||||
|
||||
## PS Exceptions
|
||||
##
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
# Package-wide switches for pdfminer.
#
# STRICT may be overridden from a Django project's settings through the
# ``PDF_MINER_IS_STRICT`` setting; outside Django it keeps its default.
# Scripts can also assign to these names on the module before importing the
# rest of the package.

_STRICT_DEFAULT = True

try:
    # ``django.conf`` exports ``settings``; alias it so the name cannot be
    # confused with this module.
    from django.conf import settings as django_settings
    from django.core.exceptions import ImproperlyConfigured
except ImportError:
    # in case it's not a django project
    django_settings = None
    STRICT = _STRICT_DEFAULT
else:
    # Get defaults from django settings
    try:
        STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT',
                         _STRICT_DEFAULT)
    except ImproperlyConfigured:
        # Django is installed but no settings module is configured (e.g. a
        # plain script run in an environment that happens to ship Django);
        # fall back to the default instead of failing at import time.
        STRICT = _STRICT_DEFAULT

# When true, the extractability check is honoured: a document that does not
# allow text extraction raises PDFTextExtractionNotAllowed when the caller
# asked for ``check_extractable``. When false, that check is skipped.
ENFORCE_CHECK_EXTRACTABLE = True
|
|
@ -5,6 +5,9 @@ Converts PDF text content (though not images containing text) to plain text, htm
|
|||
import sys
|
||||
import logging
|
||||
import six
|
||||
import pdfminer.settings
|
||||
pdfminer.settings.STRICT = False
|
||||
pdfminer.settings.ENFORCE_CHECK_EXTRACTABLE = False
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
|
||||
|
@ -24,7 +27,7 @@ def extract_text(files=[], outfile='-',
|
|||
|
||||
# If any LAParams group arguments were passed, create an LAParams object and
|
||||
# populate with given args. Otherwise, set it to None.
|
||||
if not no_laparams:
|
||||
if not no_laparams:
|
||||
laparams = pdfminer.layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||
paramv = locals().get(param, None)
|
||||
|
@ -44,14 +47,14 @@ def extract_text(files=[], outfile='-',
|
|||
(".tag", "tag") ):
|
||||
if outfile.endswith(override):
|
||||
output_type = alttype
|
||||
|
||||
|
||||
if outfile == "-":
|
||||
outfp = sys.stdout
|
||||
if outfp.encoding is not None:
|
||||
codec = 'utf-8'
|
||||
else:
|
||||
outfp = open(outfile, "wb")
|
||||
|
||||
|
||||
|
||||
for fname in files:
|
||||
with open(fname, "rb") as fp:
|
||||
|
@ -90,7 +93,7 @@ def main(args=None):
|
|||
A.page_numbers = set([x-1 for x in A.page_numbers])
|
||||
if A.pagenos:
|
||||
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
||||
|
||||
|
||||
imagewriter = None
|
||||
if A.output_dir:
|
||||
imagewriter = ImageWriter(A.output_dir)
|
||||
|
|
Loading…
Reference in New Issue