From b686dd0139643561e24dd8d0adc152830a02a6c4 Mon Sep 17 00:00:00 2001 From: Chris Hager Date: Sun, 1 Nov 2015 22:24:30 +0100 Subject: [PATCH] pdfminer/settings.py for STRICT and added ENFORCE_CHECK_EXTRACTABLE --- pdfminer/pdfdocument.py | 2 +- pdfminer/pdffont.py | 2 +- pdfminer/pdfinterp.py | 2 +- pdfminer/pdfpage.py | 6 ++++-- pdfminer/pdfparser.py | 2 +- pdfminer/pdftypes.py | 2 +- pdfminer/psparser.py | 7 +------ pdfminer/settings.py | 9 +++++++++ tools/pdf2txt.py | 11 +++++++---- 9 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 pdfminer/settings.py diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index f8ac79d..964610f 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -19,7 +19,7 @@ from .psparser import PSEOF from .psparser import literal_name from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFTypeError from .pdftypes import PDFStream diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index b2a9df8..8196a33 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -12,7 +12,7 @@ from .psparser import PSStackParser from .psparser import PSEOF from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .psparser import PSLiteral from .psparser import literal_name from .pdftypes import PDFException diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 65bf8b4..80d57ea 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -12,7 +12,7 @@ from .psparser import keyword_name from .psparser import PSStackParser from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import PDFObjRef diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 41882ea..e35e56e 100644 --- 
a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -9,6 +9,7 @@ from .pdftypes import dict_value from .pdfparser import PDFParser from .pdfdocument import PDFDocument from .pdfdocument import PDFTextExtractionNotAllowed +from .settings import ENFORCE_CHECK_EXTRACTABLE import six # Python 2+3 compatibility @@ -120,8 +121,9 @@ class PDFPage(object): # Create a PDF document object that stores the document structure. doc = PDFDocument(parser, password=password, caching=caching) # Check if the document allows text extraction. If not, abort. - if check_extractable and not doc.is_extractable: - raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + if ENFORCE_CHECK_EXTRACTABLE: + if check_extractable and not doc.is_extractable: + raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) # Process each page contained in the document. for (pageno, page) in enumerate(klass.create_pages(doc)): if pagenos and (pageno not in pagenos): diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 7407ade..8e1934e 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -5,7 +5,7 @@ from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import PDFObjRef diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 64d84bb..834675e 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode from .psparser import PSException from .psparser import PSObject from .psparser import LIT -from .psparser import STRICT +from .settings import STRICT from .utils import apply_png_predictor from .utils import isnumber diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index b1fc6ac..dff3e04 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -5,12 +5,8 @@ import 
re import logging import six # Python 2+3 compatibility -try: - from django.conf import settings -except ImportError: - # in case it's not a django project - settings = None +from .settings import STRICT def bytesindex(s,i,j=None): """implements s[i], s[i:], s[i:j] for Python2 and Python3""" @@ -21,7 +17,6 @@ def bytesindex(s,i,j=None): from .utils import choplist -STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True) ## PS Exceptions ## diff --git a/pdfminer/settings.py b/pdfminer/settings.py new file mode 100644 index 0000000..e3bc183 --- /dev/null +++ b/pdfminer/settings.py @@ -0,0 +1,9 @@ +try: + from django.conf import settings as django_settings +except ImportError: + # Django is not installed (not a django project): use the defaults below + django_settings = None + +# PDF_MINER_IS_STRICT in the django settings overrides STRICT; default is True +STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True) +ENFORCE_CHECK_EXTRACTABLE = True diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index f449928..2bbf7b5 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -5,6 +5,9 @@ Converts PDF text content (though not images containing text) to plain text, htm import sys import logging import six +import pdfminer.settings +pdfminer.settings.STRICT = False +pdfminer.settings.ENFORCE_CHECK_EXTRACTABLE = False import pdfminer.high_level import pdfminer.layout @@ -24,7 +27,7 @@ def extract_text(files=[], outfile='-', # If any LAParams group arguments were passed, create an LAParams object and # populate with given args. Otherwise, set it to None. 
- if not no_laparams: + if not no_laparams: laparams = pdfminer.layout.LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) @@ -44,14 +47,14 @@ def extract_text(files=[], outfile='-', (".tag", "tag") ): if outfile.endswith(override): output_type = alttype - + if outfile == "-": outfp = sys.stdout if outfp.encoding is not None: codec = 'utf-8' else: outfp = open(outfile, "wb") - + for fname in files: with open(fname, "rb") as fp: @@ -90,7 +93,7 @@ def main(args=None): A.page_numbers = set([x-1 for x in A.page_numbers]) if A.pagenos: A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) - + imagewriter = None if A.output_dir: imagewriter = ImageWriter(A.output_dir)