From b686dd0139643561e24dd8d0adc152830a02a6c4 Mon Sep 17 00:00:00 2001 From: Chris Hager Date: Sun, 1 Nov 2015 22:24:30 +0100 Subject: [PATCH 1/4] pdfminer/settings.py for STRICT and added ENFORCE_CHECK_EXTRACTABLE --- pdfminer/pdfdocument.py | 2 +- pdfminer/pdffont.py | 2 +- pdfminer/pdfinterp.py | 2 +- pdfminer/pdfpage.py | 6 ++++-- pdfminer/pdfparser.py | 2 +- pdfminer/pdftypes.py | 2 +- pdfminer/psparser.py | 7 +------ pdfminer/settings.py | 9 +++++++++ tools/pdf2txt.py | 11 +++++++---- 9 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 pdfminer/settings.py diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index f8ac79d..964610f 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -19,7 +19,7 @@ from .psparser import PSEOF from .psparser import literal_name from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFTypeError from .pdftypes import PDFStream diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index b2a9df8..8196a33 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -12,7 +12,7 @@ from .psparser import PSStackParser from .psparser import PSEOF from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .psparser import PSLiteral from .psparser import literal_name from .pdftypes import PDFException diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 65bf8b4..80d57ea 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -12,7 +12,7 @@ from .psparser import keyword_name from .psparser import PSStackParser from .psparser import LIT from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import PDFObjRef diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 41882ea..e35e56e 100644 
--- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -9,6 +9,7 @@ from .pdftypes import dict_value from .pdfparser import PDFParser from .pdfdocument import PDFDocument from .pdfdocument import PDFTextExtractionNotAllowed +from .settings import ENFORCE_CHECK_EXTRACTABLE import six # Python 2+3 compatibility @@ -120,8 +121,9 @@ class PDFPage(object): # Create a PDF document object that stores the document structure. doc = PDFDocument(parser, password=password, caching=caching) # Check if the document allows text extraction. If not, abort. - if check_extractable and not doc.is_extractable: - raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + if ENFORCE_CHECK_EXTRACTABLE: + if check_extractable and not doc.is_extractable: + raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) # Process each page contained in the document. for (pageno, page) in enumerate(klass.create_pages(doc)): if pagenos and (pageno not in pagenos): diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 7407ade..8e1934e 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -5,7 +5,7 @@ from .psparser import PSStackParser from .psparser import PSSyntaxError from .psparser import PSEOF from .psparser import KWD -from .psparser import STRICT +from .settings import STRICT from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import PDFObjRef diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 64d84bb..834675e 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode from .psparser import PSException from .psparser import PSObject from .psparser import LIT -from .psparser import STRICT +from .settings import STRICT from .utils import apply_png_predictor from .utils import isnumber diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index b1fc6ac..dff3e04 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -5,12 +5,8 @@ 
import re import logging import six # Python 2+3 compatibility -try: - from django.conf import settings -except ImportError: - # in case it's not a django project - settings = None +from .settings import STRICT def bytesindex(s,i,j=None): """implements s[i], s[i:], s[i:j] for Python2 and Python3""" @@ -21,7 +17,6 @@ def bytesindex(s,i,j=None): from .utils import choplist -STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True) ## PS Exceptions ## diff --git a/pdfminer/settings.py b/pdfminer/settings.py new file mode 100644 index 0000000..e3bc183 --- /dev/null +++ b/pdfminer/settings.py @@ -0,0 +1,9 @@ +try: + from django.conf import django_settings +except ImportError: + # in case it's not a django project + django_settings = None + +# Get defaults from django settings +STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True) +ENFORCE_CHECK_EXTRACTABLE = True diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index f449928..2bbf7b5 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -5,6 +5,9 @@ Converts PDF text content (though not images containing text) to plain text, htm import sys import logging import six +import pdfminer.settings +pdfminer.settings.STRICT = False +pdfminer.settings.ENFORCE_CHECK_EXTRACTABLE = False import pdfminer.high_level import pdfminer.layout @@ -24,7 +27,7 @@ def extract_text(files=[], outfile='-', # If any LAParams group arguments were passed, create an LAParams object and # populate with given args. Otherwise, set it to None. 
- if not no_laparams: + if not no_laparams: laparams = pdfminer.layout.LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) @@ -44,14 +47,14 @@ def extract_text(files=[], outfile='-', (".tag", "tag") ): if outfile.endswith(override): output_type = alttype - + if outfile == "-": outfp = sys.stdout if outfp.encoding is not None: codec = 'utf-8' else: outfp = open(outfile, "wb") - + for fname in files: with open(fname, "rb") as fp: @@ -90,7 +93,7 @@ def main(args=None): A.page_numbers = set([x-1 for x in A.page_numbers]) if A.pagenos: A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) - + imagewriter = None if A.output_dir: imagewriter = ImageWriter(A.output_dir) From 2e1be5721fff435a2de175f1c95330ecffc41b13 Mon Sep 17 00:00:00 2001 From: Chris Hager Date: Sun, 1 Nov 2015 22:34:18 +0100 Subject: [PATCH 2/4] removed settings.ENFORCE_CHECK_EXTRACTABLE --- pdfminer/pdfpage.py | 7 +++---- pdfminer/settings.py | 1 - tools/pdf2txt.py | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index e35e56e..f9761d9 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -9,7 +9,7 @@ from .pdftypes import dict_value from .pdfparser import PDFParser from .pdfdocument import PDFDocument from .pdfdocument import PDFTextExtractionNotAllowed -from .settings import ENFORCE_CHECK_EXTRACTABLE +from .settings import import six # Python 2+3 compatibility @@ -121,9 +121,8 @@ class PDFPage(object): # Create a PDF document object that stores the document structure. doc = PDFDocument(parser, password=password, caching=caching) # Check if the document allows text extraction. If not, abort. 
- if ENFORCE_CHECK_EXTRACTABLE: - if check_extractable and not doc.is_extractable: - raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + if check_extractable and not doc.is_extractable: + raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) # Process each page contained in the document. for (pageno, page) in enumerate(klass.create_pages(doc)): if pagenos and (pageno not in pagenos): diff --git a/pdfminer/settings.py b/pdfminer/settings.py index e3bc183..46d4bbd 100644 --- a/pdfminer/settings.py +++ b/pdfminer/settings.py @@ -6,4 +6,3 @@ except ImportError: # Get defaults from django settings STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True) -ENFORCE_CHECK_EXTRACTABLE = True diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 2bbf7b5..d9a3ebb 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -7,7 +7,6 @@ import logging import six import pdfminer.settings pdfminer.settings.STRICT = False -pdfminer.settings.ENFORCE_CHECK_EXTRACTABLE = False import pdfminer.high_level import pdfminer.layout From 146abb459fef5710642b0b1d3520db29dac5a886 Mon Sep 17 00:00:00 2001 From: Chris Hager Date: Sun, 8 Nov 2015 02:32:23 +0100 Subject: [PATCH 3/4] Updated setup.py to work with Python 2.6 Simple fix. Would you mind merging this and pushing a new release to PyPI?
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5676ca3..5fe5838 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( version=__version__, packages=['pdfminer',], package_data={'pdfminer': ['cmap/*.pickle.gz']}, - install_requires=['six', 'chardet'] if sys.version_info.major>2 else ['six'], + install_requires=['six', 'chardet'] if sys.version_info >= (3, 0) else ['six'], description='PDF parser and analyzer', long_description='''fork of PDFMiner using six for Python 2+3 compatibility From 8149be16694059d6912557d622b4716972318d68 Mon Sep 17 00:00:00 2001 From: Chris Hager Date: Sun, 6 Dec 2015 00:17:58 +0100 Subject: [PATCH 4/4] bugfixes: remove dangling 'from .settings import' in pdfpage.py; also catch NameError in settings.py --- pdfminer/pdfpage.py | 1 - pdfminer/settings.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index f9761d9..41882ea 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -9,7 +9,6 @@ from .pdftypes import dict_value from .pdfparser import PDFParser from .pdfdocument import PDFDocument from .pdfdocument import PDFTextExtractionNotAllowed -from .settings import import six # Python 2+3 compatibility diff --git a/pdfminer/settings.py b/pdfminer/settings.py index 46d4bbd..350b2ce 100644 --- a/pdfminer/settings.py +++ b/pdfminer/settings.py @@ -1,6 +1,6 @@ try: from django.conf import django_settings -except ImportError: +except (ImportError, NameError) as e: # in case it's not a django project django_settings = None