Merge pull request #11 from metachris/pdfminerX

Pdfminer Updates
pull/13/head
Goulu 2015-12-06 18:56:53 +01:00
commit 72b2bc3197
9 changed files with 21 additions and 16 deletions

View File

@ -19,7 +19,7 @@ from .psparser import PSEOF
from .psparser import literal_name from .psparser import literal_name
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .psparser import STRICT from .settings import STRICT
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFTypeError from .pdftypes import PDFTypeError
from .pdftypes import PDFStream from .pdftypes import PDFStream

View File

@ -12,7 +12,7 @@ from .psparser import PSStackParser
from .psparser import PSEOF from .psparser import PSEOF
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .psparser import STRICT from .settings import STRICT
from .psparser import PSLiteral from .psparser import PSLiteral
from .psparser import literal_name from .psparser import literal_name
from .pdftypes import PDFException from .pdftypes import PDFException

View File

@ -12,7 +12,7 @@ from .psparser import keyword_name
from .psparser import PSStackParser from .psparser import PSStackParser
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .psparser import STRICT from .settings import STRICT
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .pdftypes import PDFObjRef from .pdftypes import PDFObjRef

View File

@ -5,7 +5,7 @@ from .psparser import PSStackParser
from .psparser import PSSyntaxError from .psparser import PSSyntaxError
from .psparser import PSEOF from .psparser import PSEOF
from .psparser import KWD from .psparser import KWD
from .psparser import STRICT from .settings import STRICT
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .pdftypes import PDFObjRef from .pdftypes import PDFObjRef

View File

@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode
from .psparser import PSException from .psparser import PSException
from .psparser import PSObject from .psparser import PSObject
from .psparser import LIT from .psparser import LIT
from .psparser import STRICT from .settings import STRICT
from .utils import apply_png_predictor from .utils import apply_png_predictor
from .utils import isnumber from .utils import isnumber

View File

@ -5,12 +5,8 @@ import re
import logging import logging
import six # Python 2+3 compatibility import six # Python 2+3 compatibility
try:
from django.conf import settings
except ImportError:
# in case it's not a django project
settings = None
from .settings import STRICT
def bytesindex(s,i,j=None): def bytesindex(s,i,j=None):
"""implements s[i], s[i:], s[i:j] for Python2 and Python3""" """implements s[i], s[i:], s[i:j] for Python2 and Python3"""
@ -21,7 +17,6 @@ def bytesindex(s,i,j=None):
from .utils import choplist from .utils import choplist
STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True)
## PS Exceptions ## PS Exceptions
## ##

8
pdfminer/settings.py Normal file
View File

@ -0,0 +1,8 @@
# Package-wide configuration for pdfminer.
#
# STRICT defaults to True. When the package is imported inside a configured
# Django project, the value is taken from the PDF_MINER_IS_STRICT Django
# setting instead.
STRICT = True

try:
    # django.conf publishes its lazy settings object under the name
    # ``settings``; alias it so this module keeps exposing ``django_settings``.
    from django.conf import settings as django_settings
    from django.core.exceptions import ImproperlyConfigured
except ImportError:
    # in case it's not a django project
    django_settings = None
else:
    try:
        # Get defaults from django settings
        STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True)
    except ImproperlyConfigured:
        # Django is installed but no settings module has been configured
        # (e.g. plain command-line use): keep the built-in default rather
        # than failing at import time.
        pass

View File

@ -9,7 +9,7 @@ setup(
version=__version__, version=__version__,
packages=['pdfminer',], packages=['pdfminer',],
package_data={'pdfminer': ['cmap/*.pickle.gz']}, package_data={'pdfminer': ['cmap/*.pickle.gz']},
install_requires=['six', 'chardet'] if sys.version_info.major>2 else ['six'], install_requires=['six', 'chardet'] if sys.version_info >= (3, 0) else ['six'],
description='PDF parser and analyzer', description='PDF parser and analyzer',
long_description='''fork of PDFMiner using six for Python 2+3 compatibility long_description='''fork of PDFMiner using six for Python 2+3 compatibility

View File

@ -5,6 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm
import sys import sys
import logging import logging
import six import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level import pdfminer.high_level
import pdfminer.layout import pdfminer.layout
@ -24,7 +26,7 @@ def extract_text(files=[], outfile='-',
# If any LAParams group arguments were passed, create an LAParams object and # If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None. # populate with given args. Otherwise, set it to None.
if not no_laparams: if not no_laparams:
laparams = pdfminer.layout.LAParams() laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None) paramv = locals().get(param, None)
@ -44,14 +46,14 @@ def extract_text(files=[], outfile='-',
(".tag", "tag") ): (".tag", "tag") ):
if outfile.endswith(override): if outfile.endswith(override):
output_type = alttype output_type = alttype
if outfile == "-": if outfile == "-":
outfp = sys.stdout outfp = sys.stdout
if outfp.encoding is not None: if outfp.encoding is not None:
codec = 'utf-8' codec = 'utf-8'
else: else:
outfp = open(outfile, "wb") outfp = open(outfile, "wb")
for fname in files: for fname in files:
with open(fname, "rb") as fp: with open(fname, "rb") as fp:
@ -90,7 +92,7 @@ def main(args=None):
A.page_numbers = set([x-1 for x in A.page_numbers]) A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos: if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
imagewriter = None imagewriter = None
if A.output_dir: if A.output_dir:
imagewriter = ImageWriter(A.output_dir) imagewriter = ImageWriter(A.output_dir)