commit
72b2bc3197
|
@ -19,7 +19,7 @@ from .psparser import PSEOF
|
||||||
from .psparser import literal_name
|
from .psparser import literal_name
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from .psparser import STRICT
|
from .settings import STRICT
|
||||||
from .pdftypes import PDFException
|
from .pdftypes import PDFException
|
||||||
from .pdftypes import PDFTypeError
|
from .pdftypes import PDFTypeError
|
||||||
from .pdftypes import PDFStream
|
from .pdftypes import PDFStream
|
||||||
|
|
|
@ -12,7 +12,7 @@ from .psparser import PSStackParser
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from .psparser import STRICT
|
from .settings import STRICT
|
||||||
from .psparser import PSLiteral
|
from .psparser import PSLiteral
|
||||||
from .psparser import literal_name
|
from .psparser import literal_name
|
||||||
from .pdftypes import PDFException
|
from .pdftypes import PDFException
|
||||||
|
|
|
@ -12,7 +12,7 @@ from .psparser import keyword_name
|
||||||
from .psparser import PSStackParser
|
from .psparser import PSStackParser
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from .psparser import STRICT
|
from .settings import STRICT
|
||||||
from .pdftypes import PDFException
|
from .pdftypes import PDFException
|
||||||
from .pdftypes import PDFStream
|
from .pdftypes import PDFStream
|
||||||
from .pdftypes import PDFObjRef
|
from .pdftypes import PDFObjRef
|
||||||
|
|
|
@ -5,7 +5,7 @@ from .psparser import PSStackParser
|
||||||
from .psparser import PSSyntaxError
|
from .psparser import PSSyntaxError
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from .psparser import STRICT
|
from .settings import STRICT
|
||||||
from .pdftypes import PDFException
|
from .pdftypes import PDFException
|
||||||
from .pdftypes import PDFStream
|
from .pdftypes import PDFStream
|
||||||
from .pdftypes import PDFObjRef
|
from .pdftypes import PDFObjRef
|
||||||
|
|
|
@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode
|
||||||
from .psparser import PSException
|
from .psparser import PSException
|
||||||
from .psparser import PSObject
|
from .psparser import PSObject
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import STRICT
|
from .settings import STRICT
|
||||||
from .utils import apply_png_predictor
|
from .utils import apply_png_predictor
|
||||||
from .utils import isnumber
|
from .utils import isnumber
|
||||||
|
|
||||||
|
|
|
@ -5,12 +5,8 @@ import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
try:
|
|
||||||
from django.conf import settings
|
|
||||||
except ImportError:
|
|
||||||
# in case it's not a django project
|
|
||||||
settings = None
|
|
||||||
|
|
||||||
|
from .settings import STRICT
|
||||||
|
|
||||||
def bytesindex(s,i,j=None):
|
def bytesindex(s,i,j=None):
|
||||||
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
|
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
|
||||||
|
@ -21,7 +17,6 @@ def bytesindex(s,i,j=None):
|
||||||
|
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
|
|
||||||
STRICT = getattr(settings, 'PDF_MINER_IS_STRICT', True)
|
|
||||||
|
|
||||||
## PS Exceptions
|
## PS Exceptions
|
||||||
##
|
##
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
try:
|
||||||
|
from django.conf import django_settings
|
||||||
|
except (ImportError, NameError) as e:
|
||||||
|
# in case it's not a django project
|
||||||
|
django_settings = None
|
||||||
|
|
||||||
|
# Get defaults from django settings
|
||||||
|
STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True)
|
2
setup.py
2
setup.py
|
@ -9,7 +9,7 @@ setup(
|
||||||
version=__version__,
|
version=__version__,
|
||||||
packages=['pdfminer',],
|
packages=['pdfminer',],
|
||||||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||||
install_requires=['six', 'chardet'] if sys.version_info.major>2 else ['six'],
|
install_requires=['six', 'chardet'] if sys.version_info >= (3, 0) else ['six'],
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
long_description='''fork of PDFMiner using six for Python 2+3 compatibility
|
long_description='''fork of PDFMiner using six for Python 2+3 compatibility
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
import six
|
import six
|
||||||
|
import pdfminer.settings
|
||||||
|
pdfminer.settings.STRICT = False
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
import pdfminer.layout
|
import pdfminer.layout
|
||||||
|
|
||||||
|
@ -24,7 +26,7 @@ def extract_text(files=[], outfile='-',
|
||||||
|
|
||||||
# If any LAParams group arguments were passed, create an LAParams object and
|
# If any LAParams group arguments were passed, create an LAParams object and
|
||||||
# populate with given args. Otherwise, set it to None.
|
# populate with given args. Otherwise, set it to None.
|
||||||
if not no_laparams:
|
if not no_laparams:
|
||||||
laparams = pdfminer.layout.LAParams()
|
laparams = pdfminer.layout.LAParams()
|
||||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||||
paramv = locals().get(param, None)
|
paramv = locals().get(param, None)
|
||||||
|
@ -44,14 +46,14 @@ def extract_text(files=[], outfile='-',
|
||||||
(".tag", "tag") ):
|
(".tag", "tag") ):
|
||||||
if outfile.endswith(override):
|
if outfile.endswith(override):
|
||||||
output_type = alttype
|
output_type = alttype
|
||||||
|
|
||||||
if outfile == "-":
|
if outfile == "-":
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outfp.encoding is not None:
|
if outfp.encoding is not None:
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
else:
|
else:
|
||||||
outfp = open(outfile, "wb")
|
outfp = open(outfile, "wb")
|
||||||
|
|
||||||
|
|
||||||
for fname in files:
|
for fname in files:
|
||||||
with open(fname, "rb") as fp:
|
with open(fname, "rb") as fp:
|
||||||
|
@ -90,7 +92,7 @@ def main(args=None):
|
||||||
A.page_numbers = set([x-1 for x in A.page_numbers])
|
A.page_numbers = set([x-1 for x in A.page_numbers])
|
||||||
if A.pagenos:
|
if A.pagenos:
|
||||||
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
||||||
|
|
||||||
imagewriter = None
|
imagewriter = None
|
||||||
if A.output_dir:
|
if A.output_dir:
|
||||||
imagewriter = ImageWriter(A.output_dir)
|
imagewriter = ImageWriter(A.output_dir)
|
||||||
|
|
Loading…
Reference in New Issue