Improved settings management

pull/16/head
Steve Hair 2016-01-10 12:17:38 -05:00
parent 72b2bc3197
commit 92c71436b9
7 changed files with 51 additions and 51 deletions

View File

@ -19,7 +19,7 @@ from .psparser import PSEOF
from .psparser import literal_name from .psparser import literal_name
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .settings import STRICT from . import settings
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFTypeError from .pdftypes import PDFTypeError
from .pdftypes import PDFStream from .pdftypes import PDFStream
@ -196,7 +196,7 @@ class PDFXRefFallback(PDFXRef):
try: try:
n = stream['N'] n = stream['N']
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0 n = 0
parser1 = PDFStreamParser(stream.get_data()) parser1 = PDFStreamParser(stream.get_data())
@ -582,7 +582,7 @@ class PDFDocument(object):
else: else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?') raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG: if self.catalog.get('Type') is not LITERAL_CATALOG:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('Catalog not found!') raise PDFSyntaxError('Catalog not found!')
return return
@ -620,12 +620,12 @@ class PDFDocument(object):
def _get_objects(self, stream): def _get_objects(self, stream):
if stream.get('Type') is not LITERAL_OBJSTM: if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream) raise PDFSyntaxError('Not a stream object: %r' % stream)
try: try:
n = stream['N'] n = stream['N']
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0 n = 0
parser = PDFStreamParser(stream.get_data()) parser = PDFStreamParser(stream.get_data())

View File

@ -12,7 +12,7 @@ from .psparser import PSStackParser
from .psparser import PSEOF from .psparser import PSEOF
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .settings import STRICT from . import settings
from .psparser import PSLiteral from .psparser import PSLiteral
from .psparser import literal_name from .psparser import literal_name
from .pdftypes import PDFException from .pdftypes import PDFException
@ -574,7 +574,7 @@ class PDFType1Font(PDFSimpleFont):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFFontError('BaseFont is missing') raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown' self.basefont = 'unknown'
try: try:
@ -632,11 +632,11 @@ class PDFType3Font(PDFSimpleFont):
# PDFCIDFont # PDFCIDFont
class PDFCIDFont(PDFFont): class PDFCIDFont(PDFFont):
def __init__(self, rsrcmgr, spec, STRICT=False): def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:
if STRICT: if strict:
raise PDFFontError('BaseFont is missing') raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown' self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
@ -645,19 +645,19 @@ class PDFCIDFont(PDFFont):
try: try:
name = literal_name(spec['Encoding']) name = literal_name(spec['Encoding'])
except KeyError: except KeyError:
if STRICT: if strict:
raise PDFFontError('Encoding is unspecified') raise PDFFontError('Encoding is unspecified')
name = 'unknown' name = 'unknown'
try: try:
self.cmap = CMapDB.get_cmap(name) self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e: except CMapDB.CMapNotFound as e:
if STRICT: if strict:
raise PDFFontError(e) raise PDFFontError(e)
self.cmap = CMap() self.cmap = CMap()
try: try:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
except KeyError: except KeyError:
if STRICT: if strict:
raise PDFFontError('FontDescriptor is missing') raise PDFFontError('FontDescriptor is missing')
descriptor = {} descriptor = {}
ttf = None ttf = None

View File

@ -12,7 +12,7 @@ from .psparser import keyword_name
from .psparser import PSStackParser from .psparser import PSStackParser
from .psparser import LIT from .psparser import LIT
from .psparser import KWD from .psparser import KWD
from .settings import STRICT from . import settings
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .pdftypes import PDFObjRef from .pdftypes import PDFObjRef
@ -167,14 +167,14 @@ class PDFResourceManager(object):
font = self._cached_fonts[objid] font = self._cached_fonts[objid]
else: else:
logging.info('get_font: create: objid=%r, spec=%r', objid, spec) logging.info('get_font: create: objid=%r, spec=%r', objid, spec)
if STRICT: if settings.STRICT:
if spec['Type'] is not LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font') raise PDFFontError('Type is not /Font')
# Create a Font object. # Create a Font object.
if 'Subtype' in spec: if 'Subtype' in spec:
subtype = literal_name(spec['Subtype']) subtype = literal_name(spec['Subtype'])
else: else:
if STRICT: if settings.STRICT:
raise PDFFontError('Font Subtype is not specified.') raise PDFFontError('Font Subtype is not specified.')
subtype = 'Type1' subtype = 'Type1'
if subtype in ('Type1', 'MMType1'): if subtype in ('Type1', 'MMType1'):
@ -199,7 +199,7 @@ class PDFResourceManager(object):
subspec[k] = resolve1(spec[k]) subspec[k] = resolve1(spec[k])
font = self.get_font(None, subspec) font = self.get_font(None, subspec)
else: else:
if STRICT: if settings.STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec) raise PDFFontError('Invalid Font spec: %r' % spec)
font = PDFType1Font(self, spec) # this is so wrong! font = PDFType1Font(self, spec) # this is so wrong!
if objid and self.caching: if objid and self.caching:
@ -299,7 +299,7 @@ class PDFContentParser(PSStackParser):
self.push((pos, obj)) self.push((pos, obj))
self.push((pos, self.KEYWORD_EI)) self.push((pos, self.KEYWORD_EI))
except PSTypeError: except PSTypeError:
if STRICT: if settings.STRICT:
raise raise
else: else:
self.push((pos, token)) self.push((pos, token))
@ -559,7 +559,7 @@ class PDFPageInterpreter(object):
try: try:
self.scs = self.csmap[literal_name(name)] self.scs = self.csmap[literal_name(name)]
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name) raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
return return
@ -568,7 +568,7 @@ class PDFPageInterpreter(object):
try: try:
self.ncs = self.csmap[literal_name(name)] self.ncs = self.csmap[literal_name(name)]
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name) raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
return return
@ -607,7 +607,7 @@ class PDFPageInterpreter(object):
if self.scs: if self.scs:
n = self.scs.ncomponents n = self.scs.ncomponents
else: else:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!') raise PDFInterpreterError('No colorspace specified!')
n = 1 n = 1
self.pop(n) self.pop(n)
@ -617,7 +617,7 @@ class PDFPageInterpreter(object):
if self.ncs: if self.ncs:
n = self.ncs.ncomponents n = self.ncs.ncomponents
else: else:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!') raise PDFInterpreterError('No colorspace specified!')
n = 1 n = 1
self.pop(n) self.pop(n)
@ -698,7 +698,7 @@ class PDFPageInterpreter(object):
try: try:
self.textstate.font = self.fontmap[literal_name(fontid)] self.textstate.font = self.fontmap[literal_name(fontid)]
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid) raise PDFInterpreterError('Undefined Font id: %r' % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {}) self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize self.textstate.fontsize = fontsize
@ -748,7 +748,7 @@ class PDFPageInterpreter(object):
def do_TJ(self, seq): def do_TJ(self, seq):
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate) #print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
if self.textstate.font is None: if self.textstate.font is None:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('No font specified!') raise PDFInterpreterError('No font specified!')
return return
self.device.render_string(self.textstate, seq) self.device.render_string(self.textstate, seq)
@ -793,7 +793,7 @@ class PDFPageInterpreter(object):
try: try:
xobj = stream_value(self.xobjmap[xobjid]) xobj = stream_value(self.xobjmap[xobjid])
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
logging.info('Processing xobj: %r', xobj) logging.info('Processing xobj: %r', xobj)
@ -872,7 +872,7 @@ class PDFPageInterpreter(object):
logging.debug('exec: %s', name) logging.debug('exec: %s', name)
func() func()
else: else:
if STRICT: if settings.STRICT:
raise PDFInterpreterError('Unknown operator: %r' % name) raise PDFInterpreterError('Unknown operator: %r' % name)
else: else:
self.push(obj) self.push(obj)

View File

@ -5,7 +5,7 @@ from .psparser import PSStackParser
from .psparser import PSSyntaxError from .psparser import PSSyntaxError
from .psparser import PSEOF from .psparser import PSEOF
from .psparser import KWD from .psparser import KWD
from .settings import STRICT from . import settings
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .pdftypes import PDFObjRef from .pdftypes import PDFObjRef
@ -89,13 +89,13 @@ class PDFParser(PSStackParser):
try: try:
objlen = int_value(dic['Length']) objlen = int_value(dic['Length'])
except KeyError: except KeyError:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('/Length is undefined: %r' % dic) raise PDFSyntaxError('/Length is undefined: %r' % dic)
self.seek(pos) self.seek(pos)
try: try:
(_, line) = self.nextline() # 'stream' (_, line) = self.nextline() # 'stream'
except PSEOF: except PSEOF:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF') raise PDFSyntaxError('Unexpected EOF')
return return
pos += len(line) pos += len(line)
@ -106,7 +106,7 @@ class PDFParser(PSStackParser):
try: try:
(linepos, line) = self.nextline() (linepos, line) = self.nextline()
except PSEOF: except PSEOF:
if STRICT: if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF') raise PDFSyntaxError('Unexpected EOF')
break break
if b'endstream' in line: if b'endstream' in line:
@ -164,7 +164,7 @@ class PDFStreamParser(PDFParser):
pass pass
return return
elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ): elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
if STRICT: if settings.STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the # See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used. # stream; the obj and endobj keywords are not used.
raise PDFSyntaxError('Keyword endobj found in stream') raise PDFSyntaxError('Keyword endobj found in stream')

View File

@ -8,7 +8,7 @@ from .ccitt import ccittfaxdecode
from .psparser import PSException from .psparser import PSException
from .psparser import PSObject from .psparser import PSObject
from .psparser import LIT from .psparser import LIT
from .settings import STRICT from . import settings
from .utils import apply_png_predictor from .utils import apply_png_predictor
from .utils import isnumber from .utils import isnumber
@ -53,7 +53,7 @@ class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _): def __init__(self, doc, objid, _):
if objid == 0: if objid == 0:
if STRICT: if settings.STRICT:
raise PDFValueError('PDF object id cannot be 0.') raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc self.doc = doc
self.objid = objid self.objid = objid
@ -115,7 +115,7 @@ def decipher_all(decipher, objid, genno, x):
def int_value(x): def int_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, int): if not isinstance(x, int):
if STRICT: if settings.STRICT:
raise PDFTypeError('Integer required: %r' % x) raise PDFTypeError('Integer required: %r' % x)
return 0 return 0
return x return x
@ -124,7 +124,7 @@ def int_value(x):
def float_value(x): def float_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, float): if not isinstance(x, float):
if STRICT: if settings.STRICT:
raise PDFTypeError('Float required: %r' % x) raise PDFTypeError('Float required: %r' % x)
return 0.0 return 0.0
return x return x
@ -133,7 +133,7 @@ def float_value(x):
def num_value(x): def num_value(x):
x = resolve1(x) x = resolve1(x)
if not isnumber(x): if not isnumber(x):
if STRICT: if settings.STRICT:
raise PDFTypeError('Int or Float required: %r' % x) raise PDFTypeError('Int or Float required: %r' % x)
return 0 return 0
return x return x
@ -142,7 +142,7 @@ def num_value(x):
def str_value(x): def str_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, six.binary_type): if not isinstance(x, six.binary_type):
if STRICT: if settings.STRICT:
raise PDFTypeError('String required: %r' % x) raise PDFTypeError('String required: %r' % x)
return '' return ''
return x return x
@ -151,7 +151,7 @@ def str_value(x):
def list_value(x): def list_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, (list, tuple)): if not isinstance(x, (list, tuple)):
if STRICT: if settings.STRICT:
raise PDFTypeError('List required: %r' % x) raise PDFTypeError('List required: %r' % x)
return [] return []
return x return x
@ -160,7 +160,7 @@ def list_value(x):
def dict_value(x): def dict_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
if STRICT: if settings.STRICT:
import logging import logging
logging.error('PDFTypeError : Dict required: %r', x) logging.error('PDFTypeError : Dict required: %r', x)
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError('Dict required: %r' % x)
@ -171,7 +171,7 @@ def dict_value(x):
def stream_value(x): def stream_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
if STRICT: if settings.STRICT:
raise PDFTypeError('PDFStream required: %r' % x) raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '') return PDFStream({}, '')
return x return x
@ -247,7 +247,7 @@ class PDFStream(PDFObject):
try: try:
data = zlib.decompress(data) data = zlib.decompress(data)
except zlib.error as e: except zlib.error as e:
if STRICT: if settings.STRICT:
raise PDFException('Invalid zlib bytes: %r, %r' % (e, data)) raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
data = b'' data = b''
elif f in LITERALS_LZW_DECODE: elif f in LITERALS_LZW_DECODE:

View File

@ -6,7 +6,7 @@ import logging
import six # Python 2+3 compatibility import six # Python 2+3 compatibility
from .settings import STRICT from . import settings
def bytesindex(s,i,j=None): def bytesindex(s,i,j=None):
"""implements s[i], s[i:], s[i:j] for Python2 and Python3""" """implements s[i], s[i:], s[i:j] for Python2 and Python3"""
@ -134,7 +134,7 @@ KEYWORD_DICT_END = KWD(b'>>')
def literal_name(x): def literal_name(x):
if not isinstance(x, PSLiteral): if not isinstance(x, PSLiteral):
if STRICT: if settings.STRICT:
raise PSTypeError('Literal required: %r' % x) raise PSTypeError('Literal required: %r' % x)
else: else:
name=x name=x
@ -149,7 +149,7 @@ def literal_name(x):
def keyword_name(x): def keyword_name(x):
if not isinstance(x, PSKeyword): if not isinstance(x, PSKeyword):
if STRICT: if settings.STRICT:
raise PSTypeError('Keyword required: %r' % x) raise PSTypeError('Keyword required: %r' % x)
else: else:
name=x name=x
@ -592,7 +592,7 @@ class PSStackParser(PSBaseParser):
try: try:
self.push(self.end_type('a')) self.push(self.end_type('a'))
except PSTypeError: except PSTypeError:
if STRICT: if settings.STRICT:
raise raise
elif token == KEYWORD_DICT_BEGIN: elif token == KEYWORD_DICT_BEGIN:
# begin dictionary # begin dictionary
@ -607,7 +607,7 @@ class PSStackParser(PSBaseParser):
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
self.push((pos, d)) self.push((pos, d))
except PSTypeError: except PSTypeError:
if STRICT: if settings.STRICT:
raise raise
elif token == KEYWORD_PROC_BEGIN: elif token == KEYWORD_PROC_BEGIN:
# begin proc # begin proc
@ -617,7 +617,7 @@ class PSStackParser(PSBaseParser):
try: try:
self.push(self.end_type('p')) self.push(self.end_type('p'))
except PSTypeError: except PSTypeError:
if STRICT: if settings.STRICT:
raise raise
elif isinstance(token,PSKeyword): elif isinstance(token,PSKeyword):
logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack) logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)

View File

@ -1,8 +1,8 @@
STRICT = True
try: try:
from django.conf import django_settings from django.conf import django_settings
except (ImportError, NameError) as e: STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', STRICT)
except Exception:
# in case it's not a django project # in case it's not a django project
django_settings = None pass
# Get defaults from django settings
STRICT = getattr(django_settings, 'PDF_MINER_IS_STRICT', True)