pdfminer.six/pdfminer/pdftypes.py

246 lines
7.3 KiB
Python
Raw Normal View History

#!/usr/bin/env python
import sys
import zlib
from lzw import LZWDecoder
from psparser import PSException
from psparser import PSObject, PSLiteral, PSKeyword
from psparser import PSLiteralTable, PSKeywordTable
from psparser import literal_name, keyword_name
from psparser import STRICT
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx'))
## PDF Objects
##
class PDFObject(PSObject): pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass
## PDFObjRef
##
class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _):
if objid == 0:
if STRICT:
raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc
self.objid = objid
#self.genno = genno # Never used.
return
def __repr__(self):
return '<PDFObjRef:%d>' % (self.objid)
def resolve(self):
return self.doc.getobj(self.objid)
# resolve
def resolve1(x):
'''
Resolve an object. If this is an array or dictionary,
it may still contains some indirect objects inside.
'''
while isinstance(x, PDFObjRef):
x = x.resolve()
return x
def resolve_all(x):
'''
Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object.
This procedure might be slow.
'''
while isinstance(x, PDFObjRef):
x = x.resolve()
if isinstance(x, list):
x = [ resolve_all(v) for v in x ]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
x[k] = resolve_all(v)
return x
def decipher_all(decipher, objid, genno, x):
'''
Recursively decipher X.
'''
if isinstance(x, str):
return decipher(objid, genno, x)
if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v)
return x
# Type cheking
def int_value(x):
x = resolve1(x)
if not isinstance(x, int):
if STRICT:
raise PDFTypeError('Integer required: %r' % x)
return 0
return x
def float_value(x):
x = resolve1(x)
if not isinstance(x, float):
if STRICT:
raise PDFTypeError('Float required: %r' % x)
return 0.0
return x
def num_value(x):
x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)):
if STRICT:
raise PDFTypeError('Int or Float required: %r' % x)
return 0
return x
def str_value(x):
x = resolve1(x)
if not isinstance(x, str):
if STRICT:
raise PDFTypeError('String required: %r' % x)
return ''
return x
def list_value(x):
x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)):
if STRICT:
raise PDFTypeError('List required: %r' % x)
return []
return x
def dict_value(x):
x = resolve1(x)
if not isinstance(x, dict):
if STRICT:
raise PDFTypeError('Dict required: %r' % x)
return {}
return x
def stream_value(x):
x = resolve1(x)
if not isinstance(x, PDFStream):
if STRICT:
raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '')
return x
## PDFStream type
##
class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None):
self.dic = dic
self.rawdata = rawdata
self.decipher = decipher
self.data = None
self.objid = None
self.genno = None
return
def set_objid(self, objid, genno):
self.objid = objid
self.genno = genno
return
def __repr__(self):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decomp(self,data):
import zlib
buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the
# end. remove chars from the end to try and decompress the buffer
while 8 <= len(buf):
try:
# will get errors if the document is encrypted.
dco = zlib.decompressobj()
return dco.decompress(buf)
except zlib.error:
buf = buf[:-1]
raise Exception, "zlib.error while decompressing data"
def decode(self):
assert self.data == None and self.rawdata != None
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
return
filters = self.dic['Filter']
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = self.decomp(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85
data = ascii85.asciihexdecode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1
ent0 = ent1
data = buf
self.data = data
self.rawdata = None
return
def get_data(self):
if self.data == None:
self.decode()
return self.data
def get_rawdata(self):
return self.rawdata