pdfminer.six/pdfminer/pdftypes.py

#!/usr/bin/env python
import sys
import zlib
from lzw import LZWDecoder
from psparser import PSException
from psparser import PSObject, PSLiteral, PSKeyword
from psparser import PSLiteralTable, PSKeywordTable
from psparser import literal_name, keyword_name
from psparser import STRICT

# Interned names of the stream filters that PDFStream.decode() recognises.
# Each LITERALS_* tuple holds the full filter name followed by its
# abbreviated spelling, so membership tests accept either form.
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (
    PSLiteralTable.intern('FlateDecode'),
    PSLiteralTable.intern('Fl'),
)
LITERALS_LZW_DECODE = (
    PSLiteralTable.intern('LZWDecode'),
    PSLiteralTable.intern('LZW'),
)
LITERALS_ASCII85_DECODE = (
    PSLiteralTable.intern('ASCII85Decode'),
    PSLiteralTable.intern('A85'),
)
LITERALS_ASCIIHEX_DECODE = (
    PSLiteralTable.intern('ASCIIHexDecode'),
    PSLiteralTable.intern('AHx'),
)


##  PDF Objects
##
class PDFObject(PSObject):
    '''Common base class of the PDF object types defined in this module.'''

class PDFException(PSException):
    '''Base class of the PDF-specific errors; itself a PSException.'''
class PDFTypeError(PDFException):
    '''Raised in STRICT mode when a resolved object has an unexpected type.'''
class PDFValueError(PDFException):
    '''Raised when an object has the right type but an unacceptable value.'''
class PDFNotImplementedError(PSException):
    '''
    Raised for PDF features this module does not handle
    (unsupported stream filters and predictors).
    '''
    # NOTE(review): unlike the other PDF* errors this derives directly from
    # PSException, so "except PDFException" does not catch it -- confirm
    # whether that is intentional.


##  PDFObjRef
##
class PDFObjRef(PDFObject):
    '''
    Indirect reference to an object owned by a document.

    Only the document and the object id are kept.  The third constructor
    argument is accepted and ignored (presumably the generation number,
    judging by the assignment that used to be here -- it was never used).
    '''

    def __init__(self, doc, objid, _):
        # An object id of 0 is rejected only in STRICT mode; otherwise the
        # reference is created anyway.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.objid = objid
        self.doc = doc

    def __repr__(self):
        return '<PDFObjRef:%d>' % self.objid

    def resolve(self):
        '''Fetch the referenced object through doc.getobj().'''
        document = self.doc
        return document.getobj(self.objid)


# resolve
def resolve1(x):
    '''
    Follow indirect references until a direct object is reached.

    Only the outermost object is resolved: if the result is an array
    or a dictionary, it may still contain indirect objects inside.
    '''
    obj = x
    while True:
        if not isinstance(obj, PDFObjRef):
            return obj
        obj = obj.resolve()

def resolve_all(x):
    '''
    Recursively resolve X and everything nested inside it.

    Indirect references are followed first; then every element of a list
    and every value of a dictionary is resolved the same way, so that no
    indirect reference remains within the nested object.
    This procedure might be slow.

    A list is returned as a new list, whereas a dictionary is updated
    in place and the same dictionary object is returned.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        return [resolve_all(v) for v in x]
    if isinstance(x, dict):
        # Iterate over a snapshot of the keys: the dictionary is rewritten
        # while we walk it, and dict.iteritems() only exists on Python 2.
        for key in list(x):
            x[key] = resolve_all(x[key])
    return x

def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every str found in X (X itself, list elements, dictionary values, at
    any nesting depth) is replaced by decipher(objid, genno, value).
    Objects of any other type are returned unchanged.

    A list is returned as a new list, whereas a dictionary is updated
    in place and the same dictionary object is returned.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, v) for v in x]
    if isinstance(x, dict):
        # Iterate over a snapshot of the keys: the dictionary is rewritten
        # while we walk it, and dict.iteritems() only exists on Python 2.
        for key in list(x):
            x[key] = decipher_all(decipher, objid, genno, x[key])
    return x

# Type checking
def int_value(x):
    '''
    Resolve x and return it if it is an int.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    '''
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0

def float_value(x):
    '''
    Resolve x and return it if it is a float.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.0.
    Note that an int is not accepted here.
    '''
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0

def num_value(x):
    '''
    Resolve x and return it if it is an int or a float.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    '''
    value = resolve1(x)
    if isinstance(value, (int, float)):
        return value
    if STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0

def str_value(x):
    '''
    Resolve x and return it if it is a str.

    Otherwise raise PDFTypeError when STRICT is set, else return ''.
    '''
    value = resolve1(x)
    if isinstance(value, str):
        return value
    if STRICT:
        raise PDFTypeError('String required: %r' % value)
    return ''

def list_value(x):
    '''
    Resolve x and return it if it is a list or a tuple.

    Otherwise raise PDFTypeError when STRICT is set, else return a new
    empty list.
    '''
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []

def dict_value(x):
    '''
    Resolve x and return it if it is a dict.

    Otherwise raise PDFTypeError when STRICT is set, else return a new
    empty dict.
    '''
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if STRICT:
        raise PDFTypeError('Dict required: %r' % value)
    return {}

def stream_value(x):
    '''
    Resolve x and return it if it is a PDFStream.

    Otherwise raise PDFTypeError when STRICT is set, else return a new
    PDFStream with an empty dictionary and empty data.
    '''
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, '')


##  PDFStream type
##
class PDFStream(PDFObject):
    '''
    A PDF stream: a dictionary plus data that is decoded on demand.

    Attributes:
      dic       the stream dictionary; decode() reads its 'Filter',
                'DP' and 'DecodeParms' entries.
      rawdata   the undecoded data; set to None once decode() has run.
      decipher  optional callable invoked as decipher(objid, genno, data)
                before any filter is applied.
      data      the decoded data; None until decode() has run.
      objid, genno
                identifiers stored by set_objid() and handed to decipher.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        '''Record the object id and generation number of this stream.'''
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # decode() sets rawdata to None, so len(self.rawdata) cannot be
        # used unconditionally: report the decoded length once decoded.
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        if self.data is not None:
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)
        return '<PDFStream(%r): %r>' % (self.objid, self.dic)

    def decomp(self, data):
        '''
        Decompress zlib data, dropping trailing bytes until it works.

        Returns the output of the first successful decompression.
        Raises Exception when no prefix of at least 8 bytes decompresses.
        '''
        buf = data
        # some FlateDecode streams have garbage (newlines, etc) appended to the
        # end.  remove chars from the end to try and decompress the buffer
        # NOTE(review): every retry copies the buffer minus one byte and
        # decompresses it again, which is costly for large broken streams.
        while 8 <= len(buf):
            try:
                # will get errors if the document is encrypted.
                return zlib.decompressobj().decompress(buf)
            except zlib.error:
                buf = buf[:-1]
        raise Exception('zlib.error while decompressing data')

    def decode(self):
        '''
        Decode rawdata into data and drop rawdata.

        The data is first passed through decipher (if any), then through
        each filter listed under 'Filter', each followed by the predictor
        named in that filter's parameters.

        Raises PDFNotImplementedError for an unsupported filter or
        predictor, and PDFValueError when predictor 12 lacks 'Columns'.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' in self.dic:
            for (f, params) in self._get_filters():
                data = self._apply_filter(f, data)
                data = self._apply_predictor(params, data)
        self.data = data
        self.rawdata = None
        return

    def _get_filters(self):
        '''Return the stream's filters as a list of (filter, params) pairs.'''
        filters = resolve1(self.dic['Filter'])
        if not isinstance(filters, list):
            filters = [filters]
        if 'DP' in self.dic:
            params = self.dic['DP']
        else:
            params = self.dic.get('DecodeParms', {})
        params = resolve1(params)
        if not isinstance(params, list):
            # A single parameter dictionary is used for every filter.
            params = [params] * len(filters)
        pairs = []
        for (index, f) in enumerate(filters):
            entry = None
            if index < len(params):
                entry = resolve1(params[index])
            if not isinstance(entry, dict):
                # Missing or null entry: this filter takes no parameters.
                entry = {}
            pairs.append((resolve1(f), entry))
        return pairs

    def _apply_filter(self, f, data):
        '''Run data through the single filter f and return the result.'''
        if f in LITERALS_FLATE_DECODE:
            # will get errors if the document is encrypted.
            return self.decomp(data)
        if f in LITERALS_LZW_DECODE:
            try:
                from cStringIO import StringIO
            except ImportError:
                from StringIO import StringIO
            return ''.join(LZWDecoder(StringIO(data)).run())
        if f in LITERALS_ASCII85_DECODE:
            import ascii85
            return ascii85.ascii85decode(data)
        if f in LITERALS_ASCIIHEX_DECODE:
            import ascii85
            return ascii85.asciihexdecode(data)
        if f == LITERAL_CRYPT:
            raise PDFNotImplementedError('/Crypt filter is unsupported')
        raise PDFNotImplementedError('Unsupported filter: %r' % f)

    def _apply_predictor(self, params, data):
        '''Undo the predictor described by params, if there is one.'''
        if 'Predictor' not in params:
            return data
        predictor = int_value(params['Predictor'])
        if not predictor:
            return data
        if predictor != 12:
            raise PDFNotImplementedError('Unsupported predictor: %r' % predictor)
        if 'Columns' not in params:
            raise PDFValueError('Columns undefined for predictor=12')
        columns = int_value(params['Columns'])
        # Each row is one tag byte followed by `columns` data bytes.  A row
        # tagged '\x02' stores differences from the row above, so the
        # previous decoded row is added back byte-wise (modulo 256).  Rows
        # with any other tag are copied through unchanged.
        rows = []
        prev = '\x00' * columns
        for pos in xrange(0, len(data), columns+1):
            row = data[pos+1:pos+1+columns]
            if data[pos] == '\x02':
                row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
            rows.append(row)
            prev = row
        return ''.join(rows)

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the undecoded data (None once the stream was decoded).'''
        return self.rawdata
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00			`#!/usr/bin/env python`
source code tidy up git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@147 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-11-03 01:27:30 +00:00			`import sys`
			`import zlib`
			`from lzw import LZWDecoder`
			`from psparser import PSException`
fix CMapDB initialization stuff. more code cleanup. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@148 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-11-03 13:39:34 +00:00			`from psparser import PSObject, PSLiteral, PSKeyword`
			`from psparser import PSLiteralTable, PSKeywordTable`
			`from psparser import literal_name, keyword_name`
source code tidy up git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@147 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-11-03 01:27:30 +00:00			`from psparser import STRICT`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`LITERAL_CRYPT = PSLiteralTable.intern('Crypt')`
			`LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))`
			`LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))`
			`LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))`
AsciiHexDecode filter patch incorporated. Thanks to Troy Bollinger. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@86 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-04-08 10:55:01 +00:00			`LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx'))`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00

			`## PDF Objects`
			`##`
			`class PDFObject(PSObject): pass`

			`class PDFException(PSException): pass`
			`class PDFTypeError(PDFException): pass`
			`class PDFValueError(PDFException): pass`
			`class PDFNotImplementedError(PSException): pass`


			`## PDFObjRef`
			`##`
			`class PDFObjRef(PDFObject):`

to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def __init__(self, doc, objid, _):`
			`if objid == 0:`
			`if STRICT:`
			`raise PDFValueError('PDF object id cannot be 0.')`
			`self.doc = doc`
			`self.objid = objid`
			`#self.genno = genno # Never used.`
			`return`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def __repr__(self):`
			`return '<PDFObjRef:%d>' % (self.objid)`

			`def resolve(self):`
			`return self.doc.getobj(self.objid)`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00

			`# resolve`
			`def resolve1(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'''`
			`Resolve an object. If this is an array or dictionary,`
			`it may still contains some indirect objects inside.`
			`'''`
			`while isinstance(x, PDFObjRef):`
			`x = x.resolve()`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def resolve_all(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'''`
			`Recursively resolve X and all the internals.`
			`Make sure there is no indirect reference within the nested object.`
			`This procedure might be slow.`
			`'''`
			`while isinstance(x, PDFObjRef):`
			`x = x.resolve()`
			`if isinstance(x, list):`
			`x = [ resolve_all(v) for v in x ]`
			`elif isinstance(x, dict):`
			`for (k,v) in x.iteritems():`
			`x[k] = resolve_all(v)`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def decipher_all(decipher, objid, genno, x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`'''`
			`Recursively decipher X.`
			`'''`
			`if isinstance(x, str):`
			`return decipher(objid, genno, x)`
			`if isinstance(x, list):`
			`x = [ decipher_all(decipher, objid, genno, v) for v in x ]`
			`elif isinstance(x, dict):`
			`for (k,v) in x.iteritems():`
			`x[k] = decipher_all(decipher, objid, genno, v)`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`# Type cheking`
			`def int_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not isinstance(x, int):`
			`if STRICT:`
			`raise PDFTypeError('Integer required: %r' % x)`
			`return 0`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def float_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not isinstance(x, float):`
			`if STRICT:`
			`raise PDFTypeError('Float required: %r' % x)`
			`return 0.0`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def num_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not (isinstance(x, int) or isinstance(x, float)):`
			`if STRICT:`
			`raise PDFTypeError('Int or Float required: %r' % x)`
			`return 0`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def str_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not isinstance(x, str):`
			`if STRICT:`
			`raise PDFTypeError('String required: %r' % x)`
			`return ''`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def list_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not (isinstance(x, list) or isinstance(x, tuple)):`
			`if STRICT:`
			`raise PDFTypeError('List required: %r' % x)`
			`return []`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def dict_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not isinstance(x, dict):`
			`if STRICT:`
			`raise PDFTypeError('Dict required: %r' % x)`
			`return {}`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
			`def stream_value(x):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`x = resolve1(x)`
			`if not isinstance(x, PDFStream):`
			`if STRICT:`
			`raise PDFTypeError('PDFStream required: %r' % x)`
			`return PDFStream({}, '')`
			`return x`
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00

			`## PDFStream type`
			`##`
			`class PDFStream(PDFObject):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00
			`def __init__(self, dic, rawdata, decipher=None):`
			`self.dic = dic`
			`self.rawdata = rawdata`
			`self.decipher = decipher`
			`self.data = None`
			`self.objid = None`
			`self.genno = None`
			`return`

			`def set_objid(self, objid, genno):`
			`self.objid = objid`
			`self.genno = genno`
			`return`

			`def __repr__(self):`
			`return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)`

			`def decomp(self,data):`
			`import zlib`
			`buf = data`
			`# some FlateDecode streams have garbage (newlines, etc) appended to the`
			`# end. remove chars from the end to try and decompress the buffer`
			`while 8 <= len(buf):`
			`try:`
			`# will get errors if the document is encrypted.`
			`dco = zlib.decompressobj()`
			`return dco.decompress(buf)`
			`except zlib.error:`
			`buf = buf[:-1]`
			`raise Exception, "zlib.error while decompressing data"`

			`def decode(self):`
			`assert self.data == None and self.rawdata != None`
			`data = self.rawdata`
			`if self.decipher:`
			`# Handle encryption`
			`data = self.decipher(self.objid, self.genno, data)`
			`if 'Filter' not in self.dic:`
			`self.data = data`
			`self.rawdata = None`
			`return`
			`filters = self.dic['Filter']`
			`if not isinstance(filters, list):`
			`filters = [ filters ]`
			`for f in filters:`
			`if f in LITERALS_FLATE_DECODE:`
			`# will get errors if the document is encrypted.`
			`data = self.decomp(data)`
			`elif f in LITERALS_LZW_DECODE:`
			`try:`
			`from cStringIO import StringIO`
			`except ImportError:`
			`from StringIO import StringIO`
			`data = ''.join(LZWDecoder(StringIO(data)).run())`
			`elif f in LITERALS_ASCII85_DECODE:`
			`import ascii85`
			`data = ascii85.ascii85decode(data)`
			`elif f in LITERALS_ASCIIHEX_DECODE:`
			`import ascii85`
			`data = ascii85.asciihexdecode(data)`
			`elif f == LITERAL_CRYPT:`
			`raise PDFNotImplementedError('/Crypt filter is unsupported')`
			`else:`
			`raise PDFNotImplementedError('Unsupported filter: %r' % f)`
			`# apply predictors`
			`if 'DP' in self.dic:`
			`params = self.dic['DP']`
			`else:`
			`params = self.dic.get('DecodeParms', {})`
			`if 'Predictor' in params:`
			`pred = int_value(params['Predictor'])`
			`if pred:`
			`if pred != 12:`
			`raise PDFNotImplementedError('Unsupported predictor: %r' % pred)`
			`if 'Columns' not in params:`
			`raise PDFValueError('Columns undefined for predictor=12')`
			`columns = int_value(params['Columns'])`
			`buf = ''`
			`ent0 = '\x00' * columns`
			`for i in xrange(0, len(data), columns+1):`
			`pred = data[i]`
			`ent1 = data[i+1:i+1+columns]`
			`if pred == '\x02':`
			`ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )`
			`buf += ent1`
			`ent0 = ent1`
			`data = buf`
			`self.data = data`
			`self.rawdata = None`
			`return`

			`def get_data(self):`
			`if self.data == None:`
			`self.decode()`
			`return self.data`

			`def get_rawdata(self):`
			`return self.rawdata`