pdfminer.six/pdfparser.py

541 lines
15 KiB
Python
Executable File

#!/usr/bin/env python
# pdfparser.py, Yusuke Shinyama
# ver 0.1, Dec 24 2004-
# ver 0.2, Dec 24 2007
# TODO:
# - Code Documentation.
# - Error handling for invalid type.
# - Outlines.
# - Named Objects. (pages)
# - Writers.
# - Linearized PDF.
# - Encryption?
import sys, re
stderr = sys.stderr
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSStackParser
## PDF Exceptions
##
class PDFException(PSException):
    '''Base class of the PDF-specific exceptions defined in this module.'''
class PDFSyntaxError(PDFException):
    '''Raised when the PDF file structure cannot be parsed as expected.'''
class PDFEncrypted(PDFException):
    '''
    Exception type for encrypted documents.
    NOTE(review): nothing in this module raises it yet; presumably
    reserved for callers that refuse to process encrypted files.
    '''
class PDFTypeError(PDFException):
    '''Raised when a resolved PDF object does not have the required type.'''
class PDFValueError(PDFException):
    '''Raised when a PDF object has the right type but an unusable value.'''
# some predefined literals and keywords.
# They are interned once at import time so that the rest of this module
# can compare parsed objects directly against these constants.
# Literals compared against a dictionary's 'Type' entry:
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
# Literal compared against a stream's 'Filter' entry (the only filter
# PDFStream.decode handles):
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
# Keywords: 'obj' is checked when reading an indirect object definition,
# 'EI' is pushed after an inline image has been read.
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_EI = PSKeywordTable.intern('EI')
## PDFObjRef
##
class PDFObjRef:
    '''
    An indirect reference to an object stored in a document.

    Only the document and the object id are kept; the generation
    number is accepted by the constructor but not stored.
    '''

    def __init__(self, doc, objid, genno):
        '''
        DOC is the owning document (must provide getobj()), OBJID the
        referenced object id and GENNO its generation number (ignored).
        Raises PDFValueError when OBJID is 0.
        '''
        if objid == 0:
            raise PDFValueError('objid cannot be 0.')
        self.objid = objid
        self.doc = doc
        return

    def __repr__(self):
        return '<PDFObjRef:%d>' % self.objid

    def resolve(self):
        '''Look the referenced object up in the owning document.'''
        target = self.doc.getobj(self.objid)
        return target
# resolve
def resolve1(x):
    '''
    Follow indirect references until a direct object is reached.

    Only the top level is resolved: a returned list or dictionary
    may still hold indirect references among its elements.
    '''
    obj = x
    while True:
        if not isinstance(obj, PDFObjRef):
            return obj
        obj = obj.resolve()
def resolveall(x):
    '''
    Resolve X and, recursively, everything nested inside it, so that
    the result contains no indirect references.

    A list is rebuilt as a new list; a dictionary is updated in place.
    This can be slow, so do not use it unless you really need it.
    Reference cycles are not detected.
    '''
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve()
    if isinstance(obj, dict):
        # Only values of existing keys are replaced, so the key set
        # captured up front stays valid.
        for key in list(obj):
            obj[key] = resolveall(obj[key])
        return obj
    if isinstance(obj, list):
        resolved = []
        for item in obj:
            resolved.append(resolveall(item))
        return resolved
    return obj
# Type checking
def int_value(x):
    '''
    Resolve X and return it if it is an int.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, int):
        return value
    raise PDFTypeError('integer required: %r' % value)
def float_value(x):
    '''
    Resolve X and return it if it is a float.
    Raises PDFTypeError otherwise (an int is not accepted).
    '''
    value = resolve1(x)
    if isinstance(value, float):
        return value
    raise PDFTypeError('float required: %r' % value)
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, (int, float)):
        return value
    raise PDFTypeError('int or float required: %r' % value)
def str_value(x):
    '''
    Resolve X and return it if it is a str.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, str):
        return value
    raise PDFTypeError('string required: %r' % value)
def list_value(x):
    '''
    Resolve X and return it if it is a list.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, list):
        return value
    raise PDFTypeError('list required: %r' % value)
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    raise PDFTypeError('dict required: %r' % value)
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.
    Raises PDFTypeError otherwise.
    '''
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    raise PDFTypeError('stream required: %r' % value)
## PDFStream type
##
class PDFStream:
    '''
    A stream object: a dictionary plus a blob of (possibly encoded) data.

    Attributes:
      doc: the owning document; its `crypt` attribute is read by
        decode() and the document is handed to the parser created by
        parse_data().
      dic: the stream dictionary.
      rawdata: the data as read from the file; None once decoded.
      data: the decoded data; None until decode() has run.
    '''

    def __init__(self, doc, dic, rawdata):
        self.doc = doc
        self.dic = dic
        self.rawdata = rawdata
        self.data = None
        return

    def __repr__(self):
        # One-element tuple so the dictionary is always formatted as a
        # single value.
        return '<PDFStream: %r>' % (self.dic,)

    def decode(self):
        '''
        Decode rawdata into data by applying the filters listed in the
        stream dictionary, then drop rawdata.

        Only FlateDecode is supported, optionally with predictor 12;
        within that predictor only rows tagged with filter type 2 are
        un-predicted, other rows are copied unchanged.

        Raises NotImplementedError for encrypted documents and
        PDFValueError for an unsupported filter or predictor.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.doc.crypt:
            # Decryption is not implemented yet.
            raise NotImplementedError
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        # The filter spec may itself be stored as an indirect object.
        filters = resolve1(self.dic['Filter'])
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f == LITERAL_FLATE_DECODE:
                import zlib
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
                # apply predictors
                params = resolve1(self.dic.get('DecodeParms', {}))
                if isinstance(params, dict) and 'Predictor' in params:
                    pred = int_value(params['Predictor'])
                    # 0 and 1 both mean that no prediction was applied.
                    if pred and pred != 1:
                        if pred != 12:
                            raise PDFValueError('Unsupported predictor: %r' % pred)
                        if 'Columns' not in params:
                            raise PDFValueError('Columns undefined for predictor=12')
                        columns = int_value(params['Columns'])
                        # Each encoded row is one tag byte followed by
                        # `columns` bytes of data.
                        rows = []
                        prev_row = '\x00' * columns
                        for i in xrange(0, len(data), columns+1):
                            tag = data[i]
                            row = data[i+1:i+1+columns]
                            if tag == '\x02':
                                # add the previous decoded row byte by byte
                                row = ''.join( chr((ord(a)+ord(b)) & 255)
                                               for (a,b) in zip(prev_row, row) )
                            rows.append(row)
                            prev_row = row
                        data = ''.join(rows)
            else:
                raise PDFValueError('Invalid filter spec: %r' % f)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def parse_data(self, inline=False, debug=0):
        '''
        Run a PDFParser over the decoded data and return the result of
        its parse() call.  INLINE and DEBUG are passed to the parser.
        '''
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        return PDFParser(self.doc, StringIO(self.get_data()),
                         inline=inline, debug=debug).parse()
## PDFPage
##
class PDFPage:
    '''
    A single page of a document.

    Attributes:
      doc: the owning document.
      pageid: the index passed as PAGEIDX.
      attrs: the page dictionary.
      parent_attrs: dictionary used as a fallback by get_attr().
      resources, mediabox: values of 'Resources' and 'MediaBox'.
      contents: list of the page's content objects (empty when the
        page has no 'Contents' entry).
    '''

    def __init__(self, doc, pageidx, attrs, parent_attrs):
        '''
        ATTRS is resolved with dict_value(), so PDFTypeError is raised
        when it is not a dictionary.
        '''
        self.doc = doc
        self.pageid = pageidx
        self.attrs = dict_value(attrs)
        self.parent_attrs = parent_attrs
        self.resources = self.get_attr('Resources')
        self.mediabox = self.get_attr('MediaBox')
        # 'Contents' is optional; a page without it has no content.
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    def get_attr(self, k):
        '''
        Return the resolved value of attribute K, taken from the page
        itself or else from parent_attrs.  Returns None when neither
        has it.
        '''
        if k in self.attrs:
            return resolve1(self.attrs[k])
        # An inherited value can be an indirect reference as well.
        return resolve1(self.parent_attrs.get(k))
## XRefs
## PDFXRef
##
class PDFXRef:
    '''
    A cross-reference table in the plain-text format, together with
    the trailer dictionary that follows it.

    Attributes:
      sections: list of (objid0, objid1, offsets) tuples, one for each
        subsection of the table in file order; offsets is a list of
        (genno, pos, use) entries for object ids objid0..objid1-1.
      objid0, objid1, offsets: the values of the last subsection read
        (kept for backward compatibility; 0, 0 and [] when the table
        has no subsection).
      trailer: the trailer dictionary.
    '''

    def __init__(self, parser):
        '''
        Read subsections from PARSER until the 'trailer' line, then
        parse the trailer dictionary.  PARSER is expected to be
        positioned right after the 'xref' line.

        Raises PDFSyntaxError on premature EOF or a malformed line.
        '''
        self.sections = []
        self.objid0 = 0
        self.objid1 = 0
        self.offsets = []
        while 1:
            line = parser.nextline()
            if not line:
                raise PDFSyntaxError('premature eof: %r' % parser)
            line = line.strip()
            # split() tolerates repeated whitespace between the fields.
            f = line.split()
            if len(f) != 2:
                if line != 'trailer':
                    raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
                break
            (start, nobjs) = map(long, f)
            offsets = []
            for objid in xrange(start, start+nobjs):
                line = parser.nextline()
                f = line.strip().split()
                if len(f) != 3:
                    raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                offsets.append((int(genno), long(pos), use))
            # Keep every subsection; a table may have several of them.
            self.sections.append((start, start+nobjs, offsets))
            self.objid0 = start
            self.objid1 = start+nobjs
            self.offsets = offsets
        # read trailer
        self.trailer = dict_value(parser.parse()[0])
        return

    def getpos(self, objid):
        '''
        Return (None, pos) where pos is the file offset recorded for
        OBJID.

        Raises IndexError when OBJID is not covered by this table and
        PDFValueError when its entry is not marked as in use ('n').
        '''
        for (objid0, objid1, offsets) in self.sections:
            if objid0 <= objid < objid1:
                break
        else:
            raise IndexError
        (genno, pos, use) = offsets[objid-objid0]
        if use != 'n':
            raise PDFValueError('unused objid=%r' % objid)
        return (None, pos)
## PDFXRefStream
##
class PDFXRefStream:
    '''
    A cross-reference table stored as a stream object.

    Attributes:
      ranges: list of (start, nobjs) pairs taken from the 'Index'
        entry, or [(0, Size)] when 'Index' is absent.
      objid0, objid1: bounds of the last range (kept for backward
        compatibility).
      fl1, fl2, fl3: byte widths of the three fields of an entry ('W').
      entlen: total byte length of one entry.
      data: the decoded stream data.
      trailer: the stream dictionary.
    '''

    def __init__(self, parser):
        '''
        Parse the stream object at PARSER's current position; the
        parsed sequence must have four items, the last one being the
        stream.
        '''
        (objid, genno, _, stream) = list_value(parser.parse())
        assert stream.dic['Type'] == LITERAL_XREF
        size = stream.dic['Size']
        # 'Index' may list several (start, count) pairs.
        self.ranges = list(choplist(2, stream.dic.get('Index', (0,size))))
        self.objid0 = 0
        self.objid1 = 0
        for (start, nobjs) in self.ranges:
            self.objid0 = start
            self.objid1 = start+nobjs
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic
        return

    def getpos(self, objid):
        '''
        Locate OBJID.

        Returns (None, pos) for an entry of type 1, where pos is the
        second field, or (strmid, index) for an entry of type 2, built
        from the second and third fields.

        Raises IndexError when OBJID is outside every range and
        PDFValueError when the entry has any other type.
        '''
        # Entries of all ranges are stored back to back, so count the
        # entries of the ranges that precede the matching one.
        skipped = 0
        for (start, nobjs) in self.ranges:
            if start <= objid < start+nobjs:
                break
            skipped += nobjs
        else:
            raise IndexError
        i = self.entlen * (skipped + objid - start)
        ent = self.data[i:i+self.entlen]
        # NOTE(review): the second argument is presumably the value
        # nunpack() returns for an empty field -- confirm in utils.
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
            return (None, pos)
        if f1 == 2:
            strmid = nunpack(ent[self.fl1:self.fl1+self.fl2])
            index = nunpack(ent[self.fl1+self.fl2:])
            return (strmid, index)
        # Any other entry type: report it the same way PDFXRef does
        # instead of returning None.
        raise PDFValueError('unused objid=%r' % objid)
## PDFDocument
##
class PDFDocument:
    '''
    A PDF document: resolves object ids through the cross-reference
    tables read by its parser and enumerates pages.

    Attributes:
      debug: debug level.
      xrefs: cross-reference tables, in the order the parser read them.
      objs: cache of objects already read, keyed by object id.
      parsed_objs: cache of parsed object streams, keyed by the stream.
      crypt: the 'Encrypt' dictionary from a trailer, or None.
      root, catalog: the document catalog dictionary.
      outline: the catalog's 'Outlines' entry, or None.
      parser: the parser attached with set_parser().
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.xrefs = []
        self.objs = {}
        self.parsed_objs = {}
        self.crypt = None
        self.root = None
        self.catalog = None
        self.outline = None
        self.parser = None
        return

    def set_parser(self, parser):
        '''
        Attach PARSER, read its cross-reference tables and set the
        root from the first trailer that has a 'Root' entry.

        Does nothing when a parser is already attached.
        Raises PDFValueError when no trailer has a 'Root' entry.
        '''
        if self.parser: return
        self.parser = parser
        self.xrefs = list(parser.read_xref())
        for xref in self.xrefs:
            trailer = xref.trailer
            if 'Encrypt' in trailer:
                self.crypt = dict_value(trailer['Encrypt'])
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
        else:
            raise PDFValueError('no /Root object!')
        return

    def getobj(self, objid):
        '''
        Return the object with id OBJID, reading it on first use and
        caching it afterwards.

        Raises PDFValueError when no cross-reference table covers
        OBJID and PDFSyntaxError when the data found is not what the
        table promised.
        '''
        assert self.xrefs
        if objid in self.objs:
            obj = self.objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index) = xref.getpos(objid)
                    break
                except IndexError:
                    pass
            else:
                raise PDFValueError('Cannot locate objid=%r' % objid)
            if strmid:
                # The object lives inside an object stream.
                stream = stream_value(self.getobj(strmid))
                if stream.dic['Type'] != LITERAL_OBJSTM:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
                if 'N' not in stream.dic:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                # Look up with the same key that is used for storing,
                # so that each object stream is parsed only once.
                if stream in self.parsed_objs:
                    objs = self.parsed_objs[stream]
                else:
                    objs = stream.parse_data(debug=self.debug)
                    self.parsed_objs[stream] = objs
                # The first N*2 items are skipped; they presumably are
                # the (objid, offset) pairs heading the stream.
                obj = objs[stream.dic['N']*2+index]
            else:
                pos0 = self.parser.linepos
                self.parser.seek(index)
                try:
                    seq = list_value(self.parser.parse())
                    if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
                        raise PDFSyntaxError('invalid stream spec: %r' % seq)
                    obj = seq[3]
                finally:
                    # Put the parser back even when reading failed.
                    self.parser.seek(pos0)
            if 2 <= self.debug:
                print >>stderr, 'register: objid=%r: %r' % (objid, obj)
            self.objs[objid] = obj
        return obj

    def get_pages(self, debug=0):
        '''
        Generate a PDFPage for every 'Page' node found under the
        catalog's 'Pages' tree, numbered from 0 in traversal order.
        Each page receives the dictionary of its immediate parent
        node as parent_attrs.
        '''
        assert self.xrefs
        def search(obj, parent):
            tree = dict_value(obj)
            if tree['Type'] == LITERAL_PAGES:
                # 'Kids' may be stored as an indirect object.
                kids = list_value(tree['Kids'])
                if 1 <= debug:
                    print >>stderr, 'Pages: Kids=%r' % (kids,)
                for c in kids:
                    for x in search(c, tree):
                        yield x
            elif tree['Type'] == LITERAL_PAGE:
                if 1 <= debug:
                    print >>stderr, 'Page: %r' % (tree,)
                yield (tree, parent)
        for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
            yield PDFPage(self, i, tree, parent)
        return

    def set_root(self, root):
        '''
        Use ROOT as the document catalog.
        Raises PDFValueError when its 'Type' is not 'Catalog'.
        '''
        self.root = root
        self.catalog = dict_value(self.root)
        if self.catalog['Type'] != LITERAL_CATALOG:
            raise PDFValueError('Catalog not found!')
        # The catalog entry for the outline hierarchy is named 'Outlines'.
        self.outline = self.catalog.get('Outlines')
        return
## PDFParser
##
class PDFParser(PSStackParser):
    '''
    A PSStackParser that understands PDF constructs: indirect
    references, stream objects, inline images (when INLINE is set)
    and cross-reference tables.
    '''

    def __init__(self, doc, fp, inline=False, debug=0):
        '''
        DOC is the document the parsed objects belong to, FP the file
        object to read.  The parser registers itself with DOC through
        doc.set_parser().
        '''
        PSStackParser.__init__(self, fp, debug=debug)
        self.inline = inline
        self.doc = doc
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser: linepos=%d>' % self.linepos

    # End-of-image marker of an inline image.
    EOIPAT = re.compile(r'\nEI\W')

    def do_token(self, pos, token):
        '''
        Handle TOKEN found at position POS.

        Returns True for the keywords xref, trailer, startxref and
        endobj (presumably telling PSStackParser to stop -- confirm in
        psparser) and False for everything else.
        '''
        name = keyword_name(token)
        if name in ('xref', 'trailer', 'startxref', 'endobj'):
            return True
        if name == 'R':
            # reference to indirect object
            try:
                (objid, genno) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push(obj)
                if 2 <= self.debug:
                    print >>stderr, 'refer obj: %r' % obj
            except PSSyntaxError:
                # Best effort, as in the original: the token is dropped.
                pass
        elif name == 'stream':
            # stream object
            (dic,) = self.pop(1)
            dic = dict_value(dic)
            if 'Length' not in dic:
                raise PDFValueError('/Length is undefined: %r' % (dic,))
            objlen = int_value(dic['Length'])
            self.seek(pos)
            line = self.nextline() # 'stream'
            # The data starts right after the line holding the keyword.
            self.fp.seek(pos+len(line))
            data = self.fp.read(objlen)
            self.seek(pos+len(line)+objlen)
            # Skip blank lines; the next non-blank one must be 'endstream'.
            while 1:
                line = self.nextline()
                if not line:
                    raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
                                         (self.linepos, line))
                if line.strip():
                    if not line.startswith('endstream'):
                        raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
                                             (self.linepos, line))
                    break
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(self.doc, dic, data)
            self.push(obj)
        elif self.inline and name == 'BI':
            # inline image within a content stream
            self.context.append(('BI', self.partobj))
            self.partobj = []
        elif self.inline and name == 'ID':
            objs = self.partobj
            (type0, self.partobj) = self.context.pop()
            if len(objs) % 2 != 0:
                raise PSTypeError('invalid dictionary construct: %r' % objs)
            dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
            pos += len('ID ')
            self.fp.seek(pos)
            # XXX how do we know the real length other than scanning?
            # Read chunk by chunk until the end-of-image marker shows
            # up, so that images larger than one chunk work too.
            data = ''
            while 1:
                chunk = self.fp.read(8192)
                if not chunk:
                    raise PDFSyntaxError('premature eof, need EI: pos=%d' % pos)
                # Rescan the last 3 bytes already read: the 4-byte
                # marker may straddle two chunks.
                scanfrom = max(0, len(data)-3)
                data += chunk
                m = self.EOIPAT.search(data, scanfrom)
                if m: break
            objlen = m.start(0)
            obj = PDFStream(self.doc, dic, data[:objlen])
            self.push(obj)
            self.seek(pos+objlen+len('\nEI'))
            self.push(KEYWORD_EI)
        else:
            self.push(token)
        return False

    def find_xref(self):
        '''
        Seek to the offset recorded after the last 'startxref' line.

        Raises PDFSyntaxError when there is no 'startxref' line or
        when it is not followed by a number.
        '''
        # find the first xref table
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if line == 'startxref': break
            if line:
                # Lines arrive in reverse order, so this ends up as the
                # non-blank line that follows 'startxref' in the file.
                prev = line
        else:
            raise PDFSyntaxError('startxref not found!')
        if 1 <= self.debug:
            print >>stderr, 'xref found: pos=%r' % prev
        try:
            xrefpos = long(prev)
        except (TypeError, ValueError):
            raise PDFSyntaxError('invalid startxref offset: %r' % (prev,))
        self.seek(xrefpos)
        return

    # read xref tables and trailers
    def read_xref(self):
        '''
        Generate the cross-reference tables of the file, newest first,
        following the 'Prev' entry of each trailer.

        Yields PDFXRef or PDFXRefStream objects.
        Raises PDFSyntaxError when no table is found where expected.
        '''
        self.find_xref()
        visited = set()
        while 1:
            # read xref table
            pos0 = self.linepos
            line = self.nextline()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            # line[:1] is safe on the empty line returned at EOF.
            if line[:1].isdigit():
                # XRefStream: PDF-1.5
                self.seek(pos0)
                xref = PDFXRefStream(self)
            elif line.strip() != 'xref':
                raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
                                     (self.linepos, line))
            else:
                xref = PDFXRef(self)
            yield xref
            trailer = xref.trailer
            if 1 <= self.debug:
                print >>stderr, 'trailer: %r' % (trailer,)
            if 'XRefStm' in trailer:
                # NOTE(review): this seek has no effect -- it is
                # followed either by the seek to 'Prev' or by the
                # break, so the table at 'XRefStm' is never read.
                self.seek(int_value(trailer['XRefStm']))
            if 'Prev' in trailer:
                # find previous xref
                pos0 = int_value(trailer['Prev'])
                if pos0 in visited:
                    # A 'Prev' chain that loops back would never end.
                    break
                visited.add(pos0)
                self.seek(pos0)
                if 1 <= self.debug:
                    print >>stderr, 'prev trailer: pos=%d' % pos0
            else:
                break
        return