2013-10-17 14:05:27 +00:00
|
|
|
#!/usr/bin/env python
|
2013-10-10 09:29:30 +00:00
|
|
|
import sys
|
|
|
|
import re
|
|
|
|
import struct
|
|
|
|
try:
|
|
|
|
import hashlib as md5
|
|
|
|
except ImportError:
|
|
|
|
import md5
|
|
|
|
from psparser import PSEOF
|
|
|
|
from psparser import literal_name
|
|
|
|
from psparser import LIT, KWD, STRICT
|
|
|
|
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
|
|
|
|
from pdftypes import PDFObjectNotFound, PDFStream
|
2013-11-07 07:09:44 +00:00
|
|
|
from pdftypes import decipher_all
|
|
|
|
from pdftypes import int_value
|
2013-10-10 09:29:30 +00:00
|
|
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
|
|
|
from pdfparser import PDFSyntaxError
|
|
|
|
from pdfparser import PDFStreamParser
|
|
|
|
from arcfour import Arcfour
|
|
|
|
from utils import choplist, nunpack
|
2013-10-10 09:34:43 +00:00
|
|
|
from utils import decode_text
|
2013-10-10 09:29:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
## Exceptions
|
|
|
|
##
|
2013-11-07 08:35:04 +00:00
|
|
|
class PDFNoValidXRef(PDFSyntaxError):
    """Raised when no usable cross-reference table can be located in the file."""
    pass
|
|
|
|
|
|
|
|
|
|
|
|
class PDFNoOutlines(PDFException):
    """Raised when the document catalog has no /Outlines entry."""
    pass
|
|
|
|
|
|
|
|
|
|
|
|
class PDFDestinationNotFound(PDFException):
    """Raised when a named destination cannot be resolved."""
    pass
|
|
|
|
|
|
|
|
|
|
|
|
class PDFEncryptionError(PDFException):
    """Raised for unsupported or malformed encryption parameters."""
    pass
|
|
|
|
|
|
|
|
|
|
|
|
class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when the supplied password fails the authentication check."""
    pass
|
2013-10-10 09:29:30 +00:00
|
|
|
|
|
|
|
# some predefined literals and keywords.
# These interned literal objects are compared with `is` throughout the module.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
|
|
|
|
|
|
|
|
|
|
|
|
## XRefs
|
|
|
|
##
|
|
|
|
class PDFBaseXRef(object):
    """Abstract interface shared by every flavor of cross-reference table."""

    def get_trailer(self):
        """Return the trailer dictionary; concrete subclasses must override."""
        raise NotImplementedError

    def get_objids(self):
        """Iterate over the object ids this table knows about (none by default)."""
        return []

    def get_pos(self, objid):
        """Locate *objid*.

        Must return either (strmid, index, genno) for an object living
        inside an object stream, or (None, pos, genno) for one stored
        directly in the file.  The base implementation knows no objects,
        so it always fails with KeyError.
        """
        raise KeyError(objid)
|
|
|
|
|
|
|
|
|
|
|
|
## PDFXRef
|
|
|
|
##
|
|
|
|
class PDFXRef(PDFBaseXRef):
    """Classic cross-reference table parsed from a plain-text 'xref' section."""

    def __init__(self):
        # offsets maps objid -> (strmid, pos, genno); strmid is always None
        # here, since a plain xref table never points into an object stream.
        self.offsets = {}
        # Trailer dictionary accumulated by load_trailer().
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser, debug=0):
        """Parse one xref section: subsection headers followed by entry lines.

        Leaves the parser positioned at the 'trailer' keyword and loads the
        trailer as well.  Raises PDFNoValidXRef on any structural problem.
        """
        while 1:
            try:
                (pos, line) = parser.nextline()
                if not line.strip():
                    continue
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if line.startswith('trailer'):
                # Rewind so load_trailer() sees the 'trailer' keyword itself.
                parser.seek(pos)
                break
            # Subsection header: "<start-objid> <number-of-objects>".
            f = line.strip().split(' ')
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                (start, nobjs) = map(long, f)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in xrange(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                # Entry line: "<offset> <genno> (n|f)".
                f = line.strip().split(' ')
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                if use != 'n':
                    # 'f' marks a free (deleted) object; only in-use entries count.
                    continue
                self.offsets[objid] = (None, long(pos), int(genno))
        if 1 <= debug:
            print >>sys.stderr, 'xref objects:', self.offsets
        self.load_trailer(parser)
        return

    KEYWORD_TRAILER = KWD('trailer')

    def load_trailer(self, parser):
        """Read the trailer dictionary that follows the xref entries."""
        try:
            (_, kwd) = parser.nexttoken()
            assert kwd is self.KEYWORD_TRAILER
            (_, dic) = parser.nextobject()
        except PSEOF:
            # EOF mid-parse: fall back to whatever object is already on the
            # parser's stack (some producers truncate after the dictionary).
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        return

    def get_trailer(self):
        """Return the trailer dictionary gathered during load()."""
        return self.trailer

    def get_objids(self):
        """Iterate over all object ids recorded in this table."""
        return self.offsets.iterkeys()

    def get_pos(self, objid):
        """Return (None, pos, genno) for objid; KeyError if unknown."""
        try:
            return self.offsets[objid]
        except KeyError:
            raise
|
|
|
|
|
|
|
|
|
|
|
|
## PDFXRefFallback
|
|
|
|
##
|
|
|
|
class PDFXRefFallback(PDFXRef):
    """Brute-force xref built by scanning the whole file for object headers.

    Used as a last resort when no valid cross-reference table can be found.
    """

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    # Matches the "<objid> <genno> obj" line that starts an indirect object.
    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser, debug=0):
        """Scan the document from the beginning, recording every object header.

        Objects packed inside /ObjStm streams are indexed too, keyed by the
        containing stream's object id.
        """
        parser.seek(0)
        while 1:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith('trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                if 1 <= debug:
                    print >>sys.stderr, 'trailer: %r' % self.get_trailer()
                break
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            (objid, genno) = m.groups()
            objid = int(objid)
            genno = int(genno)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                stream = stream_value(obj)
                try:
                    # /N is the declared number of objects in the stream.
                    n = stream['N']
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
                parser1 = PDFStreamParser(stream.get_data())
                objs = []
                try:
                    while 1:
                        (_, obj) = parser1.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                # The stream header is (objid, offset) pairs, so at most
                # len(objs)//2 objects can really be present; don't trust /N.
                n = min(n, len(objs)//2)
                for index in xrange(n):
                    objid1 = objs[index*2]
                    # (strmid, index, genno): object lives inside this ObjStm.
                    self.offsets[objid1] = (objid, index, 0)
        return
|
|
|
|
|
|
|
|
|
|
|
|
## PDFXRefStream
|
|
|
|
##
|
|
|
|
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference stream (PDF-1.5+): binary xref data held in a stream."""

    def __init__(self):
        # Raw decoded entry data; each entry occupies self.entlen bytes.
        self.data = None
        self.entlen = None
        # Field widths in bytes, taken from the stream's /W array.
        self.fl1 = self.fl2 = self.fl3 = None
        # List of (start-objid, count) pairs from the /Index array.
        self.ranges = []
        return

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser, debug=0):
        """Parse the 'objid genno obj <<...>> stream' construct holding the xref."""
        (_, objid) = parser.nexttoken()  # ignored
        (_, genno) = parser.nexttoken()  # ignored
        (_, kwd) = parser.nexttoken()
        (_, stream) = parser.nextobject()
        if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        # /Index defaults to the single range [0, Size) when absent.
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        # The stream's own dictionary doubles as the trailer.
        self.trailer = stream.attrs
        if 1 <= debug:
            print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
                                 (', '.join(map(repr, self.ranges)),
                                  self.fl1, self.fl2, self.fl3))
        return

    def get_trailer(self):
        """Return the stream dictionary serving as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield every object id covered by the /Index ranges."""
        for (start, nobjs) in self.ranges:
            for i in xrange(nobjs):
                yield start+i
        return

    def get_pos(self, objid):
        """Decode the binary entry for objid.

        Returns (None, pos, genno) for a regular object (type-1 entry) or
        (strmid, index, genno=0) for an object inside an object stream
        (type-2 entry).  Raises KeyError for unknown or free objects.
        """
        # Translate objid into a linear entry index across the ranges.
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            # objid falls in none of the declared ranges.
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        # Field 1 defaults to 1 (type-1 entry) when its width is zero.
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
|
2013-10-10 09:29:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
## PDFDocument
|
|
|
|
##
|
|
|
|
class PDFDocument(object):

    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    # Debug verbosity level (0 = silent).
    debug = 0
    # Fixed 32-byte padding string defined by the PDF standard security
    # handler (Algorithm 3.2, step 1).
    PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'

    def __init__(self, parser, password='', caching=True, fallback=True):
        """Set the document to use a given PDFParser object.

        password: the user password for encrypted documents.
        caching: cache parsed objects keyed by object id.
        fallback: brute-force scan the file if no valid xref is found.
        Raises PDFSyntaxError when no /Root dictionary can be located.
        """
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._parser = None
        self._cached_objs = {}
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
        # Permission flags; tightened by _initialize_password when encrypted.
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            fallback = True
        if fallback:
            parser.fallback = True
            xref = PDFXRefFallback()
            xref.load(parser)
            self.xrefs.append(xref)
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                #assert not self.encryption
                self.encryption = (list_value(trailer['ID']),
                                   dict_value(trailer['Encrypt']))
                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if STRICT:
                raise PDFSyntaxError('Catalog not found!')
        return

    # _initialize_password(password='')
    #   Perform the initialization with a given password.
    #   Implements the PDF standard security handler's key derivation and
    #   user-password authentication (Algorithms 3.2, 3.4, 3.5).
    #   Raises PDFEncryptionError / PDFPasswordIncorrect on failure.
    def _initialize_password(self, password=''):
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        V = int_value(param.get('V', 0))
        if not (V == 1 or V == 2):
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        length = int_value(param.get('Length', 40))  # Key length (bits)
        O = str_value(param['O'])
        R = int_value(param['R'])  # Revision
        if 5 <= R:
            raise PDFEncryptionError('Unknown revision: %r' % R)
        U = str_value(param['U'])
        P = int_value(param['P'])
        # Permission bits from /P (low bits 3, 4, 5 of the flag word).
        self.is_printable = bool(P & 4)
        self.is_modifiable = bool(P & 8)
        self.is_extractable = bool(P & 16)
        # Algorithm 3.2
        password = (password+self.PASSWORD_PADDING)[:32]  # 1
        hash = md5.md5(password)  # 2
        hash.update(O)  # 3
        hash.update(struct.pack('<l', P))  # 4
        hash.update(docid[0])  # 5
        if 4 <= R:
            # 6
            raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
        if 3 <= R:
            # 8
            for _ in xrange(50):
                hash = md5.md5(hash.digest()[:length//8])
        key = hash.digest()[:length//8]
        if R == 2:
            # Algorithm 3.4
            u1 = Arcfour(key).process(self.PASSWORD_PADDING)
        elif R == 3:
            # Algorithm 3.5
            hash = md5.md5(self.PASSWORD_PADDING)  # 2
            hash.update(docid[0])  # 3
            x = Arcfour(key).process(hash.digest()[:16])  # 4
            for i in xrange(1, 19+1):
                # Re-encrypt with each byte of the key XORed with i.
                k = ''.join(chr(ord(c) ^ i) for c in key)
                x = Arcfour(k).process(x)
            u1 = x+x  # 32bytes total
        if R == 2:
            is_authenticated = (u1 == U)
        else:
            # Revision 3: only the first 16 bytes of /U are significant.
            is_authenticated = (u1[:16] == U[:16])
        if not is_authenticated:
            raise PDFPasswordIncorrect
        self.decrypt_key = key
        self.decipher = self.decrypt_rc4  # XXX may be AES
        return

    def decrypt_rc4(self, objid, genno, data):
        """Decrypt one string/stream with RC4 using the per-object key."""
        # Per-object key: file key + low 3 bytes of objid + low 2 of genno.
        key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
        hash = md5.md5(key)
        key = hash.digest()[:min(len(key), 16)]
        return Arcfour(key).process(data)

    def _getobj_objstm(self, stream, index, objid):
        """Fetch the object at *index* inside the object stream *stream*."""
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                self._parsed_objs[stream.objid] = (objs, n)
        # The first n*2 tokens are the (objid, offset) header pairs; the
        # actual objects follow.
        i = n*2+index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError('index too big: %r' % index)
        return obj

    def _get_objects(self, stream):
        """Parse every object out of an /ObjStm; returns (objs, n)."""
        if stream.get('Type') is not LITERAL_OBJSTM:
            if STRICT:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream['N']
        except KeyError:
            if STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs = []
        try:
            while 1:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            pass
        return (objs, n)

    KEYWORD_OBJ = KWD('obj')

    def _getobj_parse(self, pos, objid):
        """Parse the indirect object stored at file offset *pos*."""
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        if objid1 != objid:
            raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        if kwd is not self.KEYWORD_OBJ:
            raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid):
        """Return the (decrypted) object for *objid*, consulting each xref."""
        assert objid != 0
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        if 2 <= self.debug:
            print >>sys.stderr, 'getobj: objid=%r' % (objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        # The object lives inside an object stream.
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        obj = self._getobj_parse(index, objid)
                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    # A stale xref entry; try the next table.
                    continue
            else:
                raise PDFObjectNotFound(objid)
            if 2 <= self.debug:
                print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        if self.decipher:
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj

    def get_outlines(self):
        """Yield (level, title, dest, action, se) for each outline entry.

        Raises PDFNoOutlines when the catalog has no /Outlines tree.
        """
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines

        # Depth-first traversal of the outline tree.
        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = decode_text(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            if 'First' in entry and 'Last' in entry:
                # Descend into children one level deeper.
                for x in search(entry['First'], level+1):
                    yield x
            if 'Next' in entry:
                # Continue with the next sibling at the same level.
                for x in search(entry['Next'], level):
                    yield x
            return
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up *key* in the name tree under /Names/<cat>.

        Raises KeyError((cat, key)) when the name cannot be resolved.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        # Recursive descent through the name-tree nodes.
        def lookup(d):
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                # Prune subtrees whose [k1, k2] range excludes the key.
                if key < k1 or k2 < key:
                    return None
            if 'Names' in d:
                # Leaf node: /Names is a flat [key, value, key, value, ...].
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v:
                        return v
            raise KeyError((cat, key))
        return lookup(d0)

    def get_dest(self, name):
        """Resolve a named destination; raises PDFDestinationNotFound."""
        try:
            # PDF-1.2 or later
            obj = self.lookup_name('Dests', name)
        except KeyError:
            # PDF-1.1 or prior
            if 'Dests' not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog['Dests'])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser):
        """Internal function used to locate the first XRef."""
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in parser.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>sys.stderr, 'find_xref: %r' % line
            if line == 'startxref':
                break
            if line:
                # Remember the last non-empty line seen before 'startxref'
                # (reading backwards, this is the offset that follows it).
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        if 1 <= self.debug:
            print >>sys.stderr, 'xref found: pos=%r' % prev
        return long(prev)

    # read xref table
    def read_xref_from(self, parser, start, xrefs):
        """Reads XRefs from the given location."""
        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        if 2 <= self.debug:
            print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref = PDFXRefStream()
            xref.load(parser, debug=self.debug)
        else:
            if token is parser.KEYWORD_XREF:
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser, debug=self.debug)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        if 1 <= self.debug:
            print >>sys.stderr, 'trailer: %r' % trailer
        if 'XRefStm' in trailer:
            # Hybrid-reference file: also read the companion xref stream.
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(parser, pos, xrefs)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)
        return
|