#!/usr/bin/env python import re import struct import logging import six # Python 2+3 compatibility try: import hashlib as md5 except ImportError: import md5 try: from Crypto.Cipher import ARC4 from Crypto.Cipher import AES from Crypto.Hash import SHA256 except ImportError: AES = SHA256 = None from . import arcfour as ARC4 from .psparser import PSEOF from .psparser import literal_name from .psparser import LIT from .psparser import KWD from .psparser import STRICT from .pdftypes import PDFException from .pdftypes import PDFTypeError from .pdftypes import PDFStream from .pdftypes import PDFObjectNotFound from .pdftypes import decipher_all from .pdftypes import int_value from .pdftypes import str_value from .pdftypes import list_value from .pdftypes import dict_value from .pdftypes import stream_value from .pdfparser import PDFSyntaxError from .pdfparser import PDFStreamParser from .utils import choplist from .utils import nunpack from .utils import decode_text ## Exceptions ## class PDFNoValidXRef(PDFSyntaxError): pass class PDFNoOutlines(PDFException): pass class PDFDestinationNotFound(PDFException): pass class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFTextExtractionNotAllowed(PDFEncryptionError): pass # some predefined literals and keywords. 
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')


##  XRefs
##
class PDFBaseXRef(object):
    """Common interface of the cross-reference table implementations."""

    def get_trailer(self):
        """Return the trailer dictionary; subclasses must override."""
        raise NotImplementedError

    def get_objids(self):
        """Return an iterable of the object ids known to this table."""
        return []

    # Must return
    #     (strmid, index, genno)
    #  or (None, pos, genno)
    def get_pos(self, objid):
        """Locate *objid*; the base table knows no objects, so KeyError."""
        raise KeyError(objid)


##  PDFXRef
##
class PDFXRef(PDFBaseXRef):
    """Cross-reference table read from a plain-text ``xref`` section.

    ``offsets`` maps an object id to ``(None, pos, genno)`` and ``trailer``
    accumulates the entries of the trailer dictionary.
    """

    def __init__(self):
        self.offsets = {}
        self.trailer = {}
        return

    def __repr__(self):
        # The argument is wrapped in a 1-tuple so it is always consumed by
        # the single %r, whatever type it happens to be.
        return '<PDFXRef: offsets=%r>' % (list(self.offsets),)

    def load(self, parser):
        """Read xref subsections from *parser* up to the trailer, then load it.

        Each subsection is a ``start count`` header line followed by
        ``count`` entry lines of three space-separated fields
        ``pos genno use``.  Only entries whose ``use`` field is ``n`` are
        recorded.  Blank lines are skipped.

        Raises:
            PDFNoValidXRef: on EOF before the trailer, or on a header or
                entry line that does not have the expected shape.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line.strip():
                continue
            if line.startswith(b'trailer'):
                # Rewind so load_trailer() sees the keyword itself.
                parser.seek(pos)
                break
            f = line.strip().split(b' ')
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                # int() accepts bytes and yields arbitrary-precision values
                # on both Python 2 and 3, so no long()/six branching needed.
                (start, nobjs) = [int(x) for x in f]
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(b' ')
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                if use != b'n':
                    continue
                self.offsets[objid] = (None, int(pos), int(genno))
        logging.info('xref objects: %r', self.offsets)
        self.load_trailer(parser)
        return

    def load_trailer(self, parser):
        """Read the ``trailer`` keyword and its dictionary into ``self.trailer``.

        If the parser hits EOF first, the most recent item on the parser
        stack (``parser.pop(1)``) is used as the dictionary instead.

        Raises:
            PDFNoValidXRef: if the next token is not the ``trailer`` keyword,
                or EOF is reached with nothing left on the parser stack.
        """
        try:
            (_, kwd) = parser.nexttoken()
            # Explicit check rather than ``assert``: asserts vanish under -O
            # and this is validation of file content.
            if kwd is not KWD(b'trailer'):
                raise PDFNoValidXRef('Trailer keyword expected: %r' % (kwd,))
            (_, dic) = parser.nextobject()
        except PSEOF:
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        logging.debug('trailer=%r', self.trailer)
        return

    def get_trailer(self):
        """Return the trailer dictionary collected so far."""
        return self.trailer

    def get_objids(self):
        """Return an iterator over the recorded object ids."""
        return iter(self.offsets)

    def get_pos(self, objid):
        """Return ``(None, pos, genno)`` for *objid*; KeyError if unknown."""
        return self.offsets[objid]
##  PDFXRefFallback
##
class PDFXRefFallback(PDFXRef):
    """Cross-reference table rebuilt by scanning the file for ``N G obj``.

    Every line of the file is examined; objects found are recorded at their
    line position, and object streams are expanded so that the objects they
    contain are recorded as ``(stream objid, index, 0)``.
    """

    def __repr__(self):
        # 1-tuple so the argument is always consumed by the single %r.
        return '<PDFXRefFallback: offsets=%r>' % (list(self.offsets),)

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def load(self, parser):
        """Scan *parser* from offset 0 and fill ``self.offsets``.

        Scanning stops at EOF or at the first line starting with
        ``trailer``, in which case the trailer is loaded as well.

        Raises:
            PDFSyntaxError: in STRICT mode, when an object stream has no
                ``N`` entry.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith(b'trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                logging.info('trailer: %r', self.trailer)
                break
            if six.PY3:
                # PDFOBJ_CUE is a text pattern; latin-1 maps bytes 1:1.
                line = line.decode('latin-1')  # default pdf encoding
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            (objid, genno) = m.groups()
            objid = int(objid)
            genno = int(genno)
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                stream = stream_value(obj)
                try:
                    n = stream['N']
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
                parser1 = PDFStreamParser(stream.get_data())
                objs = []
                try:
                    while True:
                        (_, obj) = parser1.nextobject()
                        objs.append(obj)
                except PSEOF:
                    pass
                # The parsed objects are read as pairs whose first member is
                # the contained object's id; never trust N beyond what was
                # actually parsed.
                n = min(n, len(objs)//2)
                # ``range`` rather than ``xrange``: the latter does not
                # exist on Python 3.
                for index in range(n):
                    objid1 = objs[index*2]
                    self.offsets[objid1] = (objid, index, 0)
        return
##  PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):
    """Cross-reference table stored in an ``XRef`` stream object.

    ``ranges`` is a list of ``(start, nobjs)`` pairs taken from the stream's
    ``Index`` entry.  ``data`` holds the decoded stream, laid out as
    fixed-width entries of ``entlen = fl1 + fl2 + fl3`` bytes; the entries
    of successive ranges follow one another without gaps.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Defined up front so get_trailer() works before load().
        self.trailer = {}
        return

    def __repr__(self):
        # 1-tuple so the list is formatted as a single %r argument.
        return '<PDFXRefStream: ranges=%r>' % (self.ranges,)

    def load(self, parser):
        """Read an xref stream object (``objid genno obj <stream>``).

        The three leading tokens are consumed and ignored.  ``Index``
        defaults to ``(0, Size)`` when absent.

        Raises:
            PDFNoValidXRef: if the object is not a stream whose ``Type`` is
                ``XRef``.
            PDFSyntaxError: if ``Index`` has an odd number of elements.
        """
        parser.nexttoken()  # objid, ignored
        parser.nexttoken()  # genno, ignored
        parser.nexttoken()  # 'obj' keyword, ignored
        (_, stream) = parser.nextobject()
        # .get() so that a stream without /Type is reported as an invalid
        # xref rather than escaping as a bare KeyError.
        if not isinstance(stream, PDFStream) or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        logging.info('xref stream: objid=%s, fields=%d,%d,%d',
                     ', '.join(map(repr, self.ranges)),
                     self.fl1, self.fl2, self.fl3)
        return

    def get_trailer(self):
        """Return the stream's attribute dictionary, used as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield the ids of all entries whose type field is 1 or 2."""
        # Entries of every range are stored back to back, so the position
        # of an entry is its index within the range plus the sizes of all
        # preceding ranges (the same accounting get_pos() uses).
        base = 0
        for (start, nobjs) in self.ranges:
            for i in range(nobjs):
                offset = self.entlen * (base + i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            base += nobjs
        return

    def get_pos(self, objid):
        """Return the location of *objid*.

        Returns ``(None, f2, f3)`` for a type-1 entry and ``(f2, f3, 0)``
        for a type-2 entry, where f2/f3 are the entry's second and third
        fields.

        Raises:
            KeyError: if *objid* is in no range or its entry has another
                type (a free object).
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid < start+nobjs:
                index += objid - start
                break
            index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
== 2: # Algorithm 3.4 return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2 else: # Algorithm 3.5 hash = md5.md5(self.PASSWORD_PADDING) # 2 hash.update(self.docid[0]) # 3 result = ARC4.new(key).encrypt(hash.digest()) # 4 for i in range(1, 20): # 5 k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key)) result = ARC4.new(k).encrypt(result) result += result # 6 return result def compute_encryption_key(self, password): # Algorithm 3.2 password = (password + self.PASSWORD_PADDING)[:32] # 1 hash = md5.md5(password) # 2 hash.update(self.o) # 3 hash.update(struct.pack('= 4: if not self.encrypt_metadata: hash.update(b'\xff\xff\xff\xff') result = hash.digest() n = 5 if self.r >= 3: n = self.length // 8 for _ in range(50): result = md5.md5(result[:n]).digest() return result[:n] def authenticate(self, password): key = self.authenticate_user_password(password) if key is None: key = self.authenticate_owner_password(password) return key def authenticate_user_password(self, password): key = self.compute_encryption_key(password) if self.verify_encryption_key(key): return key else: return None def verify_encryption_key(self, key): # Algorithm 3.6 u = self.compute_u(key) if self.r == 2: return u == self.u return u[:16] == self.u[:16] def authenticate_owner_password(self, password): # Algorithm 3.7 password = (password + self.PASSWORD_PADDING)[:32] hash = md5.md5(password) if self.r >= 3: for _ in range(50): hash = md5.md5(hash.digest()) n = 5 if self.r >= 3: n = self.length // 8 key = hash.digest()[:n] if self.r == 2: user_password = ARC4.new(key).decrypt(self.o) else: user_password = self.o for i in range(19, -1, -1): k = b''.join(chr(ord(c) ^ i) for c in key) user_password = ARC4.new(k).decrypt(user_password) return self.authenticate_user_password(user_password) def decrypt(self, objid, genno, data, attrs=None): return self.decrypt_rc4(objid, genno, data) def decrypt_rc4(self, objid, genno, data): key = self.key + struct.pack('