pdfminer.six/pdfminer/pdfdocument.py

600 lines
20 KiB
Python
Raw Normal View History

2013-10-17 14:05:27 +00:00
#!/usr/bin/env python
2013-10-10 09:29:30 +00:00
import sys
import re
import struct
try:
import hashlib as md5
except ImportError:
import md5
from psparser import PSEOF
from psparser import literal_name
from psparser import LIT, KWD, STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFObjectNotFound, PDFStream
from pdftypes import decipher_all
from pdftypes import int_value
2013-10-10 09:29:30 +00:00
from pdftypes import str_value, list_value, dict_value, stream_value
from pdfparser import PDFSyntaxError
from pdfparser import PDFStreamParser
from arcfour import Arcfour
from utils import choplist, nunpack
2013-10-10 09:34:43 +00:00
from utils import decode_text
2013-10-10 09:29:30 +00:00
## Exceptions
##
2013-11-07 08:35:04 +00:00
class PDFNoValidXRef(PDFSyntaxError):
    """Raised when a cross-reference section cannot be located or parsed."""
class PDFNoOutlines(PDFException):
    """Raised when the document catalog has no /Outlines entry."""
class PDFDestinationNotFound(PDFException):
    """Raised when a named destination cannot be resolved."""
class PDFEncryptionError(PDFException):
    """Raised when the encryption dictionary is unsupported or invalid."""
class PDFPasswordIncorrect(PDFEncryptionError):
    """Raised when the supplied password fails authentication."""
2013-10-10 09:29:30 +00:00
2014-03-24 11:59:24 +00:00
class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Signals that text extraction is not permitted.

    NOTE(review): nothing in this module raises it; presumably callers
    raise it after checking PDFDocument.is_extractable -- confirm.
    """
2013-10-10 09:29:30 +00:00
# some predefined literals and keywords.
# /Type value that identifies an object stream.
LITERAL_OBJSTM = LIT('ObjStm')
# /Type value required of a cross-reference stream.
LITERAL_XREF = LIT('XRef')
# /Type value expected on the document catalog (the /Root dictionary).
LITERAL_CATALOG = LIT('Catalog')
## XRefs
##
class PDFBaseXRef(object):

    """Interface shared by every cross-reference implementation.

    Subclasses override all three methods; the defaults below describe
    an xref that knows no objects and has no trailer.
    """

    def get_trailer(self):
        """Return the trailer dictionary (subclasses must implement)."""
        raise NotImplementedError()

    def get_objids(self):
        """Return the object ids known to this xref (none by default)."""
        return list()

    def get_pos(self, objid):
        """Return the location of objid, or raise KeyError if unknown.

        Must return
          (strmid, index, genno)
          or (None, pos, genno)
        """
        raise KeyError(objid)
## PDFXRef
##
class PDFXRef(PDFBaseXRef):

    """Cross-reference table stored as plain text, followed by a trailer.

    offsets maps the id of every in-use object to (None, pos, genno);
    trailer accumulates the entries of the trailer dictionary.
    """

    KEYWORD_TRAILER = KWD('trailer')

    def __init__(self):
        self.offsets = {}
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

    def load(self, parser, debug=0):
        """Read xref subsections from parser, then the trailer.

        The parser must be positioned just after the 'xref' line.
        Every malformed-input condition is reported as PDFNoValidXRef
        so that callers can fall back to another strategy.
        """
        while 1:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            # Test for end of input before skipping blank lines;
            # otherwise this check can never fire.
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if not line.strip():
                continue
            if line.startswith('trailer'):
                # Rewind so that load_trailer() sees the keyword itself.
                parser.seek(pos)
                break
            # Subsection header: "<first objid> <number of entries>".
            # split() tolerates repeated or non-space whitespace.
            f = line.split()
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                (start, nobjs) = map(long, f)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in xrange(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                # Entry: "<offset> <generation> <n|f>".
                f = line.split()
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                if use != 'n':
                    # Only in-use entries are recorded.
                    continue
                try:
                    self.offsets[objid] = (None, long(pos), int(genno))
                except ValueError:
                    # Non-numeric fields must not escape as ValueError.
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
        if 1 <= debug:
            print >>sys.stderr, 'xref objects:', self.offsets
        self.load_trailer(parser)
        return

    def load_trailer(self, parser):
        """Parse 'trailer << ... >>' and merge it into self.trailer.

        Raises PDFNoValidXRef if the keyword is missing or the input
        ends before any dictionary was read.
        """
        try:
            (_, kwd) = parser.nexttoken()
            # Explicit check instead of assert, which -O would strip.
            if kwd is not self.KEYWORD_TRAILER:
                raise PDFNoValidXRef('Trailer keyword not found: %r' % (kwd,))
            (_, dic) = parser.nextobject()
        except PSEOF:
            # Input ended; use whatever the parser already has stacked.
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))
        return

    def get_trailer(self):
        """Return the accumulated trailer dictionary."""
        return self.trailer

    def get_objids(self):
        """Return an iterator over the ids of the recorded objects."""
        return self.offsets.iterkeys()

    def get_pos(self, objid):
        """Return (None, pos, genno) for objid; raise KeyError if unknown."""
        return self.offsets[objid]
## PDFXRefFallback
##
class PDFXRefFallback(PDFXRef):

    """Xref rebuilt by scanning the file for 'N G obj' markers.

    Used when the regular cross-reference data cannot be trusted.
    """

    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def __repr__(self):
        return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

    def load(self, parser, debug=0):
        """Scan parser from offset 0 and record every object found.

        Scanning stops at end of input or after the first line that
        starts with 'trailer', which is loaded as the trailer.
        """
        parser.seek(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith('trailer'):
                parser.seek(pos)
                self.load_trailer(parser)
                if 1 <= debug:
                    print >>sys.stderr, 'trailer: %r' % self.get_trailer()
                break
            m = self.PDFOBJ_CUE.match(line)
            if m is None:
                continue
            objid = int(m.group(1))
            genno = int(m.group(2))
            self.offsets[objid] = (None, pos, genno)
            # expand ObjStm.
            parser.seek(pos)
            (_, obj) = parser.nextobject()
            if not (isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM):
                continue
            stream = stream_value(obj)
            try:
                n = stream['N']
            except KeyError:
                if STRICT:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                n = 0
            # Collect every object the stream contains until it runs out.
            parser1 = PDFStreamParser(stream.get_data())
            objs = []
            try:
                while True:
                    (_, item) = parser1.nextobject()
                    objs.append(item)
            except PSEOF:
                pass
            # The even positions of objs are taken as the ids of the
            # embedded objects; never read past what was parsed.
            n = min(n, len(objs)//2)
            for index in xrange(n):
                self.offsets[objs[index*2]] = (objid, index, 0)
        return
## PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):

    """Cross-reference data stored in a stream object of /Type /XRef.

    data holds the decoded stream, made of fixed-size entries of
    entlen bytes, each split into three fields of fl1, fl2 and fl3
    bytes.  ranges lists the (first objid, count) subsections from
    /Index; their entries follow each other in data.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.ranges = []
        # Defined up front so get_trailer() works before load().
        self.trailer = {}
        return

    def __repr__(self):
        return '<PDFXRefStream: ranges=%r>' % (self.ranges)

    def load(self, parser, debug=0):
        """Read 'N G obj <stream>' from parser and remember its layout.

        Raises PDFNoValidXRef if the input ends early or the object is
        not a stream of /Type /XRef, and PDFSyntaxError if /Index has
        an odd number of items.
        """
        try:
            (_, objid) = parser.nexttoken()  # ignored
            (_, genno) = parser.nexttoken()  # ignored
            (_, kwd) = parser.nexttoken()
            (_, stream) = parser.nextobject()
        except PSEOF:
            # Report truncation the same way PDFXRef.load does.
            raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
        # .get() so that a missing /Type is reported as an invalid xref
        # instead of escaping as KeyError.
        if not isinstance(stream, PDFStream) or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        # Without /Index a single subsection covers objects 0..Size-1.
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.ranges.extend(choplist(2, index_array))
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        if 1 <= debug:
            print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
                                 (', '.join(map(repr, self.ranges)),
                                  self.fl1, self.fl2, self.fl3))
        return

    def get_trailer(self):
        """Return the stream attributes, which serve as the trailer."""
        return self.trailer

    def get_objids(self):
        """Yield the id of every entry whose type field is 1 or 2."""
        # Entries of all subsections are stored back to back, so the
        # entry index keeps counting across ranges (as in get_pos).
        base = 0
        for (start, nobjs) in self.ranges:
            for i in xrange(nobjs):
                offset = self.entlen * (base+i)
                ent = self.data[offset:offset+self.entlen]
                f1 = nunpack(ent[:self.fl1], 1)
                if f1 == 1 or f1 == 2:
                    yield start+i
            base += nobjs
        return

    def get_pos(self, objid):
        """Return the location of objid.

        Returns (None, pos, genno) for a type 1 entry and
        (strmid, index, 0) for a type 2 entry.  Raises KeyError if
        objid is outside every subsection or its entry has any other
        type.
        """
        index = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
                break
            else:
                index += nobjs
        else:
            raise KeyError(objid)
        offset = self.entlen * index
        ent = self.data[offset:offset+self.entlen]
        # The type field falls back to 1 when nunpack gets no bytes.
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            return (None, f2, f3)
        elif f1 == 2:
            return (f2, f3, 0)
        else:
            # this is a free object
            raise KeyError(objid)
2013-10-10 09:29:30 +00:00
## PDFDocument
##
class PDFDocument(object):

    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    debug = 0
    PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
    KEYWORD_OBJ = KWD('obj')

    def __init__(self, parser, password='', caching=True, fallback=True):
        """Set the document to use a given PDFParser object.

        password is used when a trailer has an /Encrypt entry.
        caching enables the object caches.  When fallback is true (the
        default) a PDFXRefFallback scan is always appended to the
        xrefs; it is forced on when no valid xref can be read.

        Raises PDFSyntaxError if no trailer has a /Root entry, plus
        whatever _initialize_password() raises.
        """
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._cached_objs = {}
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
            pos = self.find_xref(parser)
            self.read_xref_from(parser, pos, self.xrefs)
        except PDFNoValidXRef:
            fallback = True
        if fallback:
            parser.fallback = True
            xref = PDFXRefFallback()
            xref.load(parser)
            self.xrefs.append(xref)
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                #assert not self.encryption
                # Fall back to empty ids when /ID is absent rather
                # than failing with KeyError.
                if 'ID' in trailer:
                    docid = list_value(trailer['ID'])
                else:
                    docid = ['', '']
                self.encryption = (docid, dict_value(trailer['Encrypt']))
                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            if STRICT:
                raise PDFSyntaxError('Catalog not found!')
        return

    # _initialize_password(password='')
    #   Perform the initialization with a given password.
    def _initialize_password(self, password=''):
        """Check password against self.encryption and set up decryption.

        On success sets decrypt_key, decipher and the is_printable /
        is_modifiable / is_extractable flags.

        Raises PDFEncryptionError for an unknown filter, algorithm or
        revision, PDFNotImplementedError for revision 4 and
        PDFPasswordIncorrect if authentication fails.
        """
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        V = int_value(param.get('V', 0))
        if not (V == 1 or V == 2):
            raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
        length = int_value(param.get('Length', 40))  # Key length (bits)
        O = str_value(param['O'])
        R = int_value(param['R'])  # Revision
        if 5 <= R:
            raise PDFEncryptionError('Unknown revision: %r' % R)
        if R == 4:
            raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
        if R not in (2, 3):
            # Only revisions 2 and 3 are implemented below; anything
            # else would leave the expected /U value undefined.
            raise PDFEncryptionError('Unknown revision: %r' % R)
        U = str_value(param['U'])
        P = int_value(param['P'])
        self.is_printable = bool(P & 4)
        self.is_modifiable = bool(P & 8)
        self.is_extractable = bool(P & 16)
        # Algorithm 3.2
        password = (password+self.PASSWORD_PADDING)[:32]  # 1
        hash = md5.md5(password)  # 2
        hash.update(O)  # 3
        # Mask to 32 bits: same bytes as '<l' for negative values, and
        # no struct.error when /P was written as an unsigned number.
        hash.update(struct.pack('<L', P & 0xffffffff))  # 4
        hash.update(docid[0])  # 5
        if R == 3:
            # 8
            for _ in xrange(50):
                hash = md5.md5(hash.digest()[:length//8])
        key = hash.digest()[:length//8]
        if R == 2:
            # Algorithm 3.4
            u1 = Arcfour(key).process(self.PASSWORD_PADDING)
            is_authenticated = (u1 == U)
        else:
            # Algorithm 3.5
            hash = md5.md5(self.PASSWORD_PADDING)  # 2
            hash.update(docid[0])  # 3
            x = Arcfour(key).process(hash.digest()[:16])  # 4
            for i in xrange(1, 19+1):
                k = ''.join(chr(ord(c) ^ i) for c in key)
                x = Arcfour(k).process(x)
            u1 = x+x  # 32bytes total
            # Only the first 16 bytes are compared for revision 3.
            is_authenticated = (u1[:16] == U[:16])
        if not is_authenticated:
            raise PDFPasswordIncorrect
        self.decrypt_key = key
        self.decipher = self.decrypt_rc4  # XXX may be AES
        return

    def decrypt_rc4(self, objid, genno, data):
        """Decrypt data with the RC4 key derived for (objid, genno)."""
        key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
        hash = md5.md5(key)
        key = hash.digest()[:min(len(key), 16)]
        return Arcfour(key).process(data)

    def _getobj_objstm(self, stream, index, objid):
        """Return the index-th object stored in the object stream.

        Raises PDFSyntaxError if index is beyond the parsed objects.
        """
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            if self.caching:
                self._parsed_objs[stream.objid] = (objs, n)
        # The first n*2 parsed items are skipped; the objects follow.
        i = n*2+index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError('index too big: %r' % index)
        return obj

    def _get_objects(self, stream):
        """Parse everything in an object stream; return (objs, n).

        n is the stream's /N value, or 0 if it is missing and STRICT
        is off.
        """
        if stream.get('Type') is not LITERAL_OBJSTM:
            if STRICT:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
            n = stream['N']
        except KeyError:
            if STRICT:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            n = 0
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objs = []
        try:
            while 1:
                (_, obj) = parser.nextobject()
                objs.append(obj)
        except PSEOF:
            pass
        return (objs, n)

    def _getobj_parse(self, pos, objid):
        """Parse the 'objid genno obj ...' found at file offset pos.

        Raises PDFSyntaxError if the id does not match objid or the
        'obj' keyword is missing.
        """
        self._parser.seek(pos)
        (_, objid1) = self._parser.nexttoken()  # objid
        if objid1 != objid:
            raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
        (_, genno) = self._parser.nexttoken()  # genno
        (_, kwd) = self._parser.nexttoken()
        if kwd is not self.KEYWORD_OBJ:
            raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
        (_, obj) = self._parser.nextobject()
        return obj

    # can raise PDFObjectNotFound
    def getobj(self, objid):
        """Return the object with the given id.

        Each xref is tried in order until one yields the object.
        Raises PDFObjectNotFound if objid is 0 or no xref can supply
        it, and PDFException if the document has no xrefs.
        """
        if objid == 0:
            # Reported like any other missing object instead of via
            # assert, which -O would strip.
            raise PDFObjectNotFound(objid)
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        if 2 <= self.debug:
            print >>sys.stderr, 'getobj: objid=%r' % (objid)
        if objid in self._cached_objs:
            (obj, genno) = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                try:
                    if strmid is not None:
                        # Objects inside an object stream are covered by
                        # the stream's own encryption, so they are not
                        # deciphered a second time here.
                        stream = stream_value(self.getobj(strmid))
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        obj = self._getobj_parse(index, objid)
                        # Decipher once, before caching, so a cache
                        # hit never deciphers the same object again.
                        if self.decipher:
                            obj = decipher_all(self.decipher, objid, genno, obj)
                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                    break
                except (PSEOF, PDFSyntaxError):
                    continue
            else:
                raise PDFObjectNotFound(objid)
            if 2 <= self.debug:
                print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
            if self.caching:
                self._cached_objs[objid] = (obj, genno)
        return obj

    def get_outlines(self):
        """Return a generator of (level, title, dest, action, se) tuples.

        Raises PDFNoOutlines if the catalog has no /Outlines entry.
        """
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines

        def search(entry, level):
            entry = dict_value(entry)
            if 'Title' in entry:
                # Only entries that lead somewhere are reported.
                if 'A' in entry or 'Dest' in entry:
                    title = decode_text(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            if 'First' in entry and 'Last' in entry:
                for x in search(entry['First'], level+1):
                    yield x
            if 'Next' in entry:
                for x in search(entry['Next'], level):
                    yield x
            return
        return search(self.catalog['Outlines'], 0)

    def lookup_name(self, cat, key):
        """Look up key in the name tree stored under /Names/<cat>.

        Raises KeyError if the catalog has no usable /Names entry, if
        cat is missing, or if the search reaches no match.  Returns
        None when the root node's /Limits exclude key.
        """
        try:
            names = dict_value(self.catalog['Names'])
        except (PDFTypeError, KeyError):
            raise KeyError((cat, key))
        # may raise KeyError
        d0 = dict_value(names[cat])

        def lookup(d):
            if 'Limits' in d:
                (k1, k2) = list_value(d['Limits'])
                if key < k1 or k2 < key:
                    return None
            if 'Names' in d:
                objs = list_value(d['Names'])
                names = dict(choplist(2, objs))
                return names[key]
            if 'Kids' in d:
                for c in list_value(d['Kids']):
                    v = lookup(dict_value(c))
                    if v:
                        return v
            raise KeyError((cat, key))
        return lookup(d0)

    def get_dest(self, name):
        """Return the destination registered under name.

        Raises PDFDestinationNotFound if neither the name tree nor the
        catalog's /Dests dictionary has it.
        """
        try:
            # PDF-1.2 or later
            obj = self.lookup_name('Dests', name)
        except KeyError:
            # PDF-1.1 or prior
            if 'Dests' not in self.catalog:
                raise PDFDestinationNotFound(name)
            d0 = dict_value(self.catalog['Dests'])
            if name not in d0:
                raise PDFDestinationNotFound(name)
            obj = d0[name]
        return obj

    # find_xref
    def find_xref(self, parser):
        """Internal function used to locate the first XRef.

        Returns the offset given after the last 'startxref'.  Raises
        PDFNoValidXRef if there is no 'startxref' line or the value
        after it is not a number.
        """
        # search the last xref table by scanning the file backwards.
        prev = None
        for line in parser.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>sys.stderr, 'find_xref: %r' % line
            if line == 'startxref':
                break
            if line:
                prev = line
        else:
            raise PDFNoValidXRef('Unexpected EOF')
        if 1 <= self.debug:
            print >>sys.stderr, 'xref found: pos=%r' % prev
        try:
            return long(prev)
        except (TypeError, ValueError):
            # prev is None (nothing follows 'startxref') or not a
            # number; report it so the caller can fall back.
            raise PDFNoValidXRef('Invalid startxref position: %r' % (prev,))

    # read xref table
    def read_xref_from(self, parser, start, xrefs, _visited=None):
        """Reads XRefs from the given location.

        Appends the xref found at start to xrefs, then follows its
        /XRefStm and /Prev trailer entries.  _visited is internal: it
        holds the offsets already read, so that a file whose entries
        point back at each other cannot recurse forever.

        Raises PDFNoValidXRef if no xref can be read at start.
        """
        if _visited is None:
            _visited = set()
        if start in _visited:
            return
        _visited.add(start)
        parser.seek(start)
        parser.reset()
        try:
            (pos, token) = parser.nexttoken()
        except PSEOF:
            raise PDFNoValidXRef('Unexpected EOF')
        if 2 <= self.debug:
            print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
        if isinstance(token, int):
            # XRefStream: PDF-1.5
            parser.seek(pos)
            parser.reset()
            xref = PDFXRefStream()
            xref.load(parser, debug=self.debug)
        else:
            if token is parser.KEYWORD_XREF:
                # Skip the rest of the 'xref' line.
                parser.nextline()
            xref = PDFXRef()
            xref.load(parser, debug=self.debug)
        xrefs.append(xref)
        trailer = xref.get_trailer()
        if 1 <= self.debug:
            print >>sys.stderr, 'trailer: %r' % trailer
        if 'XRefStm' in trailer:
            pos = int_value(trailer['XRefStm'])
            self.read_xref_from(parser, pos, xrefs, _visited)
        if 'Prev' in trailer:
            # find previous xref
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs, _visited)
        return