Split pdfparser.py and pdfdocument.py.

pull/1/head
Yusuke Shinyama 2013-10-10 18:29:30 +09:00
parent 1467fc674c
commit 2221163b94
5 changed files with 667 additions and 649 deletions

657
pdfminer/pdfdocument.py Normal file
View File

@ -0,0 +1,657 @@
#!/usr/bin/env python2
import sys
import re
import struct
try:
import hashlib as md5
except ImportError:
import md5
from psparser import PSEOF
from psparser import literal_name
from psparser import LIT, KWD, STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFObjectNotFound, PDFStream
from pdftypes import resolve1, decipher_all
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from pdfparser import PDFSyntaxError
from pdfparser import PDFStreamParser
from arcfour import Arcfour
from utils import choplist, nunpack
from utils import decode_text, ObjIdRange
## Exceptions
##
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
LITERAL_CATALOG = LIT('Catalog')
## XRefs
##
class PDFBaseXRef(object):
def get_trailer(self):
raise NotImplementedError
def get_objids(self):
return []
def get_pos(self, objid):
raise KeyError(objid)
## PDFXRef
##
class PDFXRef(PDFBaseXRef):
def __init__(self):
self.offsets = {}
self.trailer = {}
return
def __repr__(self):
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
def load(self, parser, debug=0):
while 1:
try:
(pos, line) = parser.nextline()
if not line.strip(): continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
raise PDFNoValidXRef('Premature eof: %r' % parser)
if line.startswith('trailer'):
parser.seek(pos)
break
f = line.strip().split(' ')
if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try:
(start, nobjs) = map(long, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.strip().split(' ')
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
self.offsets[objid] = (int(genno), long(pos))
if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets
self.load_trailer(parser)
return
KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser):
try:
(_,kwd) = parser.nexttoken()
assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0]
self.trailer.update(dict_value(dic))
return
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load_fallback(self, parser, debug=0):
parser.seek(0)
while 1:
try:
(pos, line) = parser.nextline()
except PSEOF:
break
if line.startswith('trailer'):
parser.seek(pos)
self.load_trailer(parser)
if 1 <= debug:
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
(objid, genno) = m.groups()
self.offsets[int(objid)] = (0, pos)
return
def get_trailer(self):
return self.trailer
def get_objids(self):
return self.offsets.iterkeys()
def get_pos(self, objid):
try:
(genno, pos) = self.offsets[objid]
except KeyError:
raise
return (None, pos)
## PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
self.objid_ranges = []
return
def __repr__(self):
return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3)
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (0,size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.objid_ranges.extend( ObjIdRange(start, nobjs)
for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs
if 1 <= debug:
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges)),
self.fl1, self.fl2, self.fl3))
return
def get_trailer(self):
return self.trailer
def get_objids(self):
for objid_range in self.objid_ranges:
for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
yield x
return
def get_pos(self, objid):
offset = 0
found = False
for objid_range in self.objid_ranges:
if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
offset += objid - objid_range.get_start_id()
found = True
break
else:
offset += objid_range.get_nobjs()
if not found: raise KeyError(objid)
i = self.entlen * offset
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1:
pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
genno = nunpack(ent[self.fl1+self.fl2:])
return (None, pos)
elif f1 == 2:
objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
index = nunpack(ent[self.fl1+self.fl2:])
return (objid, index)
# this is a free object
raise KeyError(objid)
## PDFPage
##
class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes:
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
lastmod: the last modified time of the page.
resources: a list of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
"""
def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources = resolve1(self.attrs['Resources'])
self.mediabox = resolve1(self.attrs['MediaBox'])
if 'CropBox' in self.attrs:
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents
return
def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
## PDFDocument
##
class PDFDocument(object):
"""PDFDocument object represents a PDF document.
Since a PDF file can be very big, normally it is not loaded at
once. So PDF document has to cooperate with a PDF parser in order to
dynamically import the data as processing goes.
Typical usage:
doc = PDFDocument()
doc.set_parser(parser)
doc.initialize(password)
obj = doc.getobj(objid)
"""
debug = 0
def __init__(self, caching=True):
self.caching = caching
self.xrefs = []
self.info = []
self.catalog = None
self.encryption = None
self.decipher = None
self._parser = None
self._cached_objs = {}
self._parsed_objs = {}
return
def set_parser(self, parser, fallback=True):
"Set the document to use a given PDFParser object."
if self._parser: return
self._parser = parser
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
try:
self.xrefs = self.read_xref(parser)
except PDFNoValidXRef:
fallback = True
if fallback:
self.xrefs.extend(self.read_xref(parser, fallback=True))
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer: continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
if 'Info' in trailer:
self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root'])
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG:
if STRICT:
raise PDFSyntaxError('Catalog not found!')
return
# initialize(password='')
# Perform the initialization with a given password.
# This step is mandatory even if there's no password associated
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
return
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
V = int_value(param.get('V', 0))
if not (V == 1 or V == 2):
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits)
O = str_value(param['O'])
R = int_value(param['R']) # Revision
if 5 <= R:
raise PDFEncryptionError('Unknown revision: %r' % R)
U = str_value(param['U'])
P = int_value(param['P'])
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R:
# 8
for _ in xrange(50):
hash = md5.md5(hash.digest()[:length/8])
key = hash.digest()[:length/8]
if R == 2:
# Algorithm 3.4
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1):
k = ''.join( chr(ord(c) ^ i) for c in key )
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
if R == 2:
is_authenticated = (u1 == U)
else:
is_authenticated = (u1[:16] == U[:16])
if not is_authenticated:
raise PDFPasswordIncorrect
self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES
return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key),16)]
return Arcfour(key).process(data)
KEYWORD_OBJ = KWD('obj')
# can raise PDFObjectNotFound
def getobj(self, objid):
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
if 2 <= self.debug:
print >>sys.stderr, 'getobj: objid=%r' % (objid)
if objid in self._cached_objs:
genno = 0
obj = self._cached_objs[objid]
else:
for xref in self.xrefs:
try:
(strmid, index) = xref.get_pos(objid)
break
except KeyError:
pass
else:
raise PDFObjectNotFound(objid)
if strmid:
stream = stream_value(self.getobj(strmid))
if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if strmid in self._parsed_objs:
objs = self._parsed_objs[strmid]
else:
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
if self.caching:
self._parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFObjectNotFound(objid)
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else:
self._parser.seek(index)
try:
(_,objid1) = self._parser.nexttoken() # objid
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
#assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self._parser.nextobject()
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
except PSEOF:
raise PDFObjectNotFound(objid)
if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching:
self._cached_objs[objid] = obj
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self):
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
def search(obj, parent):
if isinstance(obj, int):
objid = obj
tree = dict_value(self.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= self.debug:
print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree)
pages = False
if 'Pages' in self.catalog:
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, objid, tree)
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in self.xrefs:
for objid in xref.get_objids():
try:
obj = self.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
yield PDFPage(self, objid, obj)
except PDFObjectNotFound:
pass
return
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
yield (level, title, dest, action, se)
if 'First' in entry and 'Last' in entry:
for x in search(entry['First'], level+1):
yield x
if 'Next' in entry:
for x in search(entry['Next'], level):
yield x
return
return search(self.catalog['Outlines'], 0)
def lookup_name(self, cat, key):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat,key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1,k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
return names[key]
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v: return v
raise KeyError((cat,key))
return lookup(d0)
def get_dest(self, name):
try:
# PDF-1.2 or later
obj = self.lookup_name('Dests', name)
except KeyError:
# PDF-1.1 or prior
if 'Dests' not in self.catalog:
raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests'])
if name not in d0:
raise PDFDestinationNotFound(name)
obj = d0[name]
return obj
# find_xref
def find_xref(self, parser):
"""Internal function used to locate the first XRef."""
# search the last xref table by scanning the file backwards.
prev = None
for line in parser.revreadlines():
line = line.strip()
if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
if 1 <= self.debug:
print >>sys.stderr, 'xref found: pos=%r' % prev
return long(prev)
# read xref table
def read_xref_from(self, parser, start, xrefs):
"""Reads XRefs from the given location."""
parser.seek(start)
parser.reset()
try:
(pos, token) = parser.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
if 2 <= self.debug:
print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
if isinstance(token, int):
# XRefStream: PDF-1.5
parser.seek(pos)
parser.reset()
xref = PDFXRefStream()
xref.load(parser, debug=self.debug)
else:
if token is parser.KEYWORD_XREF:
parser.nextline()
xref = PDFXRef()
xref.load(parser, debug=self.debug)
xrefs.append(xref)
trailer = xref.get_trailer()
if 1 <= self.debug:
print >>sys.stderr, 'trailer: %r' % trailer
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(parser, pos, xrefs)
if 'Prev' in trailer:
# find previous xref
pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs)
return
# read xref tables and trailers
def read_xref(self, parser, fallback=False):
"""Reads all the XRefs in the PDF file and returns them."""
xrefs = []
parser.fallback = fallback
if parser.fallback:
xref = PDFXRef()
xref.load_fallback(parser)
xrefs.append(xref)
else:
pos = self.find_xref(parser)
self.read_xref_from(parser, pos, xrefs)
return xrefs

View File

@ -17,8 +17,9 @@ from pdftypes import str_value, list_value, dict_value, stream_value
from pdffont import PDFFontError
from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font
from pdffont import PDFCIDFont
from pdfparser import PDFDocument, PDFParser
from pdfparser import PDFPasswordIncorrect
from pdfparser import PDFParser
from pdfdocument import PDFDocument
from pdfdocument import PDFPasswordIncorrect
from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB

View File

@ -1,597 +1,21 @@
#!/usr/bin/env python2
import sys
import re
import struct
try:
import hashlib as md5
except ImportError:
import md5
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
from psparser import literal_name
from psparser import LIT, KWD, STRICT
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef
from pdftypes import resolve1, decipher_all
from psparser import KWD, STRICT
from pdftypes import PDFException
from pdftypes import PDFStream, PDFObjRef
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from arcfour import Arcfour
from utils import choplist, nunpack
from utils import decode_text, ObjIdRange
## Exceptions
##
class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
LITERAL_CATALOG = LIT('Catalog')
## XRefs
##
class PDFBaseXRef(object):
def get_trailer(self):
raise NotImplementedError
def get_objids(self):
return []
def get_pos(self, objid):
raise KeyError(objid)
## PDFXRef
##
class PDFXRef(PDFBaseXRef):
def __init__(self):
self.offsets = {}
self.trailer = {}
return
def __repr__(self):
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
def load(self, parser, debug=0):
while 1:
try:
(pos, line) = parser.nextline()
if not line.strip(): continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
raise PDFNoValidXRef('Premature eof: %r' % parser)
if line.startswith('trailer'):
parser.seek(pos)
break
f = line.strip().split(' ')
if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try:
(start, nobjs) = map(long, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.strip().split(' ')
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
self.offsets[objid] = (int(genno), long(pos))
if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets
self.load_trailer(parser)
return
KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser):
try:
(_,kwd) = parser.nexttoken()
assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0]
self.trailer.update(dict_value(dic))
return
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load_fallback(self, parser, debug=0):
parser.seek(0)
while 1:
try:
(pos, line) = parser.nextline()
except PSEOF:
break
if line.startswith('trailer'):
parser.seek(pos)
self.load_trailer(parser)
if 1 <= debug:
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
(objid, genno) = m.groups()
self.offsets[int(objid)] = (0, pos)
return
def get_trailer(self):
return self.trailer
def get_objids(self):
return self.offsets.iterkeys()
def get_pos(self, objid):
try:
(genno, pos) = self.offsets[objid]
except KeyError:
raise
return (None, pos)
## PDFXRefStream
##
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
self.objid_ranges = []
return
def __repr__(self):
return '<PDFXRefStream: fields=%d,%d,%d>' % (self.fl1, self.fl2, self.fl3)
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (0,size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.objid_ranges.extend( ObjIdRange(start, nobjs)
for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs
if 1 <= debug:
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges)),
self.fl1, self.fl2, self.fl3))
return
def get_trailer(self):
return self.trailer
def get_objids(self):
for objid_range in self.objid_ranges:
for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
yield x
return
def get_pos(self, objid):
offset = 0
found = False
for objid_range in self.objid_ranges:
if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
offset += objid - objid_range.get_start_id()
found = True
break
else:
offset += objid_range.get_nobjs()
if not found: raise KeyError(objid)
i = self.entlen * offset
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1:
pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
genno = nunpack(ent[self.fl1+self.fl2:])
return (None, pos)
elif f1 == 2:
objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
index = nunpack(ent[self.fl1+self.fl2:])
return (objid, index)
# this is a free object
raise KeyError(objid)
## PDFPage
##
class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes:
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
lastmod: the last modified time of the page.
resources: a list of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
"""
def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources = resolve1(self.attrs['Resources'])
self.mediabox = resolve1(self.attrs['MediaBox'])
if 'CropBox' in self.attrs:
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents
return
def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
## PDFDocument
##
class PDFDocument(object):
"""PDFDocument object represents a PDF document.
Since a PDF file can be very big, normally it is not loaded at
once. So PDF document has to cooperate with a PDF parser in order to
dynamically import the data as processing goes.
Typical usage:
doc = PDFDocument()
doc.set_parser(parser)
doc.initialize(password)
obj = doc.getobj(objid)
"""
debug = 0
def __init__(self, caching=True):
self.caching = caching
self.xrefs = []
self.info = []
self.catalog = None
self.encryption = None
self.decipher = None
self._parser = None
self._cached_objs = {}
self._parsed_objs = {}
return
def set_parser(self, parser, fallback=True):
"Set the document to use a given PDFParser object."
if self._parser: return
self._parser = parser
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
try:
self.xrefs = parser.read_xref()
except PDFNoValidXRef:
fallback = True
if fallback:
self.xrefs.extend(parser.read_xref(fallback=True))
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer: continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt']))
if 'Info' in trailer:
self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer:
# Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root'])
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG:
if STRICT:
raise PDFSyntaxError('Catalog not found!')
return
# initialize(password='')
# Perform the initialization with a given password.
# This step is mandatory even if there's no password associated
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
return
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
V = int_value(param.get('V', 0))
if not (V == 1 or V == 2):
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits)
O = str_value(param['O'])
R = int_value(param['R']) # Revision
if 5 <= R:
raise PDFEncryptionError('Unknown revision: %r' % R)
U = str_value(param['U'])
P = int_value(param['P'])
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R:
# 8
for _ in xrange(50):
hash = md5.md5(hash.digest()[:length/8])
key = hash.digest()[:length/8]
if R == 2:
# Algorithm 3.4
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1):
k = ''.join( chr(ord(c) ^ i) for c in key )
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
if R == 2:
is_authenticated = (u1 == U)
else:
is_authenticated = (u1[:16] == U[:16])
if not is_authenticated:
raise PDFPasswordIncorrect
self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES
return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key),16)]
return Arcfour(key).process(data)
KEYWORD_OBJ = KWD('obj')
# can raise PDFObjectNotFound
def getobj(self, objid):
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
if 2 <= self.debug:
print >>sys.stderr, 'getobj: objid=%r' % (objid)
if objid in self._cached_objs:
genno = 0
obj = self._cached_objs[objid]
else:
for xref in self.xrefs:
try:
(strmid, index) = xref.get_pos(objid)
break
except KeyError:
pass
else:
raise PDFObjectNotFound(objid)
if strmid:
stream = stream_value(self.getobj(strmid))
if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if strmid in self._parsed_objs:
objs = self._parsed_objs[strmid]
else:
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
if self.caching:
self._parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
obj = objs[i]
except IndexError:
raise PDFObjectNotFound(objid)
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else:
self._parser.seek(index)
try:
(_,objid1) = self._parser.nexttoken() # objid
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
# #### hack around malformed pdf files
#assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self._parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self._parser.nextobject()
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
except PSEOF:
raise PDFObjectNotFound(objid)
if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
if self.caching:
self._cached_objs[objid] = obj
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self):
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
def search(obj, parent):
if isinstance(obj, int):
objid = obj
tree = dict_value(self.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= self.debug:
print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree)
pages = False
if 'Pages' in self.catalog:
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, objid, tree)
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in self.xrefs:
for objid in xref.get_objids():
try:
obj = self.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
yield PDFPage(self, objid, obj)
except PDFObjectNotFound:
pass
return
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
yield (level, title, dest, action, se)
if 'First' in entry and 'Last' in entry:
for x in search(entry['First'], level+1):
yield x
if 'Next' in entry:
for x in search(entry['Next'], level):
yield x
return
return search(self.catalog['Outlines'], 0)
def lookup_name(self, cat, key):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat,key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1,k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
return names[key]
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v: return v
raise KeyError((cat,key))
return lookup(d0)
def get_dest(self, name):
try:
# PDF-1.2 or later
obj = self.lookup_name('Dests', name)
except KeyError:
# PDF-1.1 or prior
if 'Dests' not in self.catalog:
raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests'])
if name not in d0:
raise PDFDestinationNotFound(name)
obj = d0[name]
return obj
## PDFParser
@ -704,72 +128,6 @@ class PDFParser(PSStackParser):
return
def find_xref(self):
"""Internal function used to locate the first XRef."""
# search the last xref table by scanning the file backwards.
prev = None
for line in self.revreadlines():
line = line.strip()
if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
if 1 <= self.debug:
print >>sys.stderr, 'xref found: pos=%r' % prev
return long(prev)
# read xref table
def read_xref_from(self, start, xrefs):
"""Reads XRefs from the given location."""
self.seek(start)
self.reset()
try:
(pos, token) = self.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
if 2 <= self.debug:
print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
if isinstance(token, int):
# XRefStream: PDF-1.5
self.seek(pos)
self.reset()
xref = PDFXRefStream()
xref.load(self, debug=self.debug)
else:
if token is self.KEYWORD_XREF:
self.nextline()
xref = PDFXRef()
xref.load(self, debug=self.debug)
xrefs.append(xref)
trailer = xref.get_trailer()
if 1 <= self.debug:
print >>sys.stderr, 'trailer: %r' % trailer
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
self.read_xref_from(pos, xrefs)
if 'Prev' in trailer:
# find previous xref
pos = int_value(trailer['Prev'])
self.read_xref_from(pos, xrefs)
return
# read xref tables and trailers
def read_xref(self, fallback=False):
"""Reads all the XRefs in the PDF file and returns them."""
xrefs = []
self.fallback = fallback
if self.fallback:
xref = PDFXRef()
xref.load_fallback(self)
xrefs.append(xref)
else:
pos = self.find_xref()
self.read_xref_from(pos, xrefs)
return xrefs
## PDFStreamParser
##

View File

@ -8,7 +8,8 @@
#
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python2
import sys
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter