diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py new file mode 100644 index 0000000..c476ad1 --- /dev/null +++ b/pdfminer/pdfdocument.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python2 +import sys +import re +import struct +try: + import hashlib as md5 +except ImportError: + import md5 +from psparser import PSEOF +from psparser import literal_name +from psparser import LIT, KWD, STRICT +from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError +from pdftypes import PDFObjectNotFound, PDFStream +from pdftypes import resolve1, decipher_all +from pdftypes import int_value, float_value, num_value +from pdftypes import str_value, list_value, dict_value, stream_value +from pdfparser import PDFSyntaxError +from pdfparser import PDFStreamParser +from arcfour import Arcfour +from utils import choplist, nunpack +from utils import decode_text, ObjIdRange + + +## Exceptions +## +class PDFNoValidXRef(PDFSyntaxError): pass +class PDFNoOutlines(PDFException): pass +class PDFDestinationNotFound(PDFException): pass +class PDFEncryptionError(PDFException): pass +class PDFPasswordIncorrect(PDFEncryptionError): pass + +# some predefined literals and keywords. +LITERAL_OBJSTM = LIT('ObjStm') +LITERAL_XREF = LIT('XRef') +LITERAL_PAGE = LIT('Page') +LITERAL_PAGES = LIT('Pages') +LITERAL_CATALOG = LIT('Catalog') + + +## XRefs +## +class PDFBaseXRef(object): + + def get_trailer(self): + raise NotImplementedError + + def get_objids(self): + return [] + + def get_pos(self, objid): + raise KeyError(objid) + + +## PDFXRef +## +class PDFXRef(PDFBaseXRef): + + def __init__(self): + self.offsets = {} + self.trailer = {} + return + + def __repr__(self): + return '' % (self.offsets.keys()) + + def load(self, parser, debug=0): + while 1: + try: + (pos, line) = parser.nextline() + if not line.strip(): continue + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + if not line: + raise PDFNoValidXRef('Premature eof: %r' % parser) + if line.startswith('trailer'): + parser.seek(pos) + break + f = line.strip().split(' ') + if len(f) != 2: + raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) + try: + (start, nobjs) = map(long, f) + except ValueError: + raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) + for objid in xrange(start, start+nobjs): + try: + (_, line) = parser.nextline() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + f = line.strip().split(' ') + if len(f) != 3: + raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) + (pos, genno, use) = f + if use != 'n': continue + self.offsets[objid] = (int(genno), long(pos)) + if 1 <= debug: + print >>sys.stderr, 'xref objects:', self.offsets + self.load_trailer(parser) + return + + KEYWORD_TRAILER = KWD('trailer') + def load_trailer(self, parser): + try: + (_,kwd) = parser.nexttoken() + assert kwd is self.KEYWORD_TRAILER + (_,dic) = parser.nextobject() + except PSEOF: + x = parser.pop(1) + if not x: + raise PDFNoValidXRef('Unexpected EOF - file corrupted') + (_,dic) = x[0] + self.trailer.update(dict_value(dic)) + return + + PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') + def load_fallback(self, parser, debug=0): + parser.seek(0) + while 1: + try: + (pos, line) = parser.nextline() + except PSEOF: + break + if line.startswith('trailer'): + parser.seek(pos) + self.load_trailer(parser) + if 1 <= debug: + print >>sys.stderr, 'trailer: %r' % self.get_trailer() + break + m = self.PDFOBJ_CUE.match(line) + if not m: continue + (objid, genno) = m.groups() + self.offsets[int(objid)] = (0, pos) + return + + def get_trailer(self): + return self.trailer + + def get_objids(self): + return self.offsets.iterkeys() + + def get_pos(self, objid): + try: + (genno, pos) = self.offsets[objid] + except KeyError: + raise + return (None, pos) + + +## PDFXRefStream +## +class PDFXRefStream(PDFBaseXRef): + + def __init__(self): + self.data = None + self.entlen = None + self.fl1 = self.fl2 = self.fl3 = None + self.objid_ranges = [] + return + + def __repr__(self): + return '' % (self.fl1, self.fl2, self.fl3) + + def load(self, parser, debug=0): + (_,objid) = parser.nexttoken() # ignored + (_,genno) = parser.nexttoken() # ignored + (_,kwd) = parser.nexttoken() + (_,stream) = parser.nextobject() + if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: + raise PDFNoValidXRef('Invalid PDF stream spec.') + size = stream['Size'] + index_array = stream.get('Index', (0,size)) + if len(index_array) % 2 != 0: + raise PDFSyntaxError('Invalid index number') + self.objid_ranges.extend( ObjIdRange(start, nobjs) + for (start,nobjs) in choplist(2, index_array) ) + (self.fl1, self.fl2, self.fl3) = stream['W'] + self.data = stream.get_data() + self.entlen = self.fl1+self.fl2+self.fl3 + self.trailer = stream.attrs + if 1 <= debug: + print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % + (', '.join(map(repr, self.objid_ranges)), + self.fl1, self.fl2, self.fl3)) + return + + def get_trailer(self): + return self.trailer + + def get_objids(self): + for objid_range in self.objid_ranges: + for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1): + yield x + return + + def get_pos(self, objid): + offset = 0 + found = False + for objid_range in self.objid_ranges: + if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id(): + offset += objid - objid_range.get_start_id() + found = True + break + else: + offset += objid_range.get_nobjs() + if not found: raise KeyError(objid) + i = self.entlen * offset + ent = self.data[i:i+self.entlen] + f1 = nunpack(ent[:self.fl1], 1) + if f1 == 1: + pos = nunpack(ent[self.fl1:self.fl1+self.fl2]) + genno = nunpack(ent[self.fl1+self.fl2:]) + return (None, pos) + elif f1 == 2: + objid = nunpack(ent[self.fl1:self.fl1+self.fl2]) + index = nunpack(ent[self.fl1+self.fl2:]) + return (objid, index) + # this is a free object + raise KeyError(objid) + + +## PDFPage +## +class PDFPage(object): + + """An object that holds the information about a page. + + A PDFPage object is merely a convenience class that has a set + of keys and values, which describe the properties of a page + and point to its contents. + + Attributes: + doc: a PDFDocument object. + pageid: any Python object that can uniquely identify the page. + attrs: a dictionary of page attributes. + contents: a list of PDFStream objects that represents the page content. + lastmod: the last modified time of the page. + resources: a list of resources used by the page. + mediabox: the physical size of the page. + cropbox: the crop rectangle of the page. + rotate: the page rotation (in degree). + annots: the page annotations. + beads: a chain that represents natural reading order. + """ + + def __init__(self, doc, pageid, attrs): + """Initialize a page object. + + doc: a PDFDocument object. + pageid: any Python object that can uniquely identify the page. + attrs: a dictionary of page attributes. + """ + self.doc = doc + self.pageid = pageid + self.attrs = dict_value(attrs) + self.lastmod = resolve1(self.attrs.get('LastModified')) + self.resources = resolve1(self.attrs['Resources']) + self.mediabox = resolve1(self.attrs['MediaBox']) + if 'CropBox' in self.attrs: + self.cropbox = resolve1(self.attrs['CropBox']) + else: + self.cropbox = self.mediabox + self.rotate = (self.attrs.get('Rotate', 0)+360) % 360 + self.annots = self.attrs.get('Annots') + self.beads = self.attrs.get('B') + if 'Contents' in self.attrs: + contents = resolve1(self.attrs['Contents']) + else: + contents = [] + if not isinstance(contents, list): + contents = [ contents ] + self.contents = contents + return + + def __repr__(self): + return '' % (self.resources, self.mediabox) + + +## PDFDocument +## +class PDFDocument(object): + + """PDFDocument object represents a PDF document. + + Since a PDF file can be very big, normally it is not loaded at + once. So PDF document has to cooperate with a PDF parser in order to + dynamically import the data as processing goes. + + Typical usage: + doc = PDFDocument() + doc.set_parser(parser) + doc.initialize(password) + obj = doc.getobj(objid) + + """ + + debug = 0 + + def __init__(self, caching=True): + self.caching = caching + self.xrefs = [] + self.info = [] + self.catalog = None + self.encryption = None + self.decipher = None + self._parser = None + self._cached_objs = {} + self._parsed_objs = {} + return + + def set_parser(self, parser, fallback=True): + "Set the document to use a given PDFParser object." + if self._parser: return + self._parser = parser + # Retrieve the information of each header that was appended + # (maybe multiple times) at the end of the document. + try: + self.xrefs = self.read_xref(parser) + except PDFNoValidXRef: + fallback = True + if fallback: + self.xrefs.extend(self.read_xref(parser, fallback=True)) + for xref in self.xrefs: + trailer = xref.get_trailer() + if not trailer: continue + # If there's an encryption info, remember it. + if 'Encrypt' in trailer: + #assert not self.encryption + self.encryption = (list_value(trailer['ID']), + dict_value(trailer['Encrypt'])) + if 'Info' in trailer: + self.info.append(dict_value(trailer['Info'])) + if 'Root' in trailer: + # Every PDF file must have exactly one /Root dictionary. + self.catalog = dict_value(trailer['Root']) + break + else: + raise PDFSyntaxError('No /Root object! - Is this really a PDF?') + if self.catalog.get('Type') is not LITERAL_CATALOG: + if STRICT: + raise PDFSyntaxError('Catalog not found!') + return + + # initialize(password='') + # Perform the initialization with a given password. + # This step is mandatory even if there's no password associated + # with the document. + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' + def initialize(self, password=''): + if not self.encryption: + self.is_printable = self.is_modifiable = self.is_extractable = True + return + (docid, param) = self.encryption + if literal_name(param.get('Filter')) != 'Standard': + raise PDFEncryptionError('Unknown filter: param=%r' % param) + V = int_value(param.get('V', 0)) + if not (V == 1 or V == 2): + raise PDFEncryptionError('Unknown algorithm: param=%r' % param) + length = int_value(param.get('Length', 40)) # Key length (bits) + O = str_value(param['O']) + R = int_value(param['R']) # Revision + if 5 <= R: + raise PDFEncryptionError('Unknown revision: %r' % R) + U = str_value(param['U']) + P = int_value(param['P']) + self.is_printable = bool(P & 4) + self.is_modifiable = bool(P & 8) + self.is_extractable = bool(P & 16) + # Algorithm 3.2 + password = (password+self.PASSWORD_PADDING)[:32] # 1 + hash = md5.md5(password) # 2 + hash.update(O) # 3 + hash.update(struct.pack('>sys.stderr, 'getobj: objid=%r' % (objid) + if objid in self._cached_objs: + genno = 0 + obj = self._cached_objs[objid] + else: + for xref in self.xrefs: + try: + (strmid, index) = xref.get_pos(objid) + break + except KeyError: + pass + else: + raise PDFObjectNotFound(objid) + if strmid: + stream = stream_value(self.getobj(strmid)) + if stream.get('Type') is not LITERAL_OBJSTM: + if STRICT: + raise PDFSyntaxError('Not a stream object: %r' % stream) + try: + n = stream['N'] + except KeyError: + if STRICT: + raise PDFSyntaxError('N is not defined: %r' % stream) + n = 0 + if strmid in self._parsed_objs: + objs = self._parsed_objs[strmid] + else: + parser = PDFStreamParser(stream.get_data()) + parser.set_document(self) + objs = [] + try: + while 1: + (_,obj) = parser.nextobject() + objs.append(obj) + except PSEOF: + pass + if self.caching: + self._parsed_objs[strmid] = objs + genno = 0 + i = n*2+index + try: + obj = objs[i] + except IndexError: + raise PDFObjectNotFound(objid) + if isinstance(obj, PDFStream): + obj.set_objid(objid, 0) + else: + self._parser.seek(index) + try: + (_,objid1) = self._parser.nexttoken() # objid + (_,genno) = self._parser.nexttoken() # genno + (_,kwd) = self._parser.nexttoken() + # #### hack around malformed pdf files + #assert objid1 == objid, (objid, objid1) + if objid1 != objid: + x = [] + while kwd is not self.KEYWORD_OBJ: + (_,kwd) = self._parser.nexttoken() + x.append(kwd) + if x: + objid1 = x[-2] + genno = x[-1] + # #### end hack around malformed pdf files + if kwd is not self.KEYWORD_OBJ: + raise PDFSyntaxError('Invalid object spec: offset=%r' % index) + (_,obj) = self._parser.nextobject() + if isinstance(obj, PDFStream): + obj.set_objid(objid, genno) + except PSEOF: + raise PDFObjectNotFound(objid) + if 2 <= self.debug: + print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) + if self.caching: + self._cached_objs[objid] = obj + if self.decipher: + obj = decipher_all(self.decipher, objid, genno, obj) + return obj + + INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) + def get_pages(self): + if not self.xrefs: + raise PDFException('PDFDocument is not initialized') + def search(obj, parent): + if isinstance(obj, int): + objid = obj + tree = dict_value(self.getobj(objid)).copy() + else: + objid = obj.objid + tree = dict_value(obj).copy() + for (k,v) in parent.iteritems(): + if k in self.INHERITABLE_ATTRS and k not in tree: + tree[k] = v + if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: + if 1 <= self.debug: + print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids'] + for c in list_value(tree['Kids']): + for x in search(c, tree): + yield x + elif tree.get('Type') is LITERAL_PAGE: + if 1 <= self.debug: + print >>sys.stderr, 'Page: %r' % tree + yield (objid, tree) + pages = False + if 'Pages' in self.catalog: + for (objid,tree) in search(self.catalog['Pages'], self.catalog): + yield PDFPage(self, objid, tree) + pages = True + if not pages: + # fallback when /Pages is missing. + for xref in self.xrefs: + for objid in xref.get_objids(): + try: + obj = self.getobj(objid) + if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE: + yield PDFPage(self, objid, obj) + except PDFObjectNotFound: + pass + return + + def get_outlines(self): + if 'Outlines' not in self.catalog: + raise PDFNoOutlines + def search(entry, level): + entry = dict_value(entry) + if 'Title' in entry: + if 'A' in entry or 'Dest' in entry: + title = decode_text(str_value(entry['Title'])) + dest = entry.get('Dest') + action = entry.get('A') + se = entry.get('SE') + yield (level, title, dest, action, se) + if 'First' in entry and 'Last' in entry: + for x in search(entry['First'], level+1): + yield x + if 'Next' in entry: + for x in search(entry['Next'], level): + yield x + return + return search(self.catalog['Outlines'], 0) + + def lookup_name(self, cat, key): + try: + names = dict_value(self.catalog['Names']) + except (PDFTypeError, KeyError): + raise KeyError((cat,key)) + # may raise KeyError + d0 = dict_value(names[cat]) + def lookup(d): + if 'Limits' in d: + (k1,k2) = list_value(d['Limits']) + if key < k1 or k2 < key: return None + if 'Names' in d: + objs = list_value(d['Names']) + names = dict(choplist(2, objs)) + return names[key] + if 'Kids' in d: + for c in list_value(d['Kids']): + v = lookup(dict_value(c)) + if v: return v + raise KeyError((cat,key)) + return lookup(d0) + + def get_dest(self, name): + try: + # PDF-1.2 or later + obj = self.lookup_name('Dests', name) + except KeyError: + # PDF-1.1 or prior + if 'Dests' not in self.catalog: + raise PDFDestinationNotFound(name) + d0 = dict_value(self.catalog['Dests']) + if name not in d0: + raise PDFDestinationNotFound(name) + obj = d0[name] + return obj + + # find_xref + def find_xref(self, parser): + """Internal function used to locate the first XRef.""" + # search the last xref table by scanning the file backwards. + prev = None + for line in parser.revreadlines(): + line = line.strip() + if 2 <= self.debug: + print >>sys.stderr, 'find_xref: %r' % line + if line == 'startxref': break + if line: + prev = line + else: + raise PDFNoValidXRef('Unexpected EOF') + if 1 <= self.debug: + print >>sys.stderr, 'xref found: pos=%r' % prev + return long(prev) + + # read xref table + def read_xref_from(self, parser, start, xrefs): + """Reads XRefs from the given location.""" + parser.seek(start) + parser.reset() + try: + (pos, token) = parser.nexttoken() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF') + if 2 <= self.debug: + print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token) + if isinstance(token, int): + # XRefStream: PDF-1.5 + parser.seek(pos) + parser.reset() + xref = PDFXRefStream() + xref.load(parser, debug=self.debug) + else: + if token is parser.KEYWORD_XREF: + parser.nextline() + xref = PDFXRef() + xref.load(parser, debug=self.debug) + xrefs.append(xref) + trailer = xref.get_trailer() + if 1 <= self.debug: + print >>sys.stderr, 'trailer: %r' % trailer + if 'XRefStm' in trailer: + pos = int_value(trailer['XRefStm']) + self.read_xref_from(parser, pos, xrefs) + if 'Prev' in trailer: + # find previous xref + pos = int_value(trailer['Prev']) + self.read_xref_from(parser, pos, xrefs) + return + + # read xref tables and trailers + def read_xref(self, parser, fallback=False): + """Reads all the XRefs in the PDF file and returns them.""" + xrefs = [] + parser.fallback = fallback + if parser.fallback: + xref = PDFXRef() + xref.load_fallback(parser) + xrefs.append(xref) + else: + pos = self.find_xref(parser) + self.read_xref_from(parser, pos, xrefs) + return xrefs diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 849f825..350804b 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -17,8 +17,9 @@ from pdftypes import str_value, list_value, dict_value, stream_value from pdffont import PDFFontError from pdffont import PDFType1Font, PDFTrueTypeFont, PDFType3Font from pdffont import PDFCIDFont -from pdfparser import PDFDocument, PDFParser -from pdfparser import PDFPasswordIncorrect +from pdfparser import PDFParser +from pdfdocument import PDFDocument +from pdfdocument import PDFPasswordIncorrect from pdfcolor import PDFColorSpace from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index df5890f..57262b0 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,597 +1,21 @@ #!/usr/bin/env python2 import sys -import re -import struct -try: - import hashlib as md5 -except ImportError: - import md5 try: from cStringIO import StringIO except ImportError: from StringIO import StringIO from psparser import PSStackParser from psparser import PSSyntaxError, PSEOF -from psparser import literal_name -from psparser import LIT, KWD, STRICT -from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError -from pdftypes import PDFObjectNotFound, PDFStream, PDFObjRef -from pdftypes import resolve1, decipher_all +from psparser import KWD, STRICT +from pdftypes import PDFException +from pdftypes import PDFStream, PDFObjRef from pdftypes import int_value, float_value, num_value from pdftypes import str_value, list_value, dict_value, stream_value -from arcfour import Arcfour -from utils import choplist, nunpack -from utils import decode_text, ObjIdRange ## Exceptions ## class PDFSyntaxError(PDFException): pass -class PDFNoValidXRef(PDFSyntaxError): pass -class PDFNoOutlines(PDFException): pass -class PDFDestinationNotFound(PDFException): pass -class PDFEncryptionError(PDFException): pass -class PDFPasswordIncorrect(PDFEncryptionError): pass - -# some predefined literals and keywords. -LITERAL_OBJSTM = LIT('ObjStm') -LITERAL_XREF = LIT('XRef') -LITERAL_PAGE = LIT('Page') -LITERAL_PAGES = LIT('Pages') -LITERAL_CATALOG = LIT('Catalog') - - -## XRefs -## -class PDFBaseXRef(object): - - def get_trailer(self): - raise NotImplementedError - - def get_objids(self): - return [] - - def get_pos(self, objid): - raise KeyError(objid) - - -## PDFXRef -## -class PDFXRef(PDFBaseXRef): - - def __init__(self): - self.offsets = {} - self.trailer = {} - return - - def __repr__(self): - return '' % (self.offsets.keys()) - - def load(self, parser, debug=0): - while 1: - try: - (pos, line) = parser.nextline() - if not line.strip(): continue - except PSEOF: - raise PDFNoValidXRef('Unexpected EOF - file corrupted?') - if not line: - raise PDFNoValidXRef('Premature eof: %r' % parser) - if line.startswith('trailer'): - parser.seek(pos) - break - f = line.strip().split(' ') - if len(f) != 2: - raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) - try: - (start, nobjs) = map(long, f) - except ValueError: - raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) - for objid in xrange(start, start+nobjs): - try: - (_, line) = parser.nextline() - except PSEOF: - raise PDFNoValidXRef('Unexpected EOF - file corrupted?') - f = line.strip().split(' ') - if len(f) != 3: - raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) - (pos, genno, use) = f - if use != 'n': continue - self.offsets[objid] = (int(genno), long(pos)) - if 1 <= debug: - print >>sys.stderr, 'xref objects:', self.offsets - self.load_trailer(parser) - return - - KEYWORD_TRAILER = KWD('trailer') - def load_trailer(self, parser): - try: - (_,kwd) = parser.nexttoken() - assert kwd is self.KEYWORD_TRAILER - (_,dic) = parser.nextobject() - except PSEOF: - x = parser.pop(1) - if not x: - raise PDFNoValidXRef('Unexpected EOF - file corrupted') - (_,dic) = x[0] - self.trailer.update(dict_value(dic)) - return - - PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') - def load_fallback(self, parser, debug=0): - parser.seek(0) - while 1: - try: - (pos, line) = parser.nextline() - except PSEOF: - break - if line.startswith('trailer'): - parser.seek(pos) - self.load_trailer(parser) - if 1 <= debug: - print >>sys.stderr, 'trailer: %r' % self.get_trailer() - break - m = self.PDFOBJ_CUE.match(line) - if not m: continue - (objid, genno) = m.groups() - self.offsets[int(objid)] = (0, pos) - return - - def get_trailer(self): - return self.trailer - - def get_objids(self): - return self.offsets.iterkeys() - - def get_pos(self, objid): - try: - (genno, pos) = self.offsets[objid] - except KeyError: - raise - return (None, pos) - - -## PDFXRefStream -## -class PDFXRefStream(PDFBaseXRef): - - def __init__(self): - self.data = None - self.entlen = None - self.fl1 = self.fl2 = self.fl3 = None - self.objid_ranges = [] - return - - def __repr__(self): - return '' % (self.fl1, self.fl2, self.fl3) - - def load(self, parser, debug=0): - (_,objid) = parser.nexttoken() # ignored - (_,genno) = parser.nexttoken() # ignored - (_,kwd) = parser.nexttoken() - (_,stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: - raise PDFNoValidXRef('Invalid PDF stream spec.') - size = stream['Size'] - index_array = stream.get('Index', (0,size)) - if len(index_array) % 2 != 0: - raise PDFSyntaxError('Invalid index number') - self.objid_ranges.extend( ObjIdRange(start, nobjs) - for (start,nobjs) in choplist(2, index_array) ) - (self.fl1, self.fl2, self.fl3) = stream['W'] - self.data = stream.get_data() - self.entlen = self.fl1+self.fl2+self.fl3 - self.trailer = stream.attrs - if 1 <= debug: - print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % - (', '.join(map(repr, self.objid_ranges)), - self.fl1, self.fl2, self.fl3)) - return - - def get_trailer(self): - return self.trailer - - def get_objids(self): - for objid_range in self.objid_ranges: - for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1): - yield x - return - - def get_pos(self, objid): - offset = 0 - found = False - for objid_range in self.objid_ranges: - if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id(): - offset += objid - objid_range.get_start_id() - found = True - break - else: - offset += objid_range.get_nobjs() - if not found: raise KeyError(objid) - i = self.entlen * offset - ent = self.data[i:i+self.entlen] - f1 = nunpack(ent[:self.fl1], 1) - if f1 == 1: - pos = nunpack(ent[self.fl1:self.fl1+self.fl2]) - genno = nunpack(ent[self.fl1+self.fl2:]) - return (None, pos) - elif f1 == 2: - objid = nunpack(ent[self.fl1:self.fl1+self.fl2]) - index = nunpack(ent[self.fl1+self.fl2:]) - return (objid, index) - # this is a free object - raise KeyError(objid) - - -## PDFPage -## -class PDFPage(object): - - """An object that holds the information about a page. - - A PDFPage object is merely a convenience class that has a set - of keys and values, which describe the properties of a page - and point to its contents. - - Attributes: - doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. - attrs: a dictionary of page attributes. - contents: a list of PDFStream objects that represents the page content. - lastmod: the last modified time of the page. - resources: a list of resources used by the page. - mediabox: the physical size of the page. - cropbox: the crop rectangle of the page. - rotate: the page rotation (in degree). - annots: the page annotations. - beads: a chain that represents natural reading order. - """ - - def __init__(self, doc, pageid, attrs): - """Initialize a page object. - - doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. - attrs: a dictionary of page attributes. - """ - self.doc = doc - self.pageid = pageid - self.attrs = dict_value(attrs) - self.lastmod = resolve1(self.attrs.get('LastModified')) - self.resources = resolve1(self.attrs['Resources']) - self.mediabox = resolve1(self.attrs['MediaBox']) - if 'CropBox' in self.attrs: - self.cropbox = resolve1(self.attrs['CropBox']) - else: - self.cropbox = self.mediabox - self.rotate = (self.attrs.get('Rotate', 0)+360) % 360 - self.annots = self.attrs.get('Annots') - self.beads = self.attrs.get('B') - if 'Contents' in self.attrs: - contents = resolve1(self.attrs['Contents']) - else: - contents = [] - if not isinstance(contents, list): - contents = [ contents ] - self.contents = contents - return - - def __repr__(self): - return '' % (self.resources, self.mediabox) - - -## PDFDocument -## -class PDFDocument(object): - - """PDFDocument object represents a PDF document. - - Since a PDF file can be very big, normally it is not loaded at - once. So PDF document has to cooperate with a PDF parser in order to - dynamically import the data as processing goes. - - Typical usage: - doc = PDFDocument() - doc.set_parser(parser) - doc.initialize(password) - obj = doc.getobj(objid) - - """ - - debug = 0 - - def __init__(self, caching=True): - self.caching = caching - self.xrefs = [] - self.info = [] - self.catalog = None - self.encryption = None - self.decipher = None - self._parser = None - self._cached_objs = {} - self._parsed_objs = {} - return - - def set_parser(self, parser, fallback=True): - "Set the document to use a given PDFParser object." - if self._parser: return - self._parser = parser - # Retrieve the information of each header that was appended - # (maybe multiple times) at the end of the document. - try: - self.xrefs = parser.read_xref() - except PDFNoValidXRef: - fallback = True - if fallback: - self.xrefs.extend(parser.read_xref(fallback=True)) - for xref in self.xrefs: - trailer = xref.get_trailer() - if not trailer: continue - # If there's an encryption info, remember it. - if 'Encrypt' in trailer: - #assert not self.encryption - self.encryption = (list_value(trailer['ID']), - dict_value(trailer['Encrypt'])) - if 'Info' in trailer: - self.info.append(dict_value(trailer['Info'])) - if 'Root' in trailer: - # Every PDF file must have exactly one /Root dictionary. - self.catalog = dict_value(trailer['Root']) - break - else: - raise PDFSyntaxError('No /Root object! - Is this really a PDF?') - if self.catalog.get('Type') is not LITERAL_CATALOG: - if STRICT: - raise PDFSyntaxError('Catalog not found!') - return - - # initialize(password='') - # Perform the initialization with a given password. - # This step is mandatory even if there's no password associated - # with the document. - PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' - def initialize(self, password=''): - if not self.encryption: - self.is_printable = self.is_modifiable = self.is_extractable = True - return - (docid, param) = self.encryption - if literal_name(param.get('Filter')) != 'Standard': - raise PDFEncryptionError('Unknown filter: param=%r' % param) - V = int_value(param.get('V', 0)) - if not (V == 1 or V == 2): - raise PDFEncryptionError('Unknown algorithm: param=%r' % param) - length = int_value(param.get('Length', 40)) # Key length (bits) - O = str_value(param['O']) - R = int_value(param['R']) # Revision - if 5 <= R: - raise PDFEncryptionError('Unknown revision: %r' % R) - U = str_value(param['U']) - P = int_value(param['P']) - self.is_printable = bool(P & 4) - self.is_modifiable = bool(P & 8) - self.is_extractable = bool(P & 16) - # Algorithm 3.2 - password = (password+self.PASSWORD_PADDING)[:32] # 1 - hash = md5.md5(password) # 2 - hash.update(O) # 3 - hash.update(struct.pack('>sys.stderr, 'getobj: objid=%r' % (objid) - if objid in self._cached_objs: - genno = 0 - obj = self._cached_objs[objid] - else: - for xref in self.xrefs: - try: - (strmid, index) = xref.get_pos(objid) - break - except KeyError: - pass - else: - raise PDFObjectNotFound(objid) - if strmid: - stream = stream_value(self.getobj(strmid)) - if stream.get('Type') is not LITERAL_OBJSTM: - if STRICT: - raise PDFSyntaxError('Not a stream object: %r' % stream) - try: - n = stream['N'] - except KeyError: - if STRICT: - raise PDFSyntaxError('N is not defined: %r' % stream) - n = 0 - if strmid in self._parsed_objs: - objs = self._parsed_objs[strmid] - else: - parser = PDFStreamParser(stream.get_data()) - parser.set_document(self) - objs = [] - try: - while 1: - (_,obj) = parser.nextobject() - objs.append(obj) - except PSEOF: - pass - if self.caching: - self._parsed_objs[strmid] = objs - genno = 0 - i = n*2+index - try: - obj = objs[i] - except IndexError: - raise PDFObjectNotFound(objid) - if isinstance(obj, PDFStream): - obj.set_objid(objid, 0) - else: - self._parser.seek(index) - try: - (_,objid1) = self._parser.nexttoken() # objid - (_,genno) = self._parser.nexttoken() # genno - (_,kwd) = self._parser.nexttoken() - # #### hack around malformed pdf files - #assert objid1 == objid, (objid, objid1) - if objid1 != objid: - x = [] - while kwd is not self.KEYWORD_OBJ: - (_,kwd) = self._parser.nexttoken() - x.append(kwd) - if x: - objid1 = x[-2] - genno = x[-1] - # #### end hack around malformed pdf files - if kwd is not self.KEYWORD_OBJ: - raise PDFSyntaxError('Invalid object spec: offset=%r' % index) - (_,obj) = self._parser.nextobject() - if isinstance(obj, PDFStream): - obj.set_objid(objid, genno) - except PSEOF: - raise PDFObjectNotFound(objid) - if 2 <= self.debug: - print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) - if self.caching: - self._cached_objs[objid] = obj - if self.decipher: - obj = decipher_all(self.decipher, objid, genno, obj) - return obj - - INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) - def get_pages(self): - if not self.xrefs: - raise PDFException('PDFDocument is not initialized') - def search(obj, parent): - if isinstance(obj, int): - objid = obj - tree = dict_value(self.getobj(objid)).copy() - else: - objid = obj.objid - tree = dict_value(obj).copy() - for (k,v) in parent.iteritems(): - if k in self.INHERITABLE_ATTRS and k not in tree: - tree[k] = v - if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: - if 1 <= self.debug: - print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids'] - for c in list_value(tree['Kids']): - for x in search(c, tree): - yield x - elif tree.get('Type') is LITERAL_PAGE: - if 1 <= self.debug: - print >>sys.stderr, 'Page: %r' % tree - yield (objid, tree) - pages = False - if 'Pages' in self.catalog: - for (objid,tree) in search(self.catalog['Pages'], self.catalog): - yield PDFPage(self, objid, tree) - pages = True - if not pages: - # fallback when /Pages is missing. - for xref in self.xrefs: - for objid in xref.get_objids(): - try: - obj = self.getobj(objid) - if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE: - yield PDFPage(self, objid, obj) - except PDFObjectNotFound: - pass - return - - def get_outlines(self): - if 'Outlines' not in self.catalog: - raise PDFNoOutlines - def search(entry, level): - entry = dict_value(entry) - if 'Title' in entry: - if 'A' in entry or 'Dest' in entry: - title = decode_text(str_value(entry['Title'])) - dest = entry.get('Dest') - action = entry.get('A') - se = entry.get('SE') - yield (level, title, dest, action, se) - if 'First' in entry and 'Last' in entry: - for x in search(entry['First'], level+1): - yield x - if 'Next' in entry: - for x in search(entry['Next'], level): - yield x - return - return search(self.catalog['Outlines'], 0) - - def lookup_name(self, cat, key): - try: - names = dict_value(self.catalog['Names']) - except (PDFTypeError, KeyError): - raise KeyError((cat,key)) - # may raise KeyError - d0 = dict_value(names[cat]) - def lookup(d): - if 'Limits' in d: - (k1,k2) = list_value(d['Limits']) - if key < k1 or k2 < key: return None - if 'Names' in d: - objs = list_value(d['Names']) - names = dict(choplist(2, objs)) - return names[key] - if 'Kids' in d: - for c in list_value(d['Kids']): - v = lookup(dict_value(c)) - if v: return v - raise KeyError((cat,key)) - return lookup(d0) - - def get_dest(self, name): - try: - # PDF-1.2 or later - obj = self.lookup_name('Dests', name) - except KeyError: - # PDF-1.1 or prior - if 'Dests' not in self.catalog: - raise PDFDestinationNotFound(name) - d0 = dict_value(self.catalog['Dests']) - if name not in d0: - raise PDFDestinationNotFound(name) - obj = d0[name] - return obj ## PDFParser @@ -704,72 +128,6 @@ class PDFParser(PSStackParser): return - def find_xref(self): - """Internal function used to locate the first XRef.""" - # search the last xref table by scanning the file backwards. - prev = None - for line in self.revreadlines(): - line = line.strip() - if 2 <= self.debug: - print >>sys.stderr, 'find_xref: %r' % line - if line == 'startxref': break - if line: - prev = line - else: - raise PDFNoValidXRef('Unexpected EOF') - if 1 <= self.debug: - print >>sys.stderr, 'xref found: pos=%r' % prev - return long(prev) - - # read xref table - def read_xref_from(self, start, xrefs): - """Reads XRefs from the given location.""" - self.seek(start) - self.reset() - try: - (pos, token) = self.nexttoken() - except PSEOF: - raise PDFNoValidXRef('Unexpected EOF') - if 2 <= self.debug: - print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token) - if isinstance(token, int): - # XRefStream: PDF-1.5 - self.seek(pos) - self.reset() - xref = PDFXRefStream() - xref.load(self, debug=self.debug) - else: - if token is self.KEYWORD_XREF: - self.nextline() - xref = PDFXRef() - xref.load(self, debug=self.debug) - xrefs.append(xref) - trailer = xref.get_trailer() - if 1 <= self.debug: - print >>sys.stderr, 'trailer: %r' % trailer - if 'XRefStm' in trailer: - pos = int_value(trailer['XRefStm']) - self.read_xref_from(pos, xrefs) - if 'Prev' in trailer: - # find previous xref - pos = int_value(trailer['Prev']) - self.read_xref_from(pos, xrefs) - return - - # read xref tables and trailers - def read_xref(self, fallback=False): - """Reads all the XRefs in the PDF file and returns them.""" - xrefs = [] - self.fallback = fallback - if self.fallback: - xref = PDFXRef() - xref.load_fallback(self) - xrefs.append(xref) - else: - pos = self.find_xref() - self.read_xref_from(pos, xrefs) - return xrefs - ## PDFStreamParser ## diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 0612124..7b9ec57 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -8,7 +8,8 @@ # import sys, re from pdfminer.psparser import PSKeyword, PSLiteral -from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdftypes import PDFObjectNotFound from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 2d233f9..9c5aab6 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -1,6 +1,7 @@ #!/usr/bin/env python2 import sys -from pdfminer.pdfparser import PDFDocument, PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfparser import PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfdevice import PDFDevice, TagExtractor from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter