Separated PDFPage to pdfpage.py.

2013-10-10 19:54:55 +09:00 · 2013-10-10 19:54:55 +09:00 · f85c374cae
parent 2df67d85ae
commit f85c374cae
4 changed files with 115 additions and 103 deletions
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -32,8 +32,6 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass
 # some predefined literals and keywords.
 LITERAL_OBJSTM = LIT('ObjStm')
 LITERAL_XREF = LIT('XRef')
 LITERAL_PAGE = LIT('Page')
 LITERAL_PAGES = LIT('Pages')
 LITERAL_CATALOG = LIT('Catalog')
@ -244,63 +242,6 @@ class PDFXRefStream(PDFBaseXRef):
            raise KeyError(objid)
 ##  PDFPage
 ##
 class PDFPage(object):
    """An object that holds the information about a page.
    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.
    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a list of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """
    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return
    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
 ##  PDFDocument
 ##
 class PDFDocument(object):
@ -516,47 +457,6 @@ class PDFDocument(object):
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj
    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
    def get_pages(self):
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        def search(obj, parent):
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(self.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            for (k,v) in parent.iteritems():
                if k in self.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v
            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                if 1 <= self.debug:
                    print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
            elif tree.get('Type') is LITERAL_PAGE:
                if 1 <= self.debug:
                    print >>sys.stderr, 'Page: %r' % tree
                yield (objid, tree)
        pages = False
        if 'Pages' in self.catalog:
            for (objid,tree) in search(self.catalog['Pages'], self.catalog):
                yield PDFPage(self, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in self.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = self.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield PDFPage(self, objid, obj)
                    except PDFObjectNotFound:
                        pass
        return
    def get_outlines(self):
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -24,6 +24,7 @@ from pdfcolor import PDFColorSpace
 from pdfcolor import PREDEFINED_COLORSPACE
 from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
 from pdfcolor import LITERAL_DEVICE_CMYK
 from pdfpage import PDFPage
 from utils import choplist
 from utils import mult_matrix, MATRIX_IDENTITY
@ -824,7 +825,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
-    for (pageno,page) in enumerate(doc.get_pages()):
+    for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1: break
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -0,0 +1,109 @@
 #!/usr/bin/env python2
 import sys
 from psparser import LIT, KWD, STRICT
 from pdftypes import PDFObjectNotFound
 from pdftypes import resolve1
 from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
 # some predefined literals and keywords.
 LITERAL_PAGE = LIT('Page')
 LITERAL_PAGES = LIT('Pages')
 ##  PDFPage
 ##
 class PDFPage(object):
    """An object that holds the information about a page.
    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.
    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a list of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """
    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return
    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
    @classmethod
    def create_pages(klass, document, debug=0):
        def search(obj, parent):
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            for (k,v) in parent.iteritems():
                if k in klass.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v
            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                if 1 <= debug:
                    print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
            elif tree.get('Type') is LITERAL_PAGE:
                if 1 <= debug:
                    print >>sys.stderr, 'Page: %r' % tree
                yield (objid, tree)
        pages = False
        if 'Pages' in document.catalog:
            for (objid,tree) in search(document.catalog['Pages'], document.catalog):
                yield klass(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield klass(document, objid, obj)
                    except PDFObjectNotFound:
                        pass
        return
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -12,6 +12,7 @@ from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.pdftypes import PDFObjectNotFound
 from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
 from pdfminer.pdfpage import PDFPage
 ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
@ -112,7 +113,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
-    pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
+    pages = dict( (page.pageid, pageno) for (pageno,page)
                  in enumerate(PDFPage.create_pages(doc)) )
    def resolve_dest(dest):
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
@ -164,7 +166,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
-        for (pageno,page) in enumerate(doc.get_pages()):
+        for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents: