Separated PDFPage to pdfpage.py.

2013-10-10 19:54:55 +09:00 · 2013-10-10 19:54:55 +09:00 · f85c374cae
parent 2df67d85ae
commit f85c374cae
4 changed files with 115 additions and 103 deletions
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -32,8 +32,6 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass
 # some predefined literals and keywords.
 LITERAL_OBJSTM = LIT('ObjStm')
 LITERAL_XREF = LIT('XRef')
-LITERAL_PAGE = LIT('Page')
-LITERAL_PAGES = LIT('Pages')
 LITERAL_CATALOG = LIT('Catalog')


@@ -244,63 +242,6 @@ class PDFXRefStream(PDFBaseXRef):
            raise KeyError(objid)


-##  PDFPage
-##
-class PDFPage(object):
-
-    """An object that holds the information about a page.
-
-    A PDFPage object is merely a convenience class that has a set
-    of keys and values, which describe the properties of a page
-    and point to its contents.
-
-    Attributes:
-      doc: a PDFDocument object.
-      pageid: any Python object that can uniquely identify the page.
-      attrs: a dictionary of page attributes.
-      contents: a list of PDFStream objects that represents the page content.
-      lastmod: the last modified time of the page.
-      resources: a list of resources used by the page.
-      mediabox: the physical size of the page.
-      cropbox: the crop rectangle of the page.
-      rotate: the page rotation (in degree).
-      annots: the page annotations.
-      beads: a chain that represents natural reading order.
-    """
-
-    def __init__(self, doc, pageid, attrs):
-        """Initialize a page object.
-        
-        doc: a PDFDocument object.
-        pageid: any Python object that can uniquely identify the page.
-        attrs: a dictionary of page attributes.
-        """
-        self.doc = doc
-        self.pageid = pageid
-        self.attrs = dict_value(attrs)
-        self.lastmod = resolve1(self.attrs.get('LastModified'))
-        self.resources = resolve1(self.attrs['Resources'])
-        self.mediabox = resolve1(self.attrs['MediaBox'])
-        if 'CropBox' in self.attrs:
-            self.cropbox = resolve1(self.attrs['CropBox'])
-        else:
-            self.cropbox = self.mediabox
-        self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
-        self.annots = self.attrs.get('Annots')
-        self.beads = self.attrs.get('B')
-        if 'Contents' in self.attrs:
-            contents = resolve1(self.attrs['Contents'])
-        else:
-            contents = []
-        if not isinstance(contents, list):
-            contents = [ contents ]
-        self.contents = contents
-        return
-
-    def __repr__(self):
-        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
-
-
 ##  PDFDocument
 ##
 class PDFDocument(object):
@@ -516,47 +457,6 @@ class PDFDocument(object):
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj

-    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
-    def get_pages(self):
-        if not self.xrefs:
-            raise PDFException('PDFDocument is not initialized')
-        def search(obj, parent):
-            if isinstance(obj, int):
-                objid = obj
-                tree = dict_value(self.getobj(objid)).copy()
-            else:
-                objid = obj.objid
-                tree = dict_value(obj).copy()
-            for (k,v) in parent.iteritems():
-                if k in self.INHERITABLE_ATTRS and k not in tree:
-                    tree[k] = v
-            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
-                if 1 <= self.debug:
-                    print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
-                for c in list_value(tree['Kids']):
-                    for x in search(c, tree):
-                        yield x
-            elif tree.get('Type') is LITERAL_PAGE:
-                if 1 <= self.debug:
-                    print >>sys.stderr, 'Page: %r' % tree
-                yield (objid, tree)
-        pages = False
-        if 'Pages' in self.catalog:
-            for (objid,tree) in search(self.catalog['Pages'], self.catalog):
-                yield PDFPage(self, objid, tree)
-                pages = True
-        if not pages:
-            # fallback when /Pages is missing.
-            for xref in self.xrefs:
-                for objid in xref.get_objids():
-                    try:
-                        obj = self.getobj(objid)
-                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
-                            yield PDFPage(self, objid, obj)
-                    except PDFObjectNotFound:
-                        pass
-        return
-
    def get_outlines(self):
        if 'Outlines' not in self.catalog:
            raise PDFNoOutlines
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -24,6 +24,7 @@ from pdfcolor import PDFColorSpace
 from pdfcolor import PREDEFINED_COLORSPACE
 from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
 from pdfcolor import LITERAL_DEVICE_CMYK
+from pdfpage import PDFPage
 from utils import choplist
 from utils import mult_matrix, MATRIX_IDENTITY

@@ -824,7 +825,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
-    for (pageno,page) in enumerate(doc.get_pages()):
+    for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1: break
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python2
+import sys
+from psparser import LIT, KWD, STRICT
+from pdftypes import PDFObjectNotFound
+from pdftypes import resolve1
+from pdftypes import int_value, float_value, num_value
+from pdftypes import str_value, list_value, dict_value, stream_value
+
+
+# some predefined literals and keywords.
+LITERAL_PAGE = LIT('Page')
+LITERAL_PAGES = LIT('Pages')
+
+
+##  PDFPage
+##
+class PDFPage(object):
+
+    """An object that holds the information about a page.
+
+    A PDFPage object is merely a convenience class that has a set
+    of keys and values, which describe the properties of a page
+    and point to its contents.
+
+    Attributes:
+      doc: a PDFDocument object.
+      pageid: any Python object that can uniquely identify the page.
+      attrs: a dictionary of page attributes.
+      contents: a list of PDFStream objects that represents the page content.
+      lastmod: the last modified time of the page.
+      resources: a dictionary of resources used by the page.
+      mediabox: the physical size of the page.
+      cropbox: the crop rectangle of the page.
+      rotate: the page rotation (in degrees).
+      annots: the page annotations.
+      beads: a chain that represents natural reading order.
+    """
+
+    def __init__(self, doc, pageid, attrs):
+        """Initialize a page object.
+        
+        doc: a PDFDocument object.
+        pageid: any Python object that can uniquely identify the page.
+        attrs: a dictionary of page attributes.
+        """
+        self.doc = doc
+        self.pageid = pageid
+        self.attrs = dict_value(attrs)
+        self.lastmod = resolve1(self.attrs.get('LastModified'))
+        self.resources = resolve1(self.attrs['Resources'])
+        self.mediabox = resolve1(self.attrs['MediaBox'])
+        if 'CropBox' in self.attrs:
+            self.cropbox = resolve1(self.attrs['CropBox'])
+        else:
+            self.cropbox = self.mediabox
+        self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
+        self.annots = self.attrs.get('Annots')
+        self.beads = self.attrs.get('B')
+        if 'Contents' in self.attrs:
+            contents = resolve1(self.attrs['Contents'])
+        else:
+            contents = []
+        if not isinstance(contents, list):
+            contents = [ contents ]
+        self.contents = contents
+        return
+
+    def __repr__(self):
+        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
+
+    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
+    @classmethod
+    def create_pages(klass, document, debug=0):
+        def search(obj, parent):
+            if isinstance(obj, int):
+                objid = obj
+                tree = dict_value(document.getobj(objid)).copy()
+            else:
+                objid = obj.objid
+                tree = dict_value(obj).copy()
+            for (k,v) in parent.iteritems():
+                if k in klass.INHERITABLE_ATTRS and k not in tree:
+                    tree[k] = v
+            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
+                if 1 <= debug:
+                    print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
+                for c in list_value(tree['Kids']):
+                    for x in search(c, tree):
+                        yield x
+            elif tree.get('Type') is LITERAL_PAGE:
+                if 1 <= debug:
+                    print >>sys.stderr, 'Page: %r' % tree
+                yield (objid, tree)
+        pages = False
+        if 'Pages' in document.catalog:
+            for (objid,tree) in search(document.catalog['Pages'], document.catalog):
+                yield klass(document, objid, tree)
+                pages = True
+        if not pages:
+            # fallback when /Pages is missing.
+            for xref in document.xrefs:
+                for objid in xref.get_objids():
+                    try:
+                        obj = document.getobj(objid)
+                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
+                            yield klass(document, objid, obj)
+                    except PDFObjectNotFound:
+                        pass
+        return
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -12,6 +12,7 @@ from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.pdftypes import PDFObjectNotFound
 from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
+from pdfminer.pdfpage import PDFPage


 ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
@@ -112,7 +113,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
-    pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
+    pages = dict( (page.pageid, pageno) for (pageno,page)
+                  in enumerate(PDFPage.create_pages(doc)) )
    def resolve_dest(dest):
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
@@ -164,7 +166,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
-        for (pageno,page) in enumerate(doc.get_pages()):
+        for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents: