pdfminer.six/pdfminer/pdfpage.py


import logging
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed

import six  # Python 2+3 compatibility

log = logging.getLogger(__name__)

# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')

##  PDFPage
##
class PDFPage(object):

    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resources used by the page (an empty dictionary
        when the page has no 'Resources' entry).
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (same as mediabox when
        the page has no 'CropBox' entry).
      rotate: the page rotation (in degree), normalized into 0..359.
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    # Attributes that a page takes over from its parent node in the
    # page tree when it does not define them itself.
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        The 'MediaBox' entry is looked up unconditionally, so attrs
        (after inheritance has been applied by the caller) must
        contain it.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        # Annots and B are stored as found, without resolving them.
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content stream is wrapped so that self.contents
        # is always a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    @classmethod
    def create_pages(klass, document):
        """Generate a PDFPage for every page found in the document.

        The page tree referenced by the catalog's 'Pages' entry is
        walked depth-first.  Each node receives the INHERITABLE_ATTRS
        of its parent that it does not define itself.  When that walk
        produces no page at all, every object listed in the document's
        xrefs is inspected instead and those whose 'Type' is Page are
        yielded.

        document: an object providing catalog, getobj() and xrefs.
        """
        def search(obj, parent, ancestors):
            # ancestors holds the object ids of the Pages nodes on the
            # path from the root down to (but excluding) this node.
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            if objid in ancestors:
                # A node that lists itself or one of its ancestors as
                # a kid would otherwise recurse without end.
                log.warning('Ignoring cyclic reference in page tree: '
                            'objid=%r', objid)
                return
            for (k, v) in six.iteritems(parent):
                if k in klass.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                path = ancestors | {objid}
                for c in list_value(tree['Kids']):
                    for x in search(c, tree, path):
                        yield x
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            root = document.catalog['Pages']
            for (objid, tree) in search(root, document.catalog, frozenset()):
                yield klass(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield klass(document, objid, obj)
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(klass, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=True):
        """Parse a PDF file object and generate its PDFPage objects.

        fp: the file object handed to PDFParser.
        pagenos: a container of zero-based page indices to yield;
          when empty or None every page is yielded.
        maxpages: when non-zero, iteration stops right after yielding
          a page whose position in the document (counting from 1) is
          at least maxpages.  Pages skipped through pagenos never
          trigger this check.
        password: passed on to PDFDocument.
        caching: passed on to PDFDocument.
        check_extractable: when true, raise
          PDFTextExtractionNotAllowed if the document reports that it
          is not extractable.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction. If not, abort.
        if check_extractable and not doc.is_extractable:
            raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(klass.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break
Removing all the "#!/usr/bin/env python" lines, they do not need for … (#34) * Removing all the "#!/usr/bin/env python" lines, they do not need for python3, solving issue number: #19. * Restored all the shebangs in the tools and tests folders (because they are real executables) but used "#!/usr/bin/env python" instead of "#!/usr/bin/python" as this blog points out: https://www.peterbe.com/plog/importance-of-env Removed also the shebang from pdfminer/psparser.py file. 2016-11-08 19:01:11 +00:00
Use logging module instead of print. 2014-06-14 03:00:49 +00:00			`import logging`
Fixes #64 -- be less strict when inspecting a tree type (#76) In the PDFStream it's possible that the /Type element is not present, but /type is. According to the spec, these are different elements, but in the case in point they had the same meaning. If PDFMiner is not running in STRICT mode and /Type doesn't resolve, a fallback to /type is used to determine the tree type. 2017-07-20 18:46:35 +00:00			`from . import settings`
Cleanup imports. Use relative imports. 2014-06-26 09:12:39 +00:00			`from .psparser import LIT`
			`from .pdftypes import PDFObjectNotFound`
			`from .pdftypes import resolve1`
			`from .pdftypes import int_value`
			`from .pdftypes import list_value`
			`from .pdftypes import dict_value`
			`from .pdfparser import PDFParser`
			`from .pdfdocument import PDFDocument`
			`from .pdfdocument import PDFTextExtractionNotAllowed`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00
Make the logger run in a namespace. 2016-05-20 19:12:05 +00:00			`import six # Python 2+3 compatibility`

			`log = logging.getLogger(__name__)`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00
Python 3.4 support and tests 2014-09-03 13:26:08 +00:00			`# some predefined literals and keywords.`
			`LITERAL_PAGE = LIT('Page')`
			`LITERAL_PAGES = LIT('Pages')`

Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`## PDFPage`
			`##`
			`class PDFPage(object):`

			`"""An object that holds the information about a page.`

			`A PDFPage object is merely a convenience class that has a set`
			`of keys and values, which describe the properties of a page`
			`and point to its contents.`

			`Attributes:`
			`doc: a PDFDocument object.`
			`pageid: any Python object that can uniquely identify the page.`
			`attrs: a dictionary of page attributes.`
			`contents: a list of PDFStream objects that represents the page content.`
			`lastmod: the last modified time of the page.`
			`resources: a list of resources used by the page.`
			`mediabox: the physical size of the page.`
			`cropbox: the crop rectangle of the page.`
			`rotate: the page rotation (in degree).`
			`annots: the page annotations.`
			`beads: a chain that represents natural reading order.`
			`"""`

			`def __init__(self, doc, pageid, attrs):`
			`"""Initialize a page object.`
PEP8: Remove trailing whitespace 2013-11-07 07:14:53 +00:00
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`doc: a PDFDocument object.`
			`pageid: any Python object that can uniquely identify the page.`
			`attrs: a dictionary of page attributes.`
			`"""`
			`self.doc = doc`
			`self.pageid = pageid`
			`self.attrs = dict_value(attrs)`
			`self.lastmod = resolve1(self.attrs.get('LastModified'))`
Fixed: #56 (with a derpy fix) 2014-06-18 10:11:45 +00:00			`self.resources = resolve1(self.attrs.get('Resources', dict()))`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`self.mediabox = resolve1(self.attrs['MediaBox'])`
			`if 'CropBox' in self.attrs:`
			`self.cropbox = resolve1(self.attrs['CropBox'])`
			`else:`
			`self.cropbox = self.mediabox`
fixed: https://github.com/euske/pdfminer/issues/26 2013-10-17 14:20:08 +00:00			`self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`self.annots = self.attrs.get('Annots')`
			`self.beads = self.attrs.get('B')`
			`if 'Contents' in self.attrs:`
			`contents = resolve1(self.attrs['Contents'])`
			`else:`
			`contents = []`
			`if not isinstance(contents, list):`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`contents = [contents]`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`self.contents = contents`
			`return`

			`def __repr__(self):`
			`return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)`

			`INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`@classmethod`
Code cleanup: removed some debug flags. 2014-06-14 06:43:10 +00:00			`def create_pages(klass, document):`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`def search(obj, parent):`
			`if isinstance(obj, int):`
			`objid = obj`
			`tree = dict_value(document.getobj(objid)).copy()`
			`else:`
			`objid = obj.objid`
			`tree = dict_value(obj).copy()`
tests pass under Py 2.7 and 3.4 2014-09-01 12:16:49 +00:00			`for (k, v) in six.iteritems(parent):`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`if k in klass.INHERITABLE_ATTRS and k not in tree:`
			`tree[k] = v`
Fixes #64 -- be less strict when inspecting a tree type (#76) In the PDFStream it's possible that the /Type element is not present, but /type is. According to the spec, these are different elements, but in the case in point they had the same meaning. If PDFMiner is not running in STRICT mode and /Type doesn't resolve, a fallback to /type is used to determine the tree type. 2017-07-20 18:46:35 +00:00
			`tree_type = tree.get('Type')`
			`if tree_type is None and not settings.STRICT: # See #64`
			`tree_type = tree.get('type')`

			`if tree_type is LITERAL_PAGES and 'Kids' in tree:`
Make the logger run in a namespace. 2016-05-20 19:12:05 +00:00			`log.info('Pages: Kids=%r', tree['Kids'])`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`for c in list_value(tree['Kids']):`
			`for x in search(c, tree):`
			`yield x`
Fixes #64 -- be less strict when inspecting a tree type (#76) In the PDFStream it's possible that the /Type element is not present, but /type is. According to the spec, these are different elements, but in the case in point they had the same meaning. If PDFMiner is not running in STRICT mode and /Type doesn't resolve, a fallback to /type is used to determine the tree type. 2017-07-20 18:46:35 +00:00			`elif tree_type is LITERAL_PAGE:`
Make the logger run in a namespace. 2016-05-20 19:12:05 +00:00			`log.info('Page: %r', tree)`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`yield (objid, tree)`
			`pages = False`
			`if 'Pages' in document.catalog:`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`for (objid, tree) in search(document.catalog['Pages'], document.catalog):`
Separated PDFPage to pdfpage.py. 2013-10-10 10:54:55 +00:00			`yield klass(document, objid, tree)`
			`pages = True`
			`if not pages:`
			`# fallback when /Pages is missing.`
			`for xref in document.xrefs:`
			`for objid in xref.get_objids():`
			`try:`
			`obj = document.getobj(objid)`
			`if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:`
			`yield klass(document, objid, obj)`
			`except PDFObjectNotFound:`
			`pass`
			`return`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00
			`@classmethod`
			`def get_pages(klass, fp,`
keep password api unicode, latin1 or utf-8 is encoded in handler 2014-09-16 20:58:25 +00:00			`pagenos=None, maxpages=0, password='',`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00			`caching=True, check_extractable=True):`
			`# Create a PDF parser object associated with the file object.`
			`parser = PDFParser(fp)`
			`# Create a PDF document object that stores the document structure.`
Applied a patch by Axel Kaiser. 2014-03-24 11:39:30 +00:00			`doc = PDFDocument(parser, password=password, caching=caching)`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00			`# Check if the document allows text extraction. If not, abort.`
			`if check_extractable and not doc.is_extractable:`
Applied a patch by Axel Kaiser. 2014-03-24 11:39:30 +00:00			`raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00			`# Process each page contained in the document.`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`for (pageno, page) in enumerate(klass.create_pages(doc)):`
			`if pagenos and (pageno not in pagenos):`
			`continue`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00			`yield page`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`if maxpages and maxpages <= pageno+1:`
			`break`
API change: process_pdf -> PDFPage.get_pages 2013-10-22 09:59:16 +00:00			`return`