2016-11-08 19:01:11 +00:00
|
|
|
|
2014-06-14 03:00:49 +00:00
|
|
|
import logging
|
2017-07-20 18:46:35 +00:00
|
|
|
from . import settings
|
2014-06-26 09:12:39 +00:00
|
|
|
from .psparser import LIT
|
|
|
|
from .pdftypes import PDFObjectNotFound
|
|
|
|
from .pdftypes import resolve1
|
|
|
|
from .pdftypes import int_value
|
|
|
|
from .pdftypes import list_value
|
|
|
|
from .pdftypes import dict_value
|
|
|
|
from .pdfparser import PDFParser
|
|
|
|
from .pdfdocument import PDFDocument
|
|
|
|
from .pdfdocument import PDFTextExtractionNotAllowed
|
2013-10-10 10:54:55 +00:00
|
|
|
|
2016-05-20 19:12:05 +00:00
|
|
|
import six # Python 2+3 compatibility
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
2013-10-10 10:54:55 +00:00
|
|
|
|
2014-09-03 13:26:08 +00:00
|
|
|
# Predefined name literals for the two page-tree node types.  They are
# compared by identity (`is`) against a node's 'Type' entry further down,
# so they must be the very objects LIT() hands out for those names
# (presumably LIT interns its results -- confirm in psparser).
|
|
|
|
LITERAL_PAGE = LIT('Page')
|
|
|
|
LITERAL_PAGES = LIT('Pages')
|
|
|
|
|
2013-10-10 10:54:55 +00:00
|
|
|
## PDFPage
|
|
|
|
##
|
|
|
|
class PDFPage(object):
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of the page's content objects (the resolved
        'Contents' entry, wrapped in a list when it is a single object;
        empty when the entry is absent).
      lastmod: the resolved 'LastModified' entry, or None when absent.
      resources: the resolved 'Resources' entry (an empty dict when absent).
      mediabox: the resolved 'MediaBox' entry (required).
      cropbox: the resolved 'CropBox' entry; falls back to mediabox.
      rotate: the page rotation in degrees, normalized into [0, 360).
      annots: the raw (unresolved) 'Annots' entry, or None.
      beads: the raw (unresolved) 'B' entry, or None.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # No explicit crop rectangle: use the media box.
            self.cropbox = self.mediabox
        # Adding 360 before the modulo keeps the result in [0, 360) even
        # for negative rotations.
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # 'Contents' may be a single object or a list; always store a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        """Return a short debug representation with resources and media box."""
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    # Keys that a page-tree node takes over from its parent when the node
    # does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    @classmethod
    def create_pages(klass, document):
        """Yield a page object for every page found in `document`.

        The page tree is walked depth-first starting at the catalog's
        'Pages' entry; inheritable attributes are copied down from parent
        nodes.  If that walk yields no pages (or 'Pages' is missing), every
        object listed in the document's xrefs is scanned for dictionaries
        whose 'Type' is the Page literal.

        document: an object providing `catalog`, `getobj()` and `xrefs`.
        """
        def search(obj, parent, ancestors):
            # `obj` is either an integer object id or a reference object
            # carrying an `objid` attribute.
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            # Copy inheritable attributes the node does not define itself.
            for (k, v) in six.iteritems(parent):
                if k in klass.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                # A malformed file can list a Pages node among its own
                # descendants; without this guard the recursion would
                # never terminate.  Only nodes on the current descent
                # path are tracked, so anything referenced from several
                # places is still visited each time.
                if objid in ancestors:
                    log.warning('Cyclic page tree at object %r; skipping.',
                                objid)
                    return
                log.info('Pages: Kids=%r', tree['Kids'])
                ancestors.add(objid)
                try:
                    for c in list_value(tree['Kids']):
                        for x in search(c, tree, ancestors):
                            yield x
                finally:
                    ancestors.discard(objid)
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            for (objid, tree) in search(document.catalog['Pages'],
                                        document.catalog, set()):
                yield klass(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield klass(document, objid, obj)
                    except PDFObjectNotFound:
                        # The xref lists an object that cannot be loaded;
                        # this scan is best-effort, so skip it.
                        pass

    @classmethod
    def get_pages(klass, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=True):
        """Parse the PDF in `fp` and yield its pages.

        fp: the file object handed to PDFParser.
        pagenos: optional container of zero-based page indices; when it is
          non-empty, pages whose index is not in it are skipped.
        maxpages: when non-zero, iteration stops right after yielding a
          page whose one-based position in the document is >= maxpages.
          The limit is only checked after a page has been yielded.
        password: forwarded to PDFDocument.
        caching: forwarded to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed if
          the document reports that it is not extractable.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction. If not, abort.
        if check_extractable and not doc.is_extractable:
            raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(klass.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break
|