Separated PDFPage to pdfpage.py.
parent
2df67d85ae
commit
f85c374cae
|
@ -32,8 +32,6 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass
|
|||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = LIT('ObjStm')
|
||||
LITERAL_XREF = LIT('XRef')
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
LITERAL_CATALOG = LIT('Catalog')
|
||||
|
||||
|
||||
|
@ -244,63 +242,6 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
raise KeyError(objid)
|
||||
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):

    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resolved /Resources entry of the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (falls back to mediabox).
      rotate: the page rotation (in degree), normalized with "% 360".
      annots: the page annotations (the raw /Annots entry, not resolved).
      beads: a chain that represents natural reading order
             (the raw /B entry, not resolved).
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'Resources' or 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # Without an explicit crop rectangle the whole media box is used.
            self.cropbox = self.mediabox
        # /Rotate may be given as an indirect reference like any other page
        # attribute, so pass it through resolve1 before doing arithmetic;
        # adding a plain int to an unresolved reference would fail.
        self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # /Contents is either a single stream or a list of streams;
        # always expose it as a list.
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
|
||||
## PDFDocument
|
||||
##
|
||||
class PDFDocument(object):
|
||||
|
@ -516,47 +457,6 @@ class PDFDocument(object):
|
|||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
return obj
|
||||
|
||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||
def get_pages(self):
    """Generate a PDFPage object for every page in the document.

    The page tree is walked depth-first starting at the catalog's
    'Pages' entry.  Attributes listed in INHERITABLE_ATTRS are copied
    from a parent node into each child that does not define them.
    When that walk produces no page at all, every object id listed in
    the xrefs is scanned for dictionaries whose Type is Page.

    Raises PDFException (on first iteration) when the document has
    no xrefs, i.e. it has not been initialized.
    """
    if not self.xrefs:
        raise PDFException('PDFDocument is not initialized')

    def walk(node, inherited):
        # A node is either a bare object id or an object carrying .objid.
        if not isinstance(node, int):
            node_id = node.objid
            attrs = dict_value(node).copy()
        else:
            node_id = node
            attrs = dict_value(self.getobj(node_id)).copy()
        # Fill in inheritable attributes the node itself does not set.
        for (key, value) in inherited.iteritems():
            if key not in attrs and key in self.INHERITABLE_ATTRS:
                attrs[key] = value
        node_type = attrs.get('Type')
        if node_type is LITERAL_PAGES and 'Kids' in attrs:
            if self.debug >= 1:
                print >>sys.stderr, 'Pages: Kids=%r' % attrs['Kids']
            for kid in list_value(attrs['Kids']):
                for found in walk(kid, attrs):
                    yield found
        elif node_type is LITERAL_PAGE:
            if self.debug >= 1:
                print >>sys.stderr, 'Page: %r' % attrs
            yield (node_id, attrs)

    seen_any = False
    if 'Pages' in self.catalog:
        for (page_id, page_attrs) in walk(self.catalog['Pages'], self.catalog):
            yield PDFPage(self, page_id, page_attrs)
            seen_any = True
    if seen_any:
        return
    # fallback when /Pages is missing.
    for xref in self.xrefs:
        for candidate_id in xref.get_objids():
            try:
                candidate = self.getobj(candidate_id)
                if isinstance(candidate, dict) and candidate.get('Type') is LITERAL_PAGE:
                    yield PDFPage(self, candidate_id, candidate)
            except PDFObjectNotFound:
                # Best effort: skip ids that cannot be resolved.
                pass
|
||||
|
||||
def get_outlines(self):
|
||||
if 'Outlines' not in self.catalog:
|
||||
raise PDFNoOutlines
|
||||
|
|
|
@ -24,6 +24,7 @@ from pdfcolor import PDFColorSpace
|
|||
from pdfcolor import PREDEFINED_COLORSPACE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||
from pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from pdfpage import PDFPage
|
||||
from utils import choplist
|
||||
from utils import mult_matrix, MATRIX_IDENTITY
|
||||
|
||||
|
@ -824,7 +825,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
|
|||
# Create a PDF interpreter object.
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
# Process each page contained in the document.
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
interpreter.process_page(page)
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
#!/usr/bin/env python2
|
||||
import sys
|
||||
from psparser import LIT, KWD, STRICT
|
||||
from pdftypes import PDFObjectNotFound
|
||||
from pdftypes import resolve1
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):

    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resolved /Resources entry of the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (falls back to mediabox).
      rotate: the page rotation (in degree), normalized with "% 360".
      annots: the page annotations (the raw /Annots entry, not resolved).
      beads: a chain that represents natural reading order
             (the raw /B entry, not resolved).
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'Resources' or 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # Without an explicit crop rectangle the whole media box is used.
            self.cropbox = self.mediabox
        # /Rotate may be given as an indirect reference like any other page
        # attribute, so pass it through resolve1 before doing arithmetic;
        # adding a plain int to an unresolved reference would fail.
        self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # /Contents is either a single stream or a list of streams;
        # always expose it as a list.
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    # Attributes a page node takes over from its parent when it does not
    # define them itself.
    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])

    @classmethod
    def create_pages(klass, document, debug=0):
        """Generate a page object for every page of a document.

        document: an object providing catalog, xrefs and getobj().
        debug: when 1 or greater, each visited tree node is reported
               on sys.stderr.

        The page tree is walked depth-first starting at the catalog's
        'Pages' entry.  Attributes listed in INHERITABLE_ATTRS are copied
        from a parent node into each child that does not define them.
        When that walk produces no page at all, every object id listed
        in document.xrefs is scanned for dictionaries whose Type is Page.
        """
        def search(obj, parent):
            # obj is either a bare object id or an object carrying .objid.
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()
            # Fill in inheritable attributes the node itself does not set.
            for (k,v) in parent.iteritems():
                if k in klass.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v
            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                if 1 <= debug:
                    print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
            elif tree.get('Type') is LITERAL_PAGE:
                if 1 <= debug:
                    print >>sys.stderr, 'Page: %r' % tree
                yield (objid, tree)
        pages = False
        if 'Pages' in document.catalog:
            for (objid,tree) in search(document.catalog['Pages'], document.catalog):
                yield klass(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield klass(document, objid, obj)
                    except PDFObjectNotFound:
                        # Best effort: skip ids that cannot be resolved.
                        pass
        return
|
|
@ -12,6 +12,7 @@ from pdfminer.pdfparser import PDFParser
|
|||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
||||
from pdfminer.pdftypes import PDFObjectNotFound
|
||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
||||
|
||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||
|
@ -112,7 +113,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
doc.initialize(password)
|
||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||
pages = dict( (page.pageid, pageno) for (pageno,page)
|
||||
in enumerate(PDFPage.create_pages(doc)) )
|
||||
def resolve_dest(dest):
|
||||
if isinstance(dest, str):
|
||||
dest = resolve1(doc.get_dest(dest))
|
||||
|
@ -164,7 +166,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
|||
obj = doc.getobj(objid)
|
||||
dumpxml(outfp, obj, codec=codec)
|
||||
if pagenos:
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
|
||||
if pageno in pagenos:
|
||||
if codec:
|
||||
for obj in page.contents:
|
||||
|
|
Loading…
Reference in New Issue