Separated PDFPage to pdfpage.py.
parent
2df67d85ae
commit
f85c374cae
|
@ -32,8 +32,6 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
LITERAL_OBJSTM = LIT('ObjStm')
|
LITERAL_OBJSTM = LIT('ObjStm')
|
||||||
LITERAL_XREF = LIT('XRef')
|
LITERAL_XREF = LIT('XRef')
|
||||||
LITERAL_PAGE = LIT('Page')
|
|
||||||
LITERAL_PAGES = LIT('Pages')
|
|
||||||
LITERAL_CATALOG = LIT('Catalog')
|
LITERAL_CATALOG = LIT('Catalog')
|
||||||
|
|
||||||
|
|
||||||
|
@ -244,63 +242,6 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
raise KeyError(objid)
|
raise KeyError(objid)
|
||||||
|
|
||||||
|
|
||||||
## PDFPage
|
|
||||||
##
|
|
||||||
class PDFPage(object):
|
|
||||||
|
|
||||||
"""An object that holds the information about a page.
|
|
||||||
|
|
||||||
A PDFPage object is merely a convenience class that has a set
|
|
||||||
of keys and values, which describe the properties of a page
|
|
||||||
and point to its contents.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
doc: a PDFDocument object.
|
|
||||||
pageid: any Python object that can uniquely identify the page.
|
|
||||||
attrs: a dictionary of page attributes.
|
|
||||||
contents: a list of PDFStream objects that represents the page content.
|
|
||||||
lastmod: the last modified time of the page.
|
|
||||||
resources: a list of resources used by the page.
|
|
||||||
mediabox: the physical size of the page.
|
|
||||||
cropbox: the crop rectangle of the page.
|
|
||||||
rotate: the page rotation (in degree).
|
|
||||||
annots: the page annotations.
|
|
||||||
beads: a chain that represents natural reading order.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
|
||||||
"""Initialize a page object.
|
|
||||||
|
|
||||||
doc: a PDFDocument object.
|
|
||||||
pageid: any Python object that can uniquely identify the page.
|
|
||||||
attrs: a dictionary of page attributes.
|
|
||||||
"""
|
|
||||||
self.doc = doc
|
|
||||||
self.pageid = pageid
|
|
||||||
self.attrs = dict_value(attrs)
|
|
||||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
|
||||||
self.resources = resolve1(self.attrs['Resources'])
|
|
||||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
|
||||||
if 'CropBox' in self.attrs:
|
|
||||||
self.cropbox = resolve1(self.attrs['CropBox'])
|
|
||||||
else:
|
|
||||||
self.cropbox = self.mediabox
|
|
||||||
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
|
|
||||||
self.annots = self.attrs.get('Annots')
|
|
||||||
self.beads = self.attrs.get('B')
|
|
||||||
if 'Contents' in self.attrs:
|
|
||||||
contents = resolve1(self.attrs['Contents'])
|
|
||||||
else:
|
|
||||||
contents = []
|
|
||||||
if not isinstance(contents, list):
|
|
||||||
contents = [ contents ]
|
|
||||||
self.contents = contents
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
|
||||||
|
|
||||||
|
|
||||||
## PDFDocument
|
## PDFDocument
|
||||||
##
|
##
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
@ -516,47 +457,6 @@ class PDFDocument(object):
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
|
||||||
def get_pages(self):
|
|
||||||
if not self.xrefs:
|
|
||||||
raise PDFException('PDFDocument is not initialized')
|
|
||||||
def search(obj, parent):
|
|
||||||
if isinstance(obj, int):
|
|
||||||
objid = obj
|
|
||||||
tree = dict_value(self.getobj(objid)).copy()
|
|
||||||
else:
|
|
||||||
objid = obj.objid
|
|
||||||
tree = dict_value(obj).copy()
|
|
||||||
for (k,v) in parent.iteritems():
|
|
||||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
|
||||||
tree[k] = v
|
|
||||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
|
|
||||||
for c in list_value(tree['Kids']):
|
|
||||||
for x in search(c, tree):
|
|
||||||
yield x
|
|
||||||
elif tree.get('Type') is LITERAL_PAGE:
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>sys.stderr, 'Page: %r' % tree
|
|
||||||
yield (objid, tree)
|
|
||||||
pages = False
|
|
||||||
if 'Pages' in self.catalog:
|
|
||||||
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
|
|
||||||
yield PDFPage(self, objid, tree)
|
|
||||||
pages = True
|
|
||||||
if not pages:
|
|
||||||
# fallback when /Pages is missing.
|
|
||||||
for xref in self.xrefs:
|
|
||||||
for objid in xref.get_objids():
|
|
||||||
try:
|
|
||||||
obj = self.getobj(objid)
|
|
||||||
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
|
|
||||||
yield PDFPage(self, objid, obj)
|
|
||||||
except PDFObjectNotFound:
|
|
||||||
pass
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_outlines(self):
|
def get_outlines(self):
|
||||||
if 'Outlines' not in self.catalog:
|
if 'Outlines' not in self.catalog:
|
||||||
raise PDFNoOutlines
|
raise PDFNoOutlines
|
||||||
|
|
|
@ -24,6 +24,7 @@ from pdfcolor import PDFColorSpace
|
||||||
from pdfcolor import PREDEFINED_COLORSPACE
|
from pdfcolor import PREDEFINED_COLORSPACE
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||||
from pdfcolor import LITERAL_DEVICE_CMYK
|
from pdfcolor import LITERAL_DEVICE_CMYK
|
||||||
|
from pdfpage import PDFPage
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
from utils import mult_matrix, MATRIX_IDENTITY
|
from utils import mult_matrix, MATRIX_IDENTITY
|
||||||
|
|
||||||
|
@ -824,7 +825,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
|
||||||
# Create a PDF interpreter object.
|
# Create a PDF interpreter object.
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
# Process each page contained in the document.
|
# Process each page contained in the document.
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
if maxpages and maxpages <= pageno+1: break
|
if maxpages and maxpages <= pageno+1: break
|
||||||
|
|
|
@ -0,0 +1,109 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
import sys
|
||||||
|
from psparser import LIT, KWD, STRICT
|
||||||
|
from pdftypes import PDFObjectNotFound
|
||||||
|
from pdftypes import resolve1
|
||||||
|
from pdftypes import int_value, float_value, num_value
|
||||||
|
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
|
|
||||||
|
|
||||||
|
# some predefined literals and keywords.
|
||||||
|
LITERAL_PAGE = LIT('Page')
|
||||||
|
LITERAL_PAGES = LIT('Pages')
|
||||||
|
|
||||||
|
|
||||||
|
## PDFPage
|
||||||
|
##
|
||||||
|
class PDFPage(object):

    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Instances are normally obtained from PDFPage.create_pages(document)
    rather than constructed directly.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resources used by the page (an empty dictionary
        when the page declares none).
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (the mediabox when the
        page has no CropBox entry).
      rotate: the page rotation (in degree), normalized with modulo 360.
      annots: the page annotations, exactly as stored in attrs.
      beads: a chain that represents natural reading order, exactly as
        stored in attrs.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        # A page dictionary may omit Resources; treat that as "no
        # resources" instead of failing the whole page with KeyError.
        self.resources = resolve1(self.attrs.get('Resources', {}))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        # Rotate can be stored as an indirect reference like any other
        # entry, so resolve it before doing arithmetic on it.
        self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # Contents is either a single object or a list of them;
        # always expose a list.
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    # Attributes that a page-tree node passes down to children that do
    # not define them themselves.
    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])

    @classmethod
    def create_pages(klass, document, debug=0):
        """Generate a page object for every page of a document.

        The tree under document.catalog['Pages'] is walked depth-first;
        every node whose Type is Page is yielded as klass(document,
        objid, attrs), with INHERITABLE_ATTRS copied down from its
        ancestors.  When that walk yields nothing, every object id listed
        in document.xrefs is scanned for dictionaries whose Type is Page.

        document: an object providing catalog, xrefs and getobj(objid).
        debug: when 1 or greater, each visited node is reported on stderr.
        """
        def search(obj, parent, ancestors=()):
            # obj is either a bare object id or a reference carrying .objid.
            if isinstance(obj, int):
                objid = obj
            else:
                objid = obj.objid
            if objid in ancestors:
                # A node that lists one of its own ancestors in Kids would
                # otherwise recurse forever; skip the back edge.
                return
            if isinstance(obj, int):
                tree = dict_value(document.getobj(objid)).copy()
            else:
                tree = dict_value(obj).copy()
            # Work on a copy so inherited values never leak into the
            # document's own objects.
            for k in klass.INHERITABLE_ATTRS:
                if k in parent and k not in tree:
                    tree[k] = parent[k]
            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                if 1 <= debug:
                    sys.stderr.write('Pages: Kids=%r\n' % (tree['Kids'],))
                path = ancestors + (objid,)
                for c in list_value(tree['Kids']):
                    for x in search(c, tree, path):
                        yield x
            elif tree.get('Type') is LITERAL_PAGE:
                if 1 <= debug:
                    sys.stderr.write('Page: %r\n' % (tree,))
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            for (objid, tree) in search(document.catalog['Pages'], document.catalog):
                yield klass(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
                            yield klass(document, objid, obj)
                    except PDFObjectNotFound:
                        # Best effort: ignore ids the document cannot
                        # resolve and keep scanning.
                        pass
|
|
@ -12,6 +12,7 @@ from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
||||||
from pdfminer.pdftypes import PDFObjectNotFound
|
from pdfminer.pdftypes import PDFObjectNotFound
|
||||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
||||||
|
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||||
|
@ -112,7 +113,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser)
|
doc = PDFDocument(parser)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
pages = dict( (page.pageid, pageno) for (pageno,page)
|
||||||
|
in enumerate(PDFPage.create_pages(doc)) )
|
||||||
def resolve_dest(dest):
|
def resolve_dest(dest):
|
||||||
if isinstance(dest, str):
|
if isinstance(dest, str):
|
||||||
dest = resolve1(doc.get_dest(dest))
|
dest = resolve1(doc.get_dest(dest))
|
||||||
|
@ -164,7 +166,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
dumpxml(outfp, obj, codec=codec)
|
dumpxml(outfp, obj, codec=codec)
|
||||||
if pagenos:
|
if pagenos:
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
|
||||||
if pageno in pagenos:
|
if pageno in pagenos:
|
||||||
if codec:
|
if codec:
|
||||||
for obj in page.contents:
|
for obj in page.contents:
|
||||||
|
|
Loading…
Reference in New Issue