Separated PDFPage to pdfpage.py.

pull/1/head
Yusuke Shinyama 2013-10-10 19:54:55 +09:00
parent 2df67d85ae
commit f85c374cae
4 changed files with 115 additions and 103 deletions

View File

@ -32,8 +32,6 @@ class PDFPasswordIncorrect(PDFEncryptionError): pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
LITERAL_CATALOG = LIT('Catalog')
@ -244,63 +242,6 @@ class PDFXRefStream(PDFBaseXRef):
raise KeyError(objid)
## PDFPage
##
class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes:
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
lastmod: the last modified time of the page.
resources: a list of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
"""
def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources = resolve1(self.attrs['Resources'])
self.mediabox = resolve1(self.attrs['MediaBox'])
if 'CropBox' in self.attrs:
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents
return
def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
## PDFDocument
##
class PDFDocument(object):
@ -516,47 +457,6 @@ class PDFDocument(object):
obj = decipher_all(self.decipher, objid, genno, obj)
return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self):
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
def search(obj, parent):
if isinstance(obj, int):
objid = obj
tree = dict_value(self.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= self.debug:
print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug:
print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree)
pages = False
if 'Pages' in self.catalog:
for (objid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, objid, tree)
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in self.xrefs:
for objid in xref.get_objids():
try:
obj = self.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
yield PDFPage(self, objid, obj)
except PDFObjectNotFound:
pass
return
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines

View File

@ -24,6 +24,7 @@ from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_CMYK
from pdfpage import PDFPage
from utils import choplist
from utils import mult_matrix, MATRIX_IDENTITY
@ -824,7 +825,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for (pageno,page) in enumerate(doc.get_pages()):
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break

109
pdfminer/pdfpage.py Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python2
import sys
from psparser import LIT, KWD, STRICT
from pdftypes import PDFObjectNotFound
from pdftypes import resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
## PDFPage
##
class PDFPage(object):
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes:
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
contents: a list of PDFStream objects that represents the page content.
lastmod: the last modified time of the page.
resources: a list of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
"""
def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources = resolve1(self.attrs['Resources'])
self.mediabox = resolve1(self.attrs['MediaBox'])
if 'CropBox' in self.attrs:
self.cropbox = resolve1(self.attrs['CropBox'])
else:
self.cropbox = self.mediabox
self.rotate = (self.attrs.get('Rotate', 0)+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents
return
def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod
def create_pages(klass, document, debug=0):
def search(obj, parent):
if isinstance(obj, int):
objid = obj
tree = dict_value(document.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= debug:
print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type') is LITERAL_PAGE:
if 1 <= debug:
print >>sys.stderr, 'Page: %r' % tree
yield (objid, tree)
pages = False
if 'Pages' in document.catalog:
for (objid,tree) in search(document.catalog['Pages'], document.catalog):
yield klass(document, objid, tree)
pages = True
if not pages:
# fallback when /Pages is missing.
for xref in document.xrefs:
for objid in xref.get_objids():
try:
obj = document.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
yield klass(document, objid, obj)
except PDFObjectNotFound:
pass
return

View File

@ -12,6 +12,7 @@ from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.pdfpage import PDFPage
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
@ -112,7 +113,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
parser = PDFParser(fp)
doc = PDFDocument(parser)
doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
pages = dict( (page.pageid, pageno) for (pageno,page)
in enumerate(PDFPage.create_pages(doc)) )
def resolve_dest(dest):
if isinstance(dest, str):
dest = resolve1(doc.get_dest(dest))
@ -164,7 +166,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
obj = doc.getobj(objid)
dumpxml(outfp, obj, codec=codec)
if pagenos:
for (pageno,page) in enumerate(doc.get_pages()):
for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
if pageno in pagenos:
if codec:
for obj in page.contents: