API change: process_pdf -> PDFPage.get_pages

pull/1/head
Yusuke Shinyama 2013-10-22 18:59:16 +09:00
parent 8a70a9f657
commit d3730a29ec
3 changed files with 33 additions and 31 deletions

View File

@ -24,7 +24,6 @@ from pdfcolor import PDFColorSpace
from pdfcolor import PREDEFINED_COLORSPACE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfcolor import LITERAL_DEVICE_CMYK
from pdfpage import PDFPage
from utils import choplist
from utils import mult_matrix, MATRIX_IDENTITY
@ -804,29 +803,3 @@ class PDFPageInterpreter(object):
else:
self.push(obj)
return
## process_pdf
##
class PDFTextExtractionNotAllowed(PDFInterpreterError):
    """Raised when extraction is requested on a document that forbids it."""
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    """Parse the PDF read from *fp* and interpret its selected pages.

    rsrcmgr, device   -- handed unchanged to PDFPageInterpreter.
    fp                -- file object wrapped by PDFParser.
    pagenos           -- container of zero-based page indices to process;
                         a false value (None, empty) selects every page.
    maxpages          -- when non-zero, stop after processing a page whose
                         one-based position in the document is >= maxpages.
                         Pages skipped through *pagenos* never trigger the stop.
    password          -- passed to PDFDocument.initialize ('' when none is set).
    caching           -- forwarded to the PDFDocument constructor.
    check_extractable -- when true, raise PDFTextExtractionNotAllowed if the
                         document reports that it is not extractable.
    """
    # The parser is bound to the file object; the document holds the structure.
    document = PDFDocument(PDFParser(fp), caching=caching)
    document.initialize(password)

    # Refuse to continue when the document itself disallows extraction.
    if check_extractable and not document.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for (index, page) in enumerate(PDFPage.create_pages(document)):
        if (not pagenos) or (index in pagenos):
            interpreter.process_page(page)
            # The page limit is only examined right after a processed page.
            if maxpages and maxpages <= index + 1:
                break

View File

@ -5,7 +5,9 @@ from pdftypes import PDFObjectNotFound
from pdftypes import resolve1
from pdftypes import int_value, float_value, num_value
from pdftypes import str_value, list_value, dict_value, stream_value
from pdfparser import PDFParser
from pdfdocument import PDFDocument
from pdfdocument import PDFEncryptionError
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
@ -107,3 +109,26 @@ class PDFPage(object):
except PDFObjectNotFound:
pass
return
class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Raised when pages are requested from a document that forbids extraction."""
@classmethod
def get_pages(klass, fp,
              pagenos=None, maxpages=0, password='',
              caching=True, check_extractable=True):
    """Generate the selected pages of the PDF read from *fp*.

    This is a generator: parsing, password initialization and the
    extractability check all happen when the first page is requested,
    not when get_pages() is called.

    fp                -- file object wrapped by PDFParser.
    pagenos           -- container of zero-based page indices to yield;
                         a false value (None, empty) selects every page.
    maxpages          -- when non-zero, stop after yielding a page whose
                         one-based position in the document is >= maxpages.
                         Pages skipped through *pagenos* never trigger the stop.
    password          -- passed to PDFDocument.initialize ('' when none is set).
    caching           -- forwarded to the PDFDocument constructor.
    check_extractable -- when true, raise klass.PDFTextExtractionNotAllowed
                         if the document reports that it is not extractable.
    """
    # The parser is bound to the file object; the document holds the structure.
    document = PDFDocument(PDFParser(fp), caching=caching)
    document.initialize(password)

    # Refuse to continue when the document itself disallows extraction.
    if check_extractable and not document.is_extractable:
        raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)

    for (index, page) in enumerate(klass.create_pages(document)):
        if (not pagenos) or (index in pagenos):
            yield page
            # The page limit is only examined right after a yielded page.
            if maxpages and maxpages <= index + 1:
                break

View File

@ -2,8 +2,9 @@
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
@ -96,8 +97,11 @@ def main(argv):
return usage()
for fname in args:
fp = file(fname, 'rb')
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
caching=caching, check_extractable=True)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp, pagenos,
maxpages=maxpages, password=password,
caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
outfp.close()