API change: process_pdf -> PDFPage.get_pages
parent
8a70a9f657
commit
d3730a29ec
|
@ -24,7 +24,6 @@ from pdfcolor import PDFColorSpace
|
|||
from pdfcolor import PREDEFINED_COLORSPACE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||
from pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from pdfpage import PDFPage
|
||||
from utils import choplist
|
||||
from utils import mult_matrix, MATRIX_IDENTITY
|
||||
|
||||
|
@ -804,29 +803,3 @@ class PDFPageInterpreter(object):
|
|||
else:
|
||||
self.push(obj)
|
||||
return
|
||||
|
||||
|
||||
## process_pdf
|
||||
##
|
||||
class PDFTextExtractionNotAllowed(PDFInterpreterError):
    """Raised by process_pdf when the document forbids text extraction."""
    pass
|
||||
|
||||
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    """Parse the PDF read from *fp* and run every selected page through
    a PDFPageInterpreter built from *rsrcmgr* and *device*.

    rsrcmgr, device   -- passed straight to PDFPageInterpreter.
    fp                -- file object handed to PDFParser.
    pagenos           -- container of zero-based page indices to process;
                         a false value (None, empty) selects every page.
    maxpages          -- stop once a processed page's 1-based position in
                         the document reaches this value; 0 means no limit.
    password          -- passed to PDFDocument.initialize ('' if none).
    caching           -- forwarded to PDFDocument.
    check_extractable -- when true, raise PDFTextExtractionNotAllowed if
                         the document reports it is not extractable.
    """
    # Build the document on top of a parser bound to the file object, then
    # supply the password (an empty string when none is set).
    document = PDFDocument(PDFParser(fp), caching=caching)
    document.initialize(password)

    # Abort early if extraction is disallowed and the caller asked us to care.
    if check_extractable:
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % fp)

    interp = PDFPageInterpreter(rsrcmgr, device)
    for index, page in enumerate(PDFPage.create_pages(document)):
        if not pagenos or index in pagenos:
            interp.process_page(page)
            # The limit is only tested after a page was actually processed,
            # and compares against the page's position in the document.
            if maxpages and maxpages <= index + 1:
                break
    return
|
||||
|
|
|
@ -5,7 +5,9 @@ from pdftypes import PDFObjectNotFound
|
|||
from pdftypes import resolve1
|
||||
from pdftypes import int_value, float_value, num_value
|
||||
from pdftypes import str_value, list_value, dict_value, stream_value
|
||||
|
||||
from pdfparser import PDFParser
|
||||
from pdfdocument import PDFDocument
|
||||
from pdfdocument import PDFEncryptionError
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
|
@ -107,3 +109,26 @@ class PDFPage(object):
|
|||
except PDFObjectNotFound:
|
||||
pass
|
||||
return
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
    """Raised by get_pages when the document forbids text extraction."""
    pass
|
||||
|
||||
@classmethod
def get_pages(klass, fp,
              pagenos=None, maxpages=0, password='',
              caching=True, check_extractable=True):
    """Generate the selected pages of the PDF read from *fp*.

    fp                -- file object handed to PDFParser.
    pagenos           -- container of zero-based page indices to yield;
                         a false value (None, empty) selects every page.
    maxpages          -- stop once a yielded page's 1-based position in
                         the document reaches this value; 0 means no limit.
    password          -- passed to PDFDocument.initialize ('' if none).
    caching           -- forwarded to PDFDocument.
    check_extractable -- when true, raise klass.PDFTextExtractionNotAllowed
                         if the document reports it is not extractable.

    Being a generator, nothing (including the extractability check) runs
    until the first item is requested.
    """
    # Build the document on top of a parser bound to the file object, then
    # supply the password (an empty string when none is set).
    document = PDFDocument(PDFParser(fp), caching=caching)
    document.initialize(password)

    # Abort if extraction is disallowed and the caller asked us to care.
    if check_extractable:
        if not document.is_extractable:
            raise klass.PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % fp)

    for index, page in enumerate(klass.create_pages(document)):
        if not pagenos or index in pagenos:
            yield page
            # The limit is only tested after a page was actually yielded,
            # and compares against the page's position in the document.
            if maxpages and maxpages <= index + 1:
                break
    return
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
import sys
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||
from pdfminer.cmapdb import CMapDB
|
||||
from pdfminer.layout import LAParams
|
||||
|
@ -96,8 +97,11 @@ def main(argv):
|
|||
return usage()
|
||||
for fname in args:
|
||||
fp = file(fname, 'rb')
|
||||
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
|
||||
caching=caching, check_extractable=True)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for page in PDFPage.get_pages(fp, pagenos,
|
||||
maxpages=maxpages, password=password,
|
||||
caching=caching, check_extractable=True):
|
||||
interpreter.process_page(page)
|
||||
fp.close()
|
||||
device.close()
|
||||
outfp.close()
|
||||
|
|
Loading…
Reference in New Issue