diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 357ca16..e80b1bd 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -24,7 +24,6 @@ from pdfcolor import PDFColorSpace from pdfcolor import PREDEFINED_COLORSPACE from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from pdfcolor import LITERAL_DEVICE_CMYK -from pdfpage import PDFPage from utils import choplist from utils import mult_matrix, MATRIX_IDENTITY @@ -804,29 +803,3 @@ class PDFPageInterpreter(object): else: self.push(obj) return - - -## process_pdf -## -class PDFTextExtractionNotAllowed(PDFInterpreterError): pass - -def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', - caching=True, check_extractable=True): - # Create a PDF parser object associated with the file object. - parser = PDFParser(fp) - # Create a PDF document object that stores the document structure. - doc = PDFDocument(parser, caching=caching) - # Supply the document password for initialization. - # (If no password is set, give an empty string.) - doc.initialize(password) - # Check if the document allows text extraction. If not, abort. - if check_extractable and not doc.is_extractable: - raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) - # Create a PDF interpreter object. - interpreter = PDFPageInterpreter(rsrcmgr, device) - # Process each page contained in the document. - for (pageno,page) in enumerate(PDFPage.create_pages(doc)): - if pagenos and (pageno not in pagenos): continue - interpreter.process_page(page) - if maxpages and maxpages <= pageno+1: break - return diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index c0e1736..5beb7b8 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -5,7 +5,9 @@ from pdftypes import PDFObjectNotFound from pdftypes import resolve1 from pdftypes import int_value, float_value, num_value from pdftypes import str_value, list_value, dict_value, stream_value - +from pdfparser import PDFParser +from pdfdocument import PDFDocument +from pdfdocument import PDFEncryptionError # some predefined literals and keywords. LITERAL_PAGE = LIT('Page') @@ -107,3 +109,26 @@ class PDFPage(object): except PDFObjectNotFound: pass return + + class PDFTextExtractionNotAllowed(PDFEncryptionError): pass + + @classmethod + def get_pages(klass, fp, + pagenos=None, maxpages=0, password='', + caching=True, check_extractable=True): + # Create a PDF parser object associated with the file object. + parser = PDFParser(fp) + # Create a PDF document object that stores the document structure. + doc = PDFDocument(parser, caching=caching) + # Supply the document password for initialization. + # (If no password is set, give an empty string.) + doc.initialize(password) + # Check if the document allows text extraction. If not, abort. + if check_extractable and not doc.is_extractable: + raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + # Process each page contained in the document. + for (pageno,page) in enumerate(klass.create_pages(doc)): + if pagenos and (pageno not in pagenos): continue + yield page + if maxpages and maxpages <= pageno+1: break + return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 76fcecd..2652ec3 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -2,8 +2,9 @@ import sys from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfparser import PDFParser -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfdevice import PDFDevice, TagExtractor +from pdfminer.pdfpage import PDFPage from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter from pdfminer.cmapdb import CMapDB from pdfminer.layout import LAParams @@ -96,8 +97,11 @@ def main(argv): return usage() for fname in args: fp = file(fname, 'rb') - process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password, - caching=caching, check_extractable=True) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.get_pages(fp, pagenos, + maxpages=maxpages, password=password, + caching=caching, check_extractable=True): + interpreter.process_page(page) fp.close() device.close() outfp.close()