API change: process_pdf -> PDFPage.get_pages

2013-10-22 18:59:16 +09:00 · 2013-10-22 18:59:16 +09:00 · d3730a29ec
parent 8a70a9f657
commit d3730a29ec
3 changed files with 33 additions and 31 deletions
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -24,7 +24,6 @@ from pdfcolor import PDFColorSpace
 from pdfcolor import PREDEFINED_COLORSPACE
 from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
 from pdfcolor import LITERAL_DEVICE_CMYK
-from pdfpage import PDFPage
 from utils import choplist
 from utils import mult_matrix, MATRIX_IDENTITY

@@ -804,29 +803,3 @@ class PDFPageInterpreter(object):
            else:
                self.push(obj)
        return
-
-
-##  process_pdf
-##
-class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
-
-def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
-                caching=True, check_extractable=True):
-    # Create a PDF parser object associated with the file object.
-    parser = PDFParser(fp)
-    # Create a PDF document object that stores the document structure.
-    doc = PDFDocument(parser, caching=caching)
-    # Supply the document password for initialization.
-    # (If no password is set, give an empty string.)
-    doc.initialize(password)
-    # Check if the document allows text extraction. If not, abort.
-    if check_extractable and not doc.is_extractable:
-        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
-    # Create a PDF interpreter object.
-    interpreter = PDFPageInterpreter(rsrcmgr, device)
-    # Process each page contained in the document.
-    for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
-        if pagenos and (pageno not in pagenos): continue
-        interpreter.process_page(page)
-        if maxpages and maxpages <= pageno+1: break
-    return
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -5,7 +5,9 @@ from pdftypes import PDFObjectNotFound
 from pdftypes import resolve1
 from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
-
+from pdfparser import PDFParser
+from pdfdocument import PDFDocument
+from pdfdocument import PDFEncryptionError

 # some predefined literals and keywords.
 LITERAL_PAGE = LIT('Page')
@@ -107,3 +109,26 @@ class PDFPage(object):
                    except PDFObjectNotFound:
                        pass
        return
+
+    class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
+
+    @classmethod
+    def get_pages(klass, fp,
+                  pagenos=None, maxpages=0, password='',
+                  caching=True, check_extractable=True):
+        # Create a PDF parser object associated with the file object.
+        parser = PDFParser(fp)
+        # Create a PDF document object that stores the document structure.
+        doc = PDFDocument(parser, caching=caching)
+        # Supply the document password for initialization.
+        # (If no password is set, give an empty string.)
+        doc.initialize(password)
+        # Check if the document allows text extraction. If not, abort.
+        if check_extractable and not doc.is_extractable:
+            raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
+        # Process each page contained in the document.
+        for (pageno,page) in enumerate(klass.create_pages(doc)):
+            if pagenos and (pageno not in pagenos): continue
+            yield page
+            if maxpages and maxpages <= pageno+1: break
+        return
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -2,8 +2,9 @@
 import sys
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.pdfpage import PDFPage
 from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
 from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
@@ -96,8 +97,11 @@ def main(argv):
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
-        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
-                    caching=caching, check_extractable=True)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.get_pages(fp, pagenos,
+                                      maxpages=maxpages, password=password,
+                                      caching=caching, check_extractable=True):
+            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()