Applied a patch by Axel Kaiser.

2014-03-24 20:39:30 +09:00 · 2014-03-24 20:39:30 +09:00 · d7c4ff28e9
parent 636d4caeb3
commit d7c4ff28e9
2 changed files with 13 additions and 20 deletions
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -247,6 +247,7 @@ class PDFXRefStream(PDFBaseXRef):
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                index += objid - start
+                break
            else:
                index += nobjs
        else:
@@ -276,15 +277,15 @@ class PDFDocument(object):
    dynamically import the data as processing goes.

    Typical usage:
-      doc = PDFDocument(parser)
-      doc.initialize(password)
+      doc = PDFDocument(parser, password)
      obj = doc.getobj(objid)

    """

    debug = 0
+    PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'

-    def __init__(self, parser, caching=True, fallback=True):
+    def __init__(self, parser, password='', caching=True, fallback=True):
        "Set the document to use a given PDFParser object."
        self.caching = caching
        self.xrefs = []
@@ -297,6 +298,7 @@ class PDFDocument(object):
        self._parsed_objs = {}
        self._parser = parser
        self._parser.set_document(self)
+        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        try:
@@ -318,6 +320,7 @@ class PDFDocument(object):
                #assert not self.encryption
                self.encryption = (list_value(trailer['ID']),
                                   dict_value(trailer['Encrypt']))
+                self._initialize_password(password)
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
@@ -331,16 +334,9 @@ class PDFDocument(object):
                raise PDFSyntaxError('Catalog not found!')
        return

-    # initialize(password='')
+    # _initialize_password(password='')
    #   Perform the initialization with a given password.
-    #   This step is mandatory even if there's no password associated
-    #   with the document.
-    PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
-
-    def initialize(self, password=''):
-        if not self.encryption:
-            self.is_printable = self.is_modifiable = self.is_extractable = True
-            return
+    def _initialize_password(self, password=''):
        (docid, param) = self.encryption
        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -13,6 +13,9 @@ LITERAL_PAGE = LIT('Page')
 LITERAL_PAGES = LIT('Pages')


+class PDFTextExtractionNotAllowed(PDFEncryptionError):
+    pass
+
 ##  PDFPage
 ##
 class PDFPage(object):
@@ -110,9 +113,6 @@ class PDFPage(object):
                        pass
        return

-    class PDFTextExtractionNotAllowed(PDFEncryptionError):
-        pass
-
    @classmethod
    def get_pages(klass, fp,
                  pagenos=None, maxpages=0, password='',
@@ -120,13 +120,10 @@ class PDFPage(object):
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
-        doc = PDFDocument(parser, caching=caching)
-        # Supply the document password for initialization.
-        # (If no password is set, give an empty string.)
-        doc.initialize(password)
+        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction. If not, abort.
        if check_extractable and not doc.is_extractable:
-            raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
+            raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(klass.create_pages(doc)):
            if pagenos and (pageno not in pagenos):