diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 7c153a6..c170f1a 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -247,6 +247,7 @@ class PDFXRefStream(PDFBaseXRef): for (start, nobjs) in self.ranges: if start <= objid and objid < start+nobjs: index += objid - start + break else: index += nobjs else: @@ -276,15 +277,15 @@ class PDFDocument(object): dynamically import the data as processing goes. Typical usage: - doc = PDFDocument(parser) - doc.initialize(password) + doc = PDFDocument(parser, password) obj = doc.getobj(objid) """ debug = 0 + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' - def __init__(self, parser, caching=True, fallback=True): + def __init__(self, parser, password='', caching=True, fallback=True): "Set the document to use a given PDFParser object." self.caching = caching self.xrefs = [] @@ -297,6 +298,7 @@ class PDFDocument(object): self._parsed_objs = {} self._parser = parser self._parser.set_document(self) + self.is_printable = self.is_modifiable = self.is_extractable = True # Retrieve the information of each header that was appended # (maybe multiple times) at the end of the document. try: @@ -318,6 +320,7 @@ class PDFDocument(object): #assert not self.encryption self.encryption = (list_value(trailer['ID']), dict_value(trailer['Encrypt'])) + self._initialize_password(password) if 'Info' in trailer: self.info.append(dict_value(trailer['Info'])) if 'Root' in trailer: @@ -331,16 +334,9 @@ class PDFDocument(object): raise PDFSyntaxError('Catalog not found!') return - # initialize(password='') + # _initialize_password(password='') # Perform the initialization with a given password. - # This step is mandatory even if there's no password associated - # with the document. - PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' - - def initialize(self, password=''): - if not self.encryption: - self.is_printable = self.is_modifiable = self.is_extractable = True - return + def _initialize_password(self, password=''): (docid, param) = self.encryption if literal_name(param.get('Filter')) != 'Standard': raise PDFEncryptionError('Unknown filter: param=%r' % param) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index a2906a2..95d12f3 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -13,6 +13,9 @@ LITERAL_PAGE = LIT('Page') LITERAL_PAGES = LIT('Pages') +class PDFTextExtractionNotAllowed(PDFEncryptionError): + pass + ## PDFPage ## class PDFPage(object): @@ -110,9 +113,6 @@ class PDFPage(object): pass return - class PDFTextExtractionNotAllowed(PDFEncryptionError): - pass - @classmethod def get_pages(klass, fp, pagenos=None, maxpages=0, password='', @@ -120,13 +120,10 @@ class PDFPage(object): # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. - doc = PDFDocument(parser, caching=caching) - # Supply the document password for initialization. - # (If no password is set, give an empty string.) - doc.initialize(password) + doc = PDFDocument(parser, password=password, caching=caching) # Check if the document allows text extraction. If not, abort. if check_extractable and not doc.is_extractable: - raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) + raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) # Process each page contained in the document. for (pageno, page) in enumerate(klass.create_pages(doc)): if pagenos and (pageno not in pagenos):