Applied a patch by Axel Kaiser.
parent
636d4caeb3
commit
d7c4ff28e9
|
@ -247,6 +247,7 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
for (start, nobjs) in self.ranges:
|
||||
if start <= objid and objid < start+nobjs:
|
||||
index += objid - start
|
||||
break
|
||||
else:
|
||||
index += nobjs
|
||||
else:
|
||||
|
@ -276,15 +277,15 @@ class PDFDocument(object):
|
|||
dynamically import the data as processing goes.
|
||||
|
||||
Typical usage:
|
||||
doc = PDFDocument(parser)
|
||||
doc.initialize(password)
|
||||
doc = PDFDocument(parser, password)
|
||||
obj = doc.getobj(objid)
|
||||
|
||||
"""
|
||||
|
||||
debug = 0
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
def __init__(self, parser, caching=True, fallback=True):
|
||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||
"Set the document to use a given PDFParser object."
|
||||
self.caching = caching
|
||||
self.xrefs = []
|
||||
|
@ -297,6 +298,7 @@ class PDFDocument(object):
|
|||
self._parsed_objs = {}
|
||||
self._parser = parser
|
||||
self._parser.set_document(self)
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
# Retrieve the information of each header that was appended
|
||||
# (maybe multiple times) at the end of the document.
|
||||
try:
|
||||
|
@ -318,6 +320,7 @@ class PDFDocument(object):
|
|||
#assert not self.encryption
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
self._initialize_password(password)
|
||||
if 'Info' in trailer:
|
||||
self.info.append(dict_value(trailer['Info']))
|
||||
if 'Root' in trailer:
|
||||
|
@ -331,16 +334,9 @@ class PDFDocument(object):
|
|||
raise PDFSyntaxError('Catalog not found!')
|
||||
return
|
||||
|
||||
# initialize(password='')
|
||||
# _initialize_password(password='')
|
||||
# Perform the initialization with a given password.
|
||||
# This step is mandatory even if there's no password associated
|
||||
# with the document.
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
def initialize(self, password=''):
|
||||
if not self.encryption:
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
return
|
||||
def _initialize_password(self, password=''):
|
||||
(docid, param) = self.encryption
|
||||
if literal_name(param.get('Filter')) != 'Standard':
|
||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||
|
|
|
@ -13,6 +13,9 @@ LITERAL_PAGE = LIT('Page')
|
|||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
@ -110,9 +113,6 @@ class PDFPage(object):
|
|||
pass
|
||||
return
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def get_pages(klass, fp,
|
||||
pagenos=None, maxpages=0, password='',
|
||||
|
@ -120,13 +120,10 @@ class PDFPage(object):
|
|||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
# Create a PDF document object that stores the document structure.
|
||||
doc = PDFDocument(parser, caching=caching)
|
||||
# Supply the document password for initialization.
|
||||
# (If no password is set, give an empty string.)
|
||||
doc.initialize(password)
|
||||
doc = PDFDocument(parser, password=password, caching=caching)
|
||||
# Check if the document allows text extraction. If not, abort.
|
||||
if check_extractable and not doc.is_extractable:
|
||||
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos):
|
||||
|
|
Loading…
Reference in New Issue