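Applied a patch by Axel Kaiser: PDFDocument now takes the password in its constructor (PDFDocument(parser, password)) instead of requiring a separate doc.initialize(password) call; initialize() becomes the private _initialize_password(), called from the constructor once the encryption parameters are read from the trailer. The is_printable/is_modifiable/is_extractable flags now default to True in the constructor, PASSWORD_PADDING becomes a class attribute, and PDFTextExtractionNotAllowed moves from inside PDFPage to module level.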
parent
636d4caeb3
commit
d7c4ff28e9
|
@ -247,6 +247,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
for (start, nobjs) in self.ranges:
|
for (start, nobjs) in self.ranges:
|
||||||
if start <= objid and objid < start+nobjs:
|
if start <= objid and objid < start+nobjs:
|
||||||
index += objid - start
|
index += objid - start
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
index += nobjs
|
index += nobjs
|
||||||
else:
|
else:
|
||||||
|
@ -276,15 +277,15 @@ class PDFDocument(object):
|
||||||
dynamically import the data as processing goes.
|
dynamically import the data as processing goes.
|
||||||
|
|
||||||
Typical usage:
|
Typical usage:
|
||||||
doc = PDFDocument(parser)
|
doc = PDFDocument(parser, password)
|
||||||
doc.initialize(password)
|
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
|
|
||||||
def __init__(self, parser, caching=True, fallback=True):
|
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||||
"Set the document to use a given PDFParser object."
|
"Set the document to use a given PDFParser object."
|
||||||
self.caching = caching
|
self.caching = caching
|
||||||
self.xrefs = []
|
self.xrefs = []
|
||||||
|
@ -297,6 +298,7 @@ class PDFDocument(object):
|
||||||
self._parsed_objs = {}
|
self._parsed_objs = {}
|
||||||
self._parser = parser
|
self._parser = parser
|
||||||
self._parser.set_document(self)
|
self._parser.set_document(self)
|
||||||
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
# Retrieve the information of each header that was appended
|
# Retrieve the information of each header that was appended
|
||||||
# (maybe multiple times) at the end of the document.
|
# (maybe multiple times) at the end of the document.
|
||||||
try:
|
try:
|
||||||
|
@ -318,6 +320,7 @@ class PDFDocument(object):
|
||||||
#assert not self.encryption
|
#assert not self.encryption
|
||||||
self.encryption = (list_value(trailer['ID']),
|
self.encryption = (list_value(trailer['ID']),
|
||||||
dict_value(trailer['Encrypt']))
|
dict_value(trailer['Encrypt']))
|
||||||
|
self._initialize_password(password)
|
||||||
if 'Info' in trailer:
|
if 'Info' in trailer:
|
||||||
self.info.append(dict_value(trailer['Info']))
|
self.info.append(dict_value(trailer['Info']))
|
||||||
if 'Root' in trailer:
|
if 'Root' in trailer:
|
||||||
|
@ -331,16 +334,9 @@ class PDFDocument(object):
|
||||||
raise PDFSyntaxError('Catalog not found!')
|
raise PDFSyntaxError('Catalog not found!')
|
||||||
return
|
return
|
||||||
|
|
||||||
# initialize(password='')
|
# _initialize_password(password='')
|
||||||
# Perform the initialization with a given password.
|
# Perform the initialization with a given password.
|
||||||
# This step is mandatory even if there's no password associated
|
def _initialize_password(self, password=''):
|
||||||
# with the document.
|
|
||||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
|
||||||
|
|
||||||
def initialize(self, password=''):
|
|
||||||
if not self.encryption:
|
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
|
||||||
return
|
|
||||||
(docid, param) = self.encryption
|
(docid, param) = self.encryption
|
||||||
if literal_name(param.get('Filter')) != 'Standard':
|
if literal_name(param.get('Filter')) != 'Standard':
|
||||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||||
|
|
|
@ -13,6 +13,9 @@ LITERAL_PAGE = LIT('Page')
|
||||||
LITERAL_PAGES = LIT('Pages')
|
LITERAL_PAGES = LIT('Pages')
|
||||||
|
|
||||||
|
|
||||||
|
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||||
|
pass
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
class PDFPage(object):
|
class PDFPage(object):
|
||||||
|
@ -110,9 +113,6 @@ class PDFPage(object):
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_pages(klass, fp,
|
def get_pages(klass, fp,
|
||||||
pagenos=None, maxpages=0, password='',
|
pagenos=None, maxpages=0, password='',
|
||||||
|
@ -120,13 +120,10 @@ class PDFPage(object):
|
||||||
# Create a PDF parser object associated with the file object.
|
# Create a PDF parser object associated with the file object.
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
# Create a PDF document object that stores the document structure.
|
# Create a PDF document object that stores the document structure.
|
||||||
doc = PDFDocument(parser, caching=caching)
|
doc = PDFDocument(parser, password=password, caching=caching)
|
||||||
# Supply the document password for initialization.
|
|
||||||
# (If no password is set, give an empty string.)
|
|
||||||
doc.initialize(password)
|
|
||||||
# Check if the document allows text extraction. If not, abort.
|
# Check if the document allows text extraction. If not, abort.
|
||||||
if check_extractable and not doc.is_extractable:
|
if check_extractable and not doc.is_extractable:
|
||||||
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||||
# Process each page contained in the document.
|
# Process each page contained in the document.
|
||||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||||
if pagenos and (pageno not in pagenos):
|
if pagenos and (pageno not in pagenos):
|
||||||
|
|
Loading…
Reference in New Issue