Applied a patch by Axel Kaiser.

pull/1/head
Yusuke Shinyama 2014-03-24 20:39:30 +09:00
parent 636d4caeb3
commit d7c4ff28e9
2 changed files with 13 additions and 20 deletions

View File

@@ -247,6 +247,7 @@ class PDFXRefStream(PDFBaseXRef):
for (start, nobjs) in self.ranges: for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs: if start <= objid and objid < start+nobjs:
index += objid - start index += objid - start
break
else: else:
index += nobjs index += nobjs
else: else:
@@ -276,15 +277,15 @@ class PDFDocument(object):
dynamically import the data as processing goes. dynamically import the data as processing goes.
Typical usage: Typical usage:
doc = PDFDocument(parser) doc = PDFDocument(parser, password)
doc.initialize(password)
obj = doc.getobj(objid) obj = doc.getobj(objid)
""" """
debug = 0 debug = 0
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def __init__(self, parser, caching=True, fallback=True): def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object." "Set the document to use a given PDFParser object."
self.caching = caching self.caching = caching
self.xrefs = [] self.xrefs = []
@@ -297,6 +298,7 @@ class PDFDocument(object):
self._parsed_objs = {} self._parsed_objs = {}
self._parser = parser self._parser = parser
self._parser.set_document(self) self._parser.set_document(self)
self.is_printable = self.is_modifiable = self.is_extractable = True
# Retrieve the information of each header that was appended # Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document. # (maybe multiple times) at the end of the document.
try: try:
@@ -318,6 +320,7 @@ class PDFDocument(object):
#assert not self.encryption #assert not self.encryption
self.encryption = (list_value(trailer['ID']), self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt'])) dict_value(trailer['Encrypt']))
self._initialize_password(password)
if 'Info' in trailer: if 'Info' in trailer:
self.info.append(dict_value(trailer['Info'])) self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer: if 'Root' in trailer:
@@ -331,16 +334,9 @@ class PDFDocument(object):
raise PDFSyntaxError('Catalog not found!') raise PDFSyntaxError('Catalog not found!')
return return
# initialize(password='') # _initialize_password(password='')
# Perform the initialization with a given password. # Perform the initialization with a given password.
# This step is mandatory even if there's no password associated def _initialize_password(self, password=''):
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
return
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard': if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param) raise PDFEncryptionError('Unknown filter: param=%r' % param)

View File

@@ -13,6 +13,9 @@ LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages') LITERAL_PAGES = LIT('Pages')
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
## PDFPage ## PDFPage
## ##
class PDFPage(object): class PDFPage(object):
@@ -110,9 +113,6 @@ class PDFPage(object):
pass pass
return return
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
@classmethod @classmethod
def get_pages(klass, fp, def get_pages(klass, fp,
pagenos=None, maxpages=0, password='', pagenos=None, maxpages=0, password='',
@@ -120,13 +120,10 @@ class PDFPage(object):
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(fp) parser = PDFParser(fp)
# Create a PDF document object that stores the document structure. # Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, caching=caching) doc = PDFDocument(parser, password=password, caching=caching)
# Supply the document password for initialization.
# (If no password is set, give an empty string.)
doc.initialize(password)
# Check if the document allows text extraction. If not, abort. # Check if the document allows text extraction. If not, abort.
if check_extractable and not doc.is_extractable: if check_extractable and not doc.is_extractable:
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document. # Process each page contained in the document.
for (pageno, page) in enumerate(klass.create_pages(doc)): for (pageno, page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos): if pagenos and (pageno not in pagenos):