From 2bf9c2380123535005c0fa4b1551f705cfc5189c Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Tue, 23 Nov 2010 10:53:28 +0000 Subject: [PATCH] check_extractable parameter added git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@276 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/pdfinterp.py | 5 +++-- tools/pdf2txt.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 2c68846..0c0b5b0 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -808,7 +808,8 @@ class PDFPageInterpreter(object): ## class PDFTextExtractionNotAllowed(PDFInterpreterError): pass -def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''): +def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', + check_extractable=True): # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. @@ -820,7 +821,7 @@ def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''): # (If no password is set, give an empty string.) doc.initialize(password) # Check if the document allows text extraction. If not, abort. - if not doc.is_extractable: + if check_extractable and not doc.is_extractable: raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 5960f85..1704ffa 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -87,7 +87,8 @@ def main(argv): return usage() for fname in args: fp = file(fname, 'rb') - process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password) + process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password, + check_extractable=True) fp.close() device.close() outfp.close()