From 4918d59bc2616a3f6731840269bd53c7912882ad Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Thu, 3 Mar 2011 00:04:43 +0900 Subject: [PATCH] disable caching support --- docs/index.html | 6 +++++- pdfminer/pdfinterp.py | 17 +++++++++-------- pdfminer/pdfparser.py | 21 ++++++++++++--------- tools/pdf2txt.py | 10 ++++++---- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/docs/index.html b/docs/index.html index 9017298..6e24e87 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@
-Last Modified: Sun Feb 27 10:51:18 UTC 2011 +Last Modified: Wed Mar 2 15:03:42 UTC 2011
@@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively. ↑

+

-C +
Suppress object caching. +This reduces memory consumption but also slows down processing. +

-n
Suppress layout analysis.

diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index d24300a..f5c8006 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -131,8 +131,9 @@ class PDFResourceManager(object): """ debug = 0 - def __init__(self): - self.fonts = {} + def __init__(self, caching=True): + self.caching = caching + self._cached_fonts = {} return def get_procset(self, procs): @@ -154,8 +155,8 @@ class PDFResourceManager(object): return CMap() def get_font(self, objid, spec): - if objid and objid in self.fonts: - font = self.fonts[objid] + if objid and objid in self._cached_fonts: + font = self._cached_fonts[objid] else: if 2 <= self.debug: print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec) @@ -194,8 +195,8 @@ class PDFResourceManager(object): if STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) font = PDFType1Font(self, spec) # this is so wrong! - if objid: - self.fonts[objid] = font + if objid and self.caching: + self._cached_fonts[objid] = font return font @@ -809,11 +810,11 @@ class PDFPageInterpreter(object): class PDFTextExtractionNotAllowed(PDFInterpreterError): pass def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', - check_extractable=True): + caching=True, check_extractable=True): # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. - doc = PDFDocument() + doc = PDFDocument(caching=caching) # Connect the parser and document objects. 
parser.set_document(doc) doc.set_parser(parser) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 9ce7fdd..1533826 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -296,15 +296,16 @@ class PDFDocument(object): debug = 0 - def __init__(self): + def __init__(self, caching=True): + self.caching = caching self.xrefs = [] - self.objs = {} - self.parsed_objs = {} self.info = [] self.catalog = None self.encryption = None self.decipher = None self._parser = None + self._cached_objs = {} + self._parsed_objs = {} return def set_parser(self, parser): @@ -408,9 +409,9 @@ class PDFDocument(object): raise PDFException('PDFDocument is not initialized') if 2 <= self.debug: print >>sys.stderr, 'getobj: objid=%r' % (objid) - if objid in self.objs: + if objid in self._cached_objs: genno = 0 - obj = self.objs[objid] + obj = self._cached_objs[objid] else: for xref in self.xrefs: try: @@ -434,8 +435,8 @@ class PDFDocument(object): if STRICT: raise PDFSyntaxError('N is not defined: %r' % stream) n = 0 - if strmid in self.parsed_objs: - objs = self.parsed_objs[strmid] + if strmid in self._parsed_objs: + objs = self._parsed_objs[strmid] else: parser = PDFStreamParser(stream.get_data()) parser.set_document(self) @@ -446,7 +447,8 @@ class PDFDocument(object): objs.append(obj) except PSEOF: pass - self.parsed_objs[strmid] = objs + if self.caching: + self._parsed_objs[strmid] = objs genno = 0 i = n*2+index try: @@ -481,7 +483,8 @@ class PDFDocument(object): return None if 2 <= self.debug: print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) - self.objs[objid] = obj + if self.caching: + self._cached_objs[objid] = obj if self.decipher: obj = decipher_all(self.decipher, objid, genno, obj) return obj diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index dc355d5..9085813 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -11,12 +11,12 @@ from pdfminer.layout import LAParams def main(argv): import getopt def usage(): - print ('usage: %s [-d] [-p pagenos] 
[-m maxpages] [-P password] [-o output] ' + print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] ' '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] ' '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:') + (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:') except getopt.GetoptError: return usage() if not args: return usage() @@ -34,6 +34,7 @@ def main(argv): codec = 'utf-8' pageno = 1 scale = 1 + caching = True showpageno = True laparams = LAParams() for (k, v) in opts: @@ -42,6 +43,7 @@ def main(argv): elif k == '-m': maxpages = int(v) elif k == '-P': password = v elif k == '-o': outfile = v + elif k == '-C': caching = False elif k == '-n': laparams = None elif k == '-A': laparams.all_texts = True elif k == '-V': laparams.detect_vertical = True @@ -62,7 +64,7 @@ def main(argv): PDFPageInterpreter.debug = debug PDFDevice.debug = debug # - rsrcmgr = PDFResourceManager() + rsrcmgr = PDFResourceManager(caching=caching) if not outtype: outtype = 'text' if outfile: @@ -90,7 +92,7 @@ def main(argv): for fname in args: fp = file(fname, 'rb') process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password, - check_extractable=True) + caching=caching, check_extractable=True) fp.close() device.close() outfp.close()