From 4918d59bc2616a3f6731840269bd53c7912882ad Mon Sep 17 00:00:00 2001
From: Yusuke Shinyama <yusuke@cs.nyu.edu>
Date: Thu, 3 Mar 2011 00:04:43 +0900
Subject: [PATCH] add support for disabling caching

---
 docs/index.html       |  6 +++++-
 pdfminer/pdfinterp.py | 17 +++++++++--------
 pdfminer/pdfparser.py | 21 ++++++++++++---------
 tools/pdf2txt.py      | 10 ++++++----
 4 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/docs/index.html b/docs/index.html
index 9017298..6e24e87 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -9,7 +9,7 @@
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Feb 27 10:51:18 UTC 2011
+Last Modified: Wed Mar  2 15:03:42 UTC 2011
 <!-- hhmts end -->
 </div>
 
@@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
 <td style="border-top:1px blue solid" align=right>&uarr;</td>
 </tr></table>
 <p>
+<dt> <code>-C</code> 
+<dd> Suppress object caching. 
+This reduces memory consumption but also slows down processing.
+<p>
 <dt> <code>-n</code> 
 <dd> Suppress layout analysis.
 <p>
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index d24300a..f5c8006 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -131,8 +131,9 @@ class PDFResourceManager(object):
     """
     debug = 0
 
-    def __init__(self):
-        self.fonts = {}
+    def __init__(self, caching=True):
+        self.caching = caching
+        self._cached_fonts = {}
         return
 
     def get_procset(self, procs):
@@ -154,8 +155,8 @@ class PDFResourceManager(object):
             return CMap()
 
     def get_font(self, objid, spec):
-        if objid and objid in self.fonts:
-            font = self.fonts[objid]
+        if objid and objid in self._cached_fonts:
+            font = self._cached_fonts[objid]
         else:
             if 2 <= self.debug:
                 print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
@@ -194,8 +195,8 @@ class PDFResourceManager(object):
                 if STRICT:
                     raise PDFFontError('Invalid Font spec: %r' % spec)
                 font = PDFType1Font(self, spec) # this is so wrong!
-            if objid:
-                self.fonts[objid] = font
+            if objid and self.caching:
+                self._cached_fonts[objid] = font
         return font
 
 
@@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
 class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
 
 def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
-                check_extractable=True):
+                caching=True, check_extractable=True):
     # Create a PDF parser object associated with the file object.
     parser = PDFParser(fp)
     # Create a PDF document object that stores the document structure.
-    doc = PDFDocument()
+    doc = PDFDocument(caching=caching)
     # Connect the parser and document objects.
     parser.set_document(doc)
     doc.set_parser(parser)
diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 9ce7fdd..1533826 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -296,15 +296,16 @@ class PDFDocument(object):
 
     debug = 0
 
-    def __init__(self):
+    def __init__(self, caching=True):
+        self.caching = caching
         self.xrefs = []
-        self.objs = {}
-        self.parsed_objs = {}
         self.info = []
         self.catalog = None
         self.encryption = None
         self.decipher = None
         self._parser = None
+        self._cached_objs = {}
+        self._parsed_objs = {}
         return
 
     def set_parser(self, parser):
@@ -408,9 +409,9 @@ class PDFDocument(object):
             raise PDFException('PDFDocument is not initialized')
         if 2 <= self.debug:
             print >>sys.stderr, 'getobj: objid=%r' % (objid)
-        if objid in self.objs:
+        if objid in self._cached_objs:
             genno = 0
-            obj = self.objs[objid]
+            obj = self._cached_objs[objid]
         else:
             for xref in self.xrefs:
                 try:
@@ -434,8 +435,8 @@ class PDFDocument(object):
                     if STRICT:
                         raise PDFSyntaxError('N is not defined: %r' % stream)
                     n = 0
-                if strmid in self.parsed_objs:
-                    objs = self.parsed_objs[strmid]
+                if strmid in self._parsed_objs:
+                    objs = self._parsed_objs[strmid]
                 else:
                     parser = PDFStreamParser(stream.get_data())
                     parser.set_document(self)
@@ -446,7 +447,8 @@ class PDFDocument(object):
                             objs.append(obj)
                     except PSEOF:
                         pass
-                    self.parsed_objs[strmid] = objs
+                    if self.caching:
+                        self._parsed_objs[strmid] = objs
                 genno = 0
                 i = n*2+index
                 try:
@@ -481,7 +483,8 @@ class PDFDocument(object):
                     return None
             if 2 <= self.debug:
                 print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
-            self.objs[objid] = obj
+            if self.caching:
+                self._cached_objs[objid] = obj
         if self.decipher:
             obj = decipher_all(self.decipher, objid, genno, obj)
         return obj
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index dc355d5..9085813 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
 def main(argv):
     import getopt
     def usage():
-        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
+        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
                '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
                '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
         return 100
     try:
-        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:')
+        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
     except getopt.GetoptError:
         return usage()
     if not args: return usage()
@@ -34,6 +34,7 @@ def main(argv):
     codec = 'utf-8'
     pageno = 1
     scale = 1
+    caching = True
     showpageno = True
     laparams = LAParams()
     for (k, v) in opts:
@@ -42,6 +43,7 @@ def main(argv):
         elif k == '-m': maxpages = int(v)
         elif k == '-P': password = v
         elif k == '-o': outfile = v
+        elif k == '-C': caching = False
         elif k == '-n': laparams = None
         elif k == '-A': laparams.all_texts = True
         elif k == '-V': laparams.detect_vertical = True
@@ -62,7 +64,7 @@ def main(argv):
     PDFPageInterpreter.debug = debug
     PDFDevice.debug = debug
     #
-    rsrcmgr = PDFResourceManager()
+    rsrcmgr = PDFResourceManager(caching=caching)
     if not outtype:
         outtype = 'text'
         if outfile:
@@ -90,7 +92,7 @@ def main(argv):
     for fname in args:
         fp = file(fname, 'rb')
         process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
-                    check_extractable=True)
+                    caching=caching, check_extractable=True)
         fp.close()
     device.close()
     outfp.close()