Add support for disabling caching (-C option)

2011-03-03 00:04:43 +09:00 · 2011-03-03 00:04:43 +09:00 · 4918d59bc2
parent 18e782f330
commit 4918d59bc2
4 changed files with 32 additions and 22 deletions
--- a/docs/index.html
+++ b/docs/index.html
@ -9,7 +9,7 @@

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Feb 27 10:51:18 UTC 2011
+Last Modified: Wed Mar  2 15:03:42 UTC 2011
 <!-- hhmts end -->
 </div>

@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
 <td style="border-top:1px blue solid" align=right>&uarr;</td>
 </tr></table>
 <p>
+<dt> <code>-C</code> 
+<dd> Suppress object caching. 
+This reduces memory consumption but also slows down processing.
+<p>
 <dt> <code>-n</code> 
 <dd> Suppress layout analysis.
 <p>
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -131,8 +131,9 @@ class PDFResourceManager(object):
    """
    debug = 0

-    def __init__(self):
-        self.fonts = {}
+    def __init__(self, caching=True):
+        self.caching = caching
+        self._cached_fonts = {}
        return

    def get_procset(self, procs):
@ -154,8 +155,8 @@ class PDFResourceManager(object):
            return CMap()

    def get_font(self, objid, spec):
-        if objid and objid in self.fonts:
-            font = self.fonts[objid]
+        if objid and objid in self._cached_fonts:
+            font = self._cached_fonts[objid]
        else:
            if 2 <= self.debug:
                print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
@ -194,8 +195,8 @@ class PDFResourceManager(object):
                if STRICT:
                    raise PDFFontError('Invalid Font spec: %r' % spec)
                font = PDFType1Font(self, spec) # this is so wrong!
-            if objid:
-                self.fonts[objid] = font
+            if objid and self.caching:
+                self._cached_fonts[objid] = font
        return font


@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
 class PDFTextExtractionNotAllowed(PDFInterpreterError): pass

 def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
-                check_extractable=True):
+                caching=True, check_extractable=True):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
-    doc = PDFDocument()
+    doc = PDFDocument(caching=caching)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@ -296,15 +296,16 @@ class PDFDocument(object):

    debug = 0

-    def __init__(self):
+    def __init__(self, caching=True):
+        self.caching = caching
        self.xrefs = []
-        self.objs = {}
-        self.parsed_objs = {}
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._parser = None
+        self._cached_objs = {}
+        self._parsed_objs = {}
        return

    def set_parser(self, parser):
@ -408,9 +409,9 @@ class PDFDocument(object):
            raise PDFException('PDFDocument is not initialized')
        if 2 <= self.debug:
            print >>sys.stderr, 'getobj: objid=%r' % (objid)
-        if objid in self.objs:
+        if objid in self._cached_objs:
            genno = 0
-            obj = self.objs[objid]
+            obj = self._cached_objs[objid]
        else:
            for xref in self.xrefs:
                try:
@ -434,8 +435,8 @@ class PDFDocument(object):
                    if STRICT:
                        raise PDFSyntaxError('N is not defined: %r' % stream)
                    n = 0
-                if strmid in self.parsed_objs:
-                    objs = self.parsed_objs[strmid]
+                if strmid in self._parsed_objs:
+                    objs = self._parsed_objs[strmid]
                else:
                    parser = PDFStreamParser(stream.get_data())
                    parser.set_document(self)
@ -446,7 +447,8 @@ class PDFDocument(object):
                            objs.append(obj)
                    except PSEOF:
                        pass
-                    self.parsed_objs[strmid] = objs
+                    if self.caching:
+                        self._parsed_objs[strmid] = objs
                genno = 0
                i = n*2+index
                try:
@ -481,7 +483,8 @@ class PDFDocument(object):
                    return None
            if 2 <= self.debug:
                print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
-            self.objs[objid] = obj
+            if self.caching:
+                self._cached_objs[objid] = obj
        if self.decipher:
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
 def main(argv):
    import getopt
    def usage():
-        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
+        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100
    try:
-        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:')
+        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
@ -34,6 +34,7 @@ def main(argv):
    codec = 'utf-8'
    pageno = 1
    scale = 1
+    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
@ -42,6 +43,7 @@ def main(argv):
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
+        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
@ -62,7 +64,7 @@ def main(argv):
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
-    rsrcmgr = PDFResourceManager()
+    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
@ -90,7 +92,7 @@ def main(argv):
    for fname in args:
        fp = file(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
-                    check_extractable=True)
+                    caching=caching, check_extractable=True)
        fp.close()
    device.close()
    outfp.close()