disable caching support

pull/1/head
Yusuke Shinyama 2011-03-03 00:04:43 +09:00
parent 18e782f330
commit 4918d59bc2
4 changed files with 32 additions and 22 deletions

View File

@ -9,7 +9,7 @@
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Feb 27 10:51:18 UTC 2011 Last Modified: Wed Mar 2 15:03:42 UTC 2011
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
<td style="border-top:1px blue solid" align=right>&uarr;</td> <td style="border-top:1px blue solid" align=right>&uarr;</td>
</tr></table> </tr></table>
<p> <p>
<dt> <code>-C</code>
<dd> Suppress object caching.
This reduces memory consumption but also slows down processing.
<p>
<dt> <code>-n</code> <dt> <code>-n</code>
<dd> Suppress layout analysis. <dd> Suppress layout analysis.
<p> <p>

View File

@ -131,8 +131,9 @@ class PDFResourceManager(object):
""" """
debug = 0 debug = 0
def __init__(self): def __init__(self, caching=True):
self.fonts = {} self.caching = caching
self._cached_fonts = {}
return return
def get_procset(self, procs): def get_procset(self, procs):
@ -154,8 +155,8 @@ class PDFResourceManager(object):
return CMap() return CMap()
def get_font(self, objid, spec): def get_font(self, objid, spec):
if objid and objid in self.fonts: if objid and objid in self._cached_fonts:
font = self.fonts[objid] font = self._cached_fonts[objid]
else: else:
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec) print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
@ -194,8 +195,8 @@ class PDFResourceManager(object):
if STRICT: if STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec) raise PDFFontError('Invalid Font spec: %r' % spec)
font = PDFType1Font(self, spec) # this is so wrong! font = PDFType1Font(self, spec) # this is so wrong!
if objid: if objid and self.caching:
self.fonts[objid] = font self._cached_fonts[objid] = font
return font return font
@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
check_extractable=True): caching=True, check_extractable=True):
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(fp) parser = PDFParser(fp)
# Create a PDF document object that stores the document structure. # Create a PDF document object that stores the document structure.
doc = PDFDocument() doc = PDFDocument(caching=caching)
# Connect the parser and document objects. # Connect the parser and document objects.
parser.set_document(doc) parser.set_document(doc)
doc.set_parser(parser) doc.set_parser(parser)

View File

@ -296,15 +296,16 @@ class PDFDocument(object):
debug = 0 debug = 0
def __init__(self): def __init__(self, caching=True):
self.caching = caching
self.xrefs = [] self.xrefs = []
self.objs = {}
self.parsed_objs = {}
self.info = [] self.info = []
self.catalog = None self.catalog = None
self.encryption = None self.encryption = None
self.decipher = None self.decipher = None
self._parser = None self._parser = None
self._cached_objs = {}
self._parsed_objs = {}
return return
def set_parser(self, parser): def set_parser(self, parser):
@ -408,9 +409,9 @@ class PDFDocument(object):
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'getobj: objid=%r' % (objid) print >>sys.stderr, 'getobj: objid=%r' % (objid)
if objid in self.objs: if objid in self._cached_objs:
genno = 0 genno = 0
obj = self.objs[objid] obj = self._cached_objs[objid]
else: else:
for xref in self.xrefs: for xref in self.xrefs:
try: try:
@ -434,8 +435,8 @@ class PDFDocument(object):
if STRICT: if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0 n = 0
if strmid in self.parsed_objs: if strmid in self._parsed_objs:
objs = self.parsed_objs[strmid] objs = self._parsed_objs[strmid]
else: else:
parser = PDFStreamParser(stream.get_data()) parser = PDFStreamParser(stream.get_data())
parser.set_document(self) parser.set_document(self)
@ -446,7 +447,8 @@ class PDFDocument(object):
objs.append(obj) objs.append(obj)
except PSEOF: except PSEOF:
pass pass
self.parsed_objs[strmid] = objs if self.caching:
self._parsed_objs[strmid] = objs
genno = 0 genno = 0
i = n*2+index i = n*2+index
try: try:
@ -481,7 +483,8 @@ class PDFDocument(object):
return None return None
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj) print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj if self.caching:
self._cached_objs[objid] = obj
if self.decipher: if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj

View File

@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] ' '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:') (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -34,6 +34,7 @@ def main(argv):
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
scale = 1 scale = 1
caching = True
showpageno = True showpageno = True
laparams = LAParams() laparams = LAParams()
for (k, v) in opts: for (k, v) in opts:
@ -42,6 +43,7 @@ def main(argv):
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-C': caching = False
elif k == '-n': laparams = None elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True elif k == '-V': laparams.detect_vertical = True
@ -62,7 +64,7 @@ def main(argv):
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
rsrcmgr = PDFResourceManager() rsrcmgr = PDFResourceManager(caching=caching)
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
if outfile: if outfile:
@ -90,7 +92,7 @@ def main(argv):
for fname in args: for fname in args:
fp = file(fname, 'rb') fp = file(fname, 'rb')
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password, process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
check_extractable=True) caching=caching, check_extractable=True)
fp.close() fp.close()
device.close() device.close()
outfp.close() outfp.close()