Add support for disabling object caching (-C option)

pull/1/head
Yusuke Shinyama 2011-03-03 00:04:43 +09:00
parent 18e782f330
commit 4918d59bc2
4 changed files with 32 additions and 22 deletions

View File

@ -9,7 +9,7 @@
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Feb 27 10:51:18 UTC 2011
Last Modified: Wed Mar 2 15:03:42 UTC 2011
<!-- hhmts end -->
</div>
@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
<td style="border-top:1px blue solid" align=right>&uarr;</td>
</tr></table>
<p>
<dt> <code>-C</code>
<dd> Suppress object caching.
This reduces memory consumption but also slows down processing.
<p>
<dt> <code>-n</code>
<dd> Suppress layout analysis.
<p>

View File

@ -131,8 +131,9 @@ class PDFResourceManager(object):
"""
debug = 0
def __init__(self):
self.fonts = {}
def __init__(self, caching=True):
self.caching = caching
self._cached_fonts = {}
return
def get_procset(self, procs):
@ -154,8 +155,8 @@ class PDFResourceManager(object):
return CMap()
def get_font(self, objid, spec):
if objid and objid in self.fonts:
font = self.fonts[objid]
if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid]
else:
if 2 <= self.debug:
print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
@ -194,8 +195,8 @@ class PDFResourceManager(object):
if STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec)
font = PDFType1Font(self, spec) # this is so wrong!
if objid:
self.fonts[objid] = font
if objid and self.caching:
self._cached_fonts[objid] = font
return font
@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
check_extractable=True):
caching=True, check_extractable=True):
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
doc = PDFDocument()
doc = PDFDocument(caching=caching)
# Connect the parser and document objects.
parser.set_document(doc)
doc.set_parser(parser)

View File

@ -296,15 +296,16 @@ class PDFDocument(object):
debug = 0
def __init__(self):
def __init__(self, caching=True):
self.caching = caching
self.xrefs = []
self.objs = {}
self.parsed_objs = {}
self.info = []
self.catalog = None
self.encryption = None
self.decipher = None
self._parser = None
self._cached_objs = {}
self._parsed_objs = {}
return
def set_parser(self, parser):
@ -408,9 +409,9 @@ class PDFDocument(object):
raise PDFException('PDFDocument is not initialized')
if 2 <= self.debug:
print >>sys.stderr, 'getobj: objid=%r' % (objid)
if objid in self.objs:
if objid in self._cached_objs:
genno = 0
obj = self.objs[objid]
obj = self._cached_objs[objid]
else:
for xref in self.xrefs:
try:
@ -434,8 +435,8 @@ class PDFDocument(object):
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
n = 0
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
if strmid in self._parsed_objs:
objs = self._parsed_objs[strmid]
else:
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
@ -446,7 +447,8 @@ class PDFDocument(object):
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[strmid] = objs
if self.caching:
self._parsed_objs[strmid] = objs
genno = 0
i = n*2+index
try:
@ -481,7 +483,8 @@ class PDFDocument(object):
return None
if 2 <= self.debug:
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj
if self.caching:
self._cached_objs[objid] = obj
if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj)
return obj

View File

@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
def main(argv):
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:')
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -34,6 +34,7 @@ def main(argv):
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
for (k, v) in opts:
@ -42,6 +43,7 @@ def main(argv):
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-C': caching = False
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
@ -62,7 +64,7 @@ def main(argv):
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager()
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'text'
if outfile:
@ -90,7 +92,7 @@ def main(argv):
for fname in args:
fp = file(fname, 'rb')
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
check_extractable=True)
caching=caching, check_extractable=True)
fp.close()
device.close()
outfp.close()