disable caching support
parent
18e782f330
commit
4918d59bc2
|
@ -9,7 +9,7 @@
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Feb 27 10:51:18 UTC 2011
|
||||
Last Modified: Wed Mar 2 15:03:42 UTC 2011
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
|||
<td style="border-top:1px blue solid" align=right>↑</td>
|
||||
</tr></table>
|
||||
<p>
|
||||
<dt> <code>-C</code>
|
||||
<dd> Suppress object caching.
|
||||
This will reduce memory consumption but also slow down the process.
|
||||
<p>
|
||||
<dt> <code>-n</code>
|
||||
<dd> Suppress layout analysis.
|
||||
<p>
|
||||
|
|
|
@ -131,8 +131,9 @@ class PDFResourceManager(object):
|
|||
"""
|
||||
debug = 0
|
||||
|
||||
def __init__(self):
|
||||
self.fonts = {}
|
||||
def __init__(self, caching=True):
|
||||
self.caching = caching
|
||||
self._cached_fonts = {}
|
||||
return
|
||||
|
||||
def get_procset(self, procs):
|
||||
|
@ -154,8 +155,8 @@ class PDFResourceManager(object):
|
|||
return CMap()
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
if objid and objid in self.fonts:
|
||||
font = self.fonts[objid]
|
||||
if objid and objid in self._cached_fonts:
|
||||
font = self._cached_fonts[objid]
|
||||
else:
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
|
||||
|
@ -194,8 +195,8 @@ class PDFResourceManager(object):
|
|||
if STRICT:
|
||||
raise PDFFontError('Invalid Font spec: %r' % spec)
|
||||
font = PDFType1Font(self, spec) # this is so wrong!
|
||||
if objid:
|
||||
self.fonts[objid] = font
|
||||
if objid and self.caching:
|
||||
self._cached_fonts[objid] = font
|
||||
return font
|
||||
|
||||
|
||||
|
@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
|
|||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||
|
||||
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
|
||||
check_extractable=True):
|
||||
caching=True, check_extractable=True):
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
# Create a PDF document object that stores the document structure.
|
||||
doc = PDFDocument()
|
||||
doc = PDFDocument(caching=caching)
|
||||
# Connect the parser and document objects.
|
||||
parser.set_document(doc)
|
||||
doc.set_parser(parser)
|
||||
|
|
|
@ -296,15 +296,16 @@ class PDFDocument(object):
|
|||
|
||||
debug = 0
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, caching=True):
|
||||
self.caching = caching
|
||||
self.xrefs = []
|
||||
self.objs = {}
|
||||
self.parsed_objs = {}
|
||||
self.info = []
|
||||
self.catalog = None
|
||||
self.encryption = None
|
||||
self.decipher = None
|
||||
self._parser = None
|
||||
self._cached_objs = {}
|
||||
self._parsed_objs = {}
|
||||
return
|
||||
|
||||
def set_parser(self, parser):
|
||||
|
@ -408,9 +409,9 @@ class PDFDocument(object):
|
|||
raise PDFException('PDFDocument is not initialized')
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'getobj: objid=%r' % (objid)
|
||||
if objid in self.objs:
|
||||
if objid in self._cached_objs:
|
||||
genno = 0
|
||||
obj = self.objs[objid]
|
||||
obj = self._cached_objs[objid]
|
||||
else:
|
||||
for xref in self.xrefs:
|
||||
try:
|
||||
|
@ -434,8 +435,8 @@ class PDFDocument(object):
|
|||
if STRICT:
|
||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||
n = 0
|
||||
if strmid in self.parsed_objs:
|
||||
objs = self.parsed_objs[strmid]
|
||||
if strmid in self._parsed_objs:
|
||||
objs = self._parsed_objs[strmid]
|
||||
else:
|
||||
parser = PDFStreamParser(stream.get_data())
|
||||
parser.set_document(self)
|
||||
|
@ -446,7 +447,8 @@ class PDFDocument(object):
|
|||
objs.append(obj)
|
||||
except PSEOF:
|
||||
pass
|
||||
self.parsed_objs[strmid] = objs
|
||||
if self.caching:
|
||||
self._parsed_objs[strmid] = objs
|
||||
genno = 0
|
||||
i = n*2+index
|
||||
try:
|
||||
|
@ -481,7 +483,8 @@ class PDFDocument(object):
|
|||
return None
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||
self.objs[objid] = obj
|
||||
if self.caching:
|
||||
self._cached_objs[objid] = obj
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
return obj
|
||||
|
|
|
@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
|
||||
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
|
||||
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -34,6 +34,7 @@ def main(argv):
|
|||
codec = 'utf-8'
|
||||
pageno = 1
|
||||
scale = 1
|
||||
caching = True
|
||||
showpageno = True
|
||||
laparams = LAParams()
|
||||
for (k, v) in opts:
|
||||
|
@ -42,6 +43,7 @@ def main(argv):
|
|||
elif k == '-m': maxpages = int(v)
|
||||
elif k == '-P': password = v
|
||||
elif k == '-o': outfile = v
|
||||
elif k == '-C': caching = False
|
||||
elif k == '-n': laparams = None
|
||||
elif k == '-A': laparams.all_texts = True
|
||||
elif k == '-V': laparams.detect_vertical = True
|
||||
|
@ -62,7 +64,7 @@ def main(argv):
|
|||
PDFPageInterpreter.debug = debug
|
||||
PDFDevice.debug = debug
|
||||
#
|
||||
rsrcmgr = PDFResourceManager()
|
||||
rsrcmgr = PDFResourceManager(caching=caching)
|
||||
if not outtype:
|
||||
outtype = 'text'
|
||||
if outfile:
|
||||
|
@ -90,7 +92,7 @@ def main(argv):
|
|||
for fname in args:
|
||||
fp = file(fname, 'rb')
|
||||
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
|
||||
check_extractable=True)
|
||||
caching=caching, check_extractable=True)
|
||||
fp.close()
|
||||
device.close()
|
||||
outfp.close()
|
||||
|
|
Loading…
Reference in New Issue