disable caching support
parent
18e782f330
commit
4918d59bc2
|
@ -9,7 +9,7 @@
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Feb 27 10:51:18 UTC 2011
|
Last Modified: Wed Mar 2 15:03:42 UTC 2011
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -263,6 +263,10 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
||||||
<td style="border-top:1px blue solid" align=right>↑</td>
|
<td style="border-top:1px blue solid" align=right>↑</td>
|
||||||
</tr></table>
|
</tr></table>
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-C</code>
|
||||||
|
<dd> Suppress object caching.
|
||||||
|
This reduces memory consumption but also slows down processing.
|
||||||
|
<p>
|
||||||
<dt> <code>-n</code>
|
<dt> <code>-n</code>
|
||||||
<dd> Suppress layout analysis.
|
<dd> Suppress layout analysis.
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -131,8 +131,9 @@ class PDFResourceManager(object):
|
||||||
"""
|
"""
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, caching=True):
|
||||||
self.fonts = {}
|
self.caching = caching
|
||||||
|
self._cached_fonts = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_procset(self, procs):
|
def get_procset(self, procs):
|
||||||
|
@ -154,8 +155,8 @@ class PDFResourceManager(object):
|
||||||
return CMap()
|
return CMap()
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self._cached_fonts:
|
||||||
font = self.fonts[objid]
|
font = self._cached_fonts[objid]
|
||||||
else:
|
else:
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
|
print >>sys.stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
|
||||||
|
@ -194,8 +195,8 @@ class PDFResourceManager(object):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFFontError('Invalid Font spec: %r' % spec)
|
raise PDFFontError('Invalid Font spec: %r' % spec)
|
||||||
font = PDFType1Font(self, spec) # this is so wrong!
|
font = PDFType1Font(self, spec) # this is so wrong!
|
||||||
if objid:
|
if objid and self.caching:
|
||||||
self.fonts[objid] = font
|
self._cached_fonts[objid] = font
|
||||||
return font
|
return font
|
||||||
|
|
||||||
|
|
||||||
|
@ -809,11 +810,11 @@ class PDFPageInterpreter(object):
|
||||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||||
|
|
||||||
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
|
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
|
||||||
check_extractable=True):
|
caching=True, check_extractable=True):
|
||||||
# Create a PDF parser object associated with the file object.
|
# Create a PDF parser object associated with the file object.
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
# Create a PDF document object that stores the document structure.
|
# Create a PDF document object that stores the document structure.
|
||||||
doc = PDFDocument()
|
doc = PDFDocument(caching=caching)
|
||||||
# Connect the parser and document objects.
|
# Connect the parser and document objects.
|
||||||
parser.set_document(doc)
|
parser.set_document(doc)
|
||||||
doc.set_parser(parser)
|
doc.set_parser(parser)
|
||||||
|
|
|
@ -296,15 +296,16 @@ class PDFDocument(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, caching=True):
|
||||||
|
self.caching = caching
|
||||||
self.xrefs = []
|
self.xrefs = []
|
||||||
self.objs = {}
|
|
||||||
self.parsed_objs = {}
|
|
||||||
self.info = []
|
self.info = []
|
||||||
self.catalog = None
|
self.catalog = None
|
||||||
self.encryption = None
|
self.encryption = None
|
||||||
self.decipher = None
|
self.decipher = None
|
||||||
self._parser = None
|
self._parser = None
|
||||||
|
self._cached_objs = {}
|
||||||
|
self._parsed_objs = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser):
|
||||||
|
@ -408,9 +409,9 @@ class PDFDocument(object):
|
||||||
raise PDFException('PDFDocument is not initialized')
|
raise PDFException('PDFDocument is not initialized')
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'getobj: objid=%r' % (objid)
|
print >>sys.stderr, 'getobj: objid=%r' % (objid)
|
||||||
if objid in self.objs:
|
if objid in self._cached_objs:
|
||||||
genno = 0
|
genno = 0
|
||||||
obj = self.objs[objid]
|
obj = self._cached_objs[objid]
|
||||||
else:
|
else:
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
try:
|
try:
|
||||||
|
@ -434,8 +435,8 @@ class PDFDocument(object):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
n = 0
|
n = 0
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self._parsed_objs:
|
||||||
objs = self.parsed_objs[strmid]
|
objs = self._parsed_objs[strmid]
|
||||||
else:
|
else:
|
||||||
parser = PDFStreamParser(stream.get_data())
|
parser = PDFStreamParser(stream.get_data())
|
||||||
parser.set_document(self)
|
parser.set_document(self)
|
||||||
|
@ -446,7 +447,8 @@ class PDFDocument(object):
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
self.parsed_objs[strmid] = objs
|
if self.caching:
|
||||||
|
self._parsed_objs[strmid] = objs
|
||||||
genno = 0
|
genno = 0
|
||||||
i = n*2+index
|
i = n*2+index
|
||||||
try:
|
try:
|
||||||
|
@ -481,7 +483,8 @@ class PDFDocument(object):
|
||||||
return None
|
return None
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
self.objs[objid] = obj
|
if self.caching:
|
||||||
|
self._cached_objs[objid] = obj
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
|
@ -11,12 +11,12 @@ from pdfminer.layout import LAParams
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
|
||||||
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
|
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
|
||||||
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -34,6 +34,7 @@ def main(argv):
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
|
caching = True
|
||||||
showpageno = True
|
showpageno = True
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
|
@ -42,6 +43,7 @@ def main(argv):
|
||||||
elif k == '-m': maxpages = int(v)
|
elif k == '-m': maxpages = int(v)
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
|
elif k == '-C': caching = False
|
||||||
elif k == '-n': laparams = None
|
elif k == '-n': laparams = None
|
||||||
elif k == '-A': laparams.all_texts = True
|
elif k == '-A': laparams.all_texts = True
|
||||||
elif k == '-V': laparams.detect_vertical = True
|
elif k == '-V': laparams.detect_vertical = True
|
||||||
|
@ -62,7 +64,7 @@ def main(argv):
|
||||||
PDFPageInterpreter.debug = debug
|
PDFPageInterpreter.debug = debug
|
||||||
PDFDevice.debug = debug
|
PDFDevice.debug = debug
|
||||||
#
|
#
|
||||||
rsrcmgr = PDFResourceManager()
|
rsrcmgr = PDFResourceManager(caching=caching)
|
||||||
if not outtype:
|
if not outtype:
|
||||||
outtype = 'text'
|
outtype = 'text'
|
||||||
if outfile:
|
if outfile:
|
||||||
|
@ -90,7 +92,7 @@ def main(argv):
|
||||||
for fname in args:
|
for fname in args:
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
|
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
|
||||||
check_extractable=True)
|
caching=caching, check_extractable=True)
|
||||||
fp.close()
|
fp.close()
|
||||||
device.close()
|
device.close()
|
||||||
outfp.close()
|
outfp.close()
|
||||||
|
|
Loading…
Reference in New Issue