fix xobject resources bug, thanks to Jose Maria
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@209 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
34665b7374
commit
97848409e5
|
@ -14,8 +14,8 @@ from utils import enc, bbox2str, create_bmp
|
|||
##
|
||||
class PDFPageAggregator(PDFTextDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1, laparams=None):
|
||||
PDFTextDevice.__init__(self, rsrc)
|
||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||
PDFTextDevice.__init__(self, rsrcmgr)
|
||||
self.laparams = laparams
|
||||
self.pageno = pageno
|
||||
self.stack = []
|
||||
|
@ -100,8 +100,8 @@ class PDFPageAggregator(PDFTextDevice):
|
|||
##
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||
PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
@ -138,9 +138,9 @@ class PDFConverter(PDFPageAggregator):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
showpageno=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
return
|
||||
|
||||
|
@ -169,9 +169,9 @@ class TextConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, showpageno=True, pagepad=50, outdir=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.outdir = outdir
|
||||
|
@ -261,8 +261,8 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class XMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.outdir = outdir
|
||||
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||
self.outfp.write('<pages>\n')
|
||||
|
@ -349,8 +349,8 @@ class XMLConverter(PDFConverter):
|
|||
##
|
||||
class TagExtractor(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8'):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8'):
|
||||
PDFDevice.__init__(self, rsrcmgr)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
self.pageno = 0
|
||||
|
|
|
@ -11,8 +11,8 @@ class PDFDevice(object):
|
|||
|
||||
debug = 0
|
||||
|
||||
def __init__(self, rsrc):
|
||||
self.rsrc = rsrc
|
||||
def __init__(self, rsrcmgr):
|
||||
self.rsrcmgr = rsrcmgr
|
||||
self.ctm = None
|
||||
return
|
||||
|
||||
|
|
|
@ -419,7 +419,7 @@ class PDFSimpleFont(PDFFont):
|
|||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrc, spec):
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
|
@ -449,7 +449,7 @@ class PDFTrueTypeFont(PDFType1Font):
|
|||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrc, spec):
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 0))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
|
@ -472,7 +472,7 @@ class PDFType3Font(PDFSimpleFont):
|
|||
# PDFCIDFont
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
||||
def __init__(self, rsrc, spec):
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
|
|
|
@ -157,6 +157,8 @@ class PDFResourceManager(object):
|
|||
if objid and objid in self.fonts:
|
||||
font = self.fonts[objid]
|
||||
else:
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
|
||||
if STRICT:
|
||||
if spec['Type'] is not LITERAL_FONT:
|
||||
raise PDFFontError('Type is not /Font')
|
||||
|
@ -297,17 +299,18 @@ class PDFPageInterpreter(object):
|
|||
|
||||
debug = 0
|
||||
|
||||
def __init__(self, rsrc, device):
|
||||
self.rsrc = rsrc
|
||||
def __init__(self, rsrcmgr, device):
|
||||
self.rsrcmgr = rsrcmgr
|
||||
self.device = device
|
||||
return
|
||||
|
||||
def dup(self):
|
||||
return PDFPageInterpreter(self.rsrc, self.device)
|
||||
return PDFPageInterpreter(self.rsrcmgr, self.device)
|
||||
|
||||
# init_resources(resources):
|
||||
# Prepare the fonts and XObjects listed in the Resource attribute.
|
||||
def init_resources(self, resources):
|
||||
self.resources = resources
|
||||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||
|
@ -324,7 +327,7 @@ class PDFPageInterpreter(object):
|
|||
else:
|
||||
return PREDEFINED_COLORSPACE[name]
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
if k == 'Font':
|
||||
for (fontid,spec) in dict_value(v).iteritems():
|
||||
|
@ -332,12 +335,12 @@ class PDFPageInterpreter(object):
|
|||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,spec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||
elif k == 'ProcSet':
|
||||
self.rsrc.get_procset(list_value(v))
|
||||
self.rsrcmgr.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
|
@ -625,6 +628,7 @@ class PDFPageInterpreter(object):
|
|||
try:
|
||||
self.textstate.font = self.fontmap[literal_name(fontid)]
|
||||
except KeyError:
|
||||
raise
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
||||
return
|
||||
|
@ -669,6 +673,10 @@ class PDFPageInterpreter(object):
|
|||
# show-pos
|
||||
def do_TJ(self, seq):
|
||||
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||
if self.textstate.font is None:
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('No font specified!')
|
||||
return
|
||||
self.device.render_string(self.textstate, seq)
|
||||
return
|
||||
# show
|
||||
|
@ -716,8 +724,12 @@ class PDFPageInterpreter(object):
|
|||
interpreter = self.dup()
|
||||
bbox = list_value(xobj['BBox'])
|
||||
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
|
||||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||
# instead of having their own Resources entry.
|
||||
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
|
||||
self.device.begin_figure(xobjid, bbox, matrix)
|
||||
interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
||||
|
@ -749,6 +761,9 @@ class PDFPageInterpreter(object):
|
|||
# Render the content streams.
|
||||
# This method may be called recursively.
|
||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
|
||||
(resources, streams, ctm))
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(streams))
|
||||
|
@ -773,12 +788,12 @@ class PDFPageInterpreter(object):
|
|||
nargs = func.func_code.co_argcount-1
|
||||
if nargs:
|
||||
args = self.pop(nargs)
|
||||
if 1 <= self.debug:
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'exec: %s %r' % (name, args)
|
||||
if len(args) == nargs:
|
||||
func(*args)
|
||||
else:
|
||||
if 1 <= self.debug:
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'exec: %s' % (name)
|
||||
func()
|
||||
else:
|
||||
|
@ -793,7 +808,7 @@ class PDFPageInterpreter(object):
|
|||
##
|
||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||
|
||||
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
||||
def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''):
|
||||
doc = PDFDocument()
|
||||
parser = PDFParser(fp)
|
||||
parser.set_document(doc)
|
||||
|
@ -801,7 +816,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
|||
doc.initialize(password)
|
||||
if not doc.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
interpreter = PDFPageInterpreter(rsrc, device)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
interpreter.process_page(page)
|
||||
|
|
|
@ -94,7 +94,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
(pos, genno, use) = f
|
||||
if use != 'n': continue
|
||||
self.offsets[objid] = (int(genno), long(pos))
|
||||
if debug:
|
||||
if 1 <= debug:
|
||||
print >>stderr, 'xref objects:', self.offsets
|
||||
self.load_trailer(parser)
|
||||
return
|
||||
|
@ -178,7 +178,7 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
self.data = stream.get_data()
|
||||
self.entlen = self.fl1+self.fl2+self.fl3
|
||||
self.trailer = stream.attrs
|
||||
if debug:
|
||||
if 1 <= debug:
|
||||
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||
(', '.join(map(repr, self.objid_ranges)),
|
||||
self.fl1, self.fl2, self.fl3))
|
||||
|
@ -650,7 +650,7 @@ class PDFParser(PSStackParser):
|
|||
objlen += len(line)
|
||||
data += line
|
||||
self.seek(pos+objlen)
|
||||
if 1 <= self.debug:
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||
(pos, objlen, dic, data[:10])
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
|
|
|
@ -59,7 +59,7 @@ def main(argv):
|
|||
PDFPageInterpreter.debug = debug
|
||||
PDFDevice.debug = debug
|
||||
#
|
||||
rsrc = PDFResourceManager()
|
||||
rsrcmgr = PDFResourceManager()
|
||||
if not outtype:
|
||||
outtype = 'text'
|
||||
if outfile:
|
||||
|
@ -74,18 +74,18 @@ def main(argv):
|
|||
else:
|
||||
outfp = sys.stdout
|
||||
if outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
|
||||
elif outtype == 'xml':
|
||||
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||
else:
|
||||
return usage()
|
||||
for fname in args:
|
||||
fp = file(fname, 'rb')
|
||||
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
|
||||
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
|
||||
fp.close()
|
||||
device.close()
|
||||
outfp.close()
|
||||
|
|
Loading…
Reference in New Issue