Fix XObject resources bug: form XObjects without their own Resources entry now fall back to the page's resources (thanks to Jose Maria)

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@209 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-04-24 04:32:03 +00:00
parent 34665b7374
commit 97848409e5
6 changed files with 52 additions and 37 deletions

View File

@ -14,8 +14,8 @@ from utils import enc, bbox2str, create_bmp
## ##
class PDFPageAggregator(PDFTextDevice): class PDFPageAggregator(PDFTextDevice):
def __init__(self, rsrc, pageno=1, laparams=None): def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrc) PDFTextDevice.__init__(self, rsrcmgr)
self.laparams = laparams self.laparams = laparams
self.pageno = pageno self.pageno = pageno
self.stack = [] self.stack = []
@ -100,8 +100,8 @@ class PDFPageAggregator(PDFTextDevice):
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams) PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
@ -138,9 +138,9 @@ class PDFConverter(PDFPageAggregator):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False): showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
return return
@ -169,9 +169,9 @@ class TextConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50, outdir=None): scale=1, showpageno=True, pagepad=50, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.outdir = outdir self.outdir = outdir
@ -261,8 +261,8 @@ class HTMLConverter(PDFConverter):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir self.outdir = outdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec) self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
@ -349,8 +349,8 @@ class XMLConverter(PDFConverter):
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8'): def __init__(self, rsrcmgr, outfp, codec='utf-8'):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrcmgr)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.pageno = 0 self.pageno = 0

View File

@ -11,8 +11,8 @@ class PDFDevice(object):
debug = 0 debug = 0
def __init__(self, rsrc): def __init__(self, rsrcmgr):
self.rsrc = rsrc self.rsrcmgr = rsrcmgr
self.ctm = None self.ctm = None
return return

View File

@ -419,7 +419,7 @@ class PDFSimpleFont(PDFFont):
# PDFType1Font # PDFType1Font
class PDFType1Font(PDFSimpleFont): class PDFType1Font(PDFSimpleFont):
def __init__(self, rsrc, spec): def __init__(self, rsrcmgr, spec):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:
@ -449,7 +449,7 @@ class PDFTrueTypeFont(PDFType1Font):
# PDFType3Font # PDFType3Font
class PDFType3Font(PDFSimpleFont): class PDFType3Font(PDFSimpleFont):
def __init__(self, rsrc, spec): def __init__(self, rsrcmgr, spec):
firstchar = int_value(spec.get('FirstChar', 0)) firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 0)) lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256)) widths = list_value(spec.get('Widths', [0]*256))
@ -472,7 +472,7 @@ class PDFType3Font(PDFSimpleFont):
# PDFCIDFont # PDFCIDFont
class PDFCIDFont(PDFFont): class PDFCIDFont(PDFFont):
def __init__(self, rsrc, spec): def __init__(self, rsrcmgr, spec):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:

View File

@ -157,6 +157,8 @@ class PDFResourceManager(object):
if objid and objid in self.fonts: if objid and objid in self.fonts:
font = self.fonts[objid] font = self.fonts[objid]
else: else:
if 2 <= self.debug:
print >>stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec)
if STRICT: if STRICT:
if spec['Type'] is not LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font') raise PDFFontError('Type is not /Font')
@ -297,17 +299,18 @@ class PDFPageInterpreter(object):
debug = 0 debug = 0
def __init__(self, rsrc, device): def __init__(self, rsrcmgr, device):
self.rsrc = rsrc self.rsrcmgr = rsrcmgr
self.device = device self.device = device
return return
def dup(self): def dup(self):
return PDFPageInterpreter(self.rsrc, self.device) return PDFPageInterpreter(self.rsrcmgr, self.device)
# init_resources(resources): # init_resources(resources):
# Prepare the fonts and XObjects listed in the Resource attribute. # Prepare the fonts and XObjects listed in the Resource attribute.
def init_resources(self, resources): def init_resources(self, resources):
self.resources = resources
self.fontmap = {} self.fontmap = {}
self.xobjmap = {} self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy() self.csmap = PREDEFINED_COLORSPACE.copy()
@ -324,7 +327,7 @@ class PDFPageInterpreter(object):
else: else:
return PREDEFINED_COLORSPACE[name] return PREDEFINED_COLORSPACE[name]
for (k,v) in dict_value(resources).iteritems(): for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug: if 2 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v) print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font': if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems(): for (fontid,spec) in dict_value(v).iteritems():
@ -332,12 +335,12 @@ class PDFPageInterpreter(object):
if isinstance(spec, PDFObjRef): if isinstance(spec, PDFObjRef):
objid = spec.objid objid = spec.objid
spec = dict_value(spec) spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace': elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems(): for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec)) self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet': elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v)) self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject': elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems(): for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm self.xobjmap[xobjid] = xobjstrm
@ -625,6 +628,7 @@ class PDFPageInterpreter(object):
try: try:
self.textstate.font = self.fontmap[literal_name(fontid)] self.textstate.font = self.fontmap[literal_name(fontid)]
except KeyError: except KeyError:
raise
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid) raise PDFInterpreterError('Undefined Font id: %r' % fontid)
return return
@ -669,6 +673,10 @@ class PDFPageInterpreter(object):
# show-pos # show-pos
def do_TJ(self, seq): def do_TJ(self, seq):
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
if self.textstate.font is None:
if STRICT:
raise PDFInterpreterError('No font specified!')
return
self.device.render_string(self.textstate, seq) self.device.render_string(self.textstate, seq)
return return
# show # show
@ -716,8 +724,12 @@ class PDFPageInterpreter(object):
interpreter = self.dup() interpreter = self.dup()
bbox = list_value(xobj['BBox']) bbox = list_value(xobj['BBox'])
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
# According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry.
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix) self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
@ -749,6 +761,9 @@ class PDFPageInterpreter(object):
# Render the content streams. # Render the content streams.
# This method may be called recursively. # This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
if 1 <= self.debug:
print >>stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm))
self.init_resources(resources) self.init_resources(resources)
self.init_state(ctm) self.init_state(ctm)
self.execute(list_value(streams)) self.execute(list_value(streams))
@ -773,12 +788,12 @@ class PDFPageInterpreter(object):
nargs = func.func_code.co_argcount-1 nargs = func.func_code.co_argcount-1
if nargs: if nargs:
args = self.pop(nargs) args = self.pop(nargs)
if 1 <= self.debug: if 2 <= self.debug:
print >>stderr, 'exec: %s %r' % (name, args) print >>stderr, 'exec: %s %r' % (name, args)
if len(args) == nargs: if len(args) == nargs:
func(*args) func(*args)
else: else:
if 1 <= self.debug: if 2 <= self.debug:
print >>stderr, 'exec: %s' % (name) print >>stderr, 'exec: %s' % (name)
func() func()
else: else:
@ -793,7 +808,7 @@ class PDFPageInterpreter(object):
## ##
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''): def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument() doc = PDFDocument()
parser = PDFParser(fp) parser = PDFParser(fp)
parser.set_document(doc) parser.set_document(doc)
@ -801,7 +816,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc.initialize(password) doc.initialize(password)
if not doc.is_extractable: if not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
interpreter = PDFPageInterpreter(rsrc, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for (pageno,page) in enumerate(doc.get_pages()): for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page) interpreter.process_page(page)

View File

@ -94,7 +94,7 @@ class PDFXRef(PDFBaseXRef):
(pos, genno, use) = f (pos, genno, use) = f
if use != 'n': continue if use != 'n': continue
self.offsets[objid] = (int(genno), long(pos)) self.offsets[objid] = (int(genno), long(pos))
if debug: if 1 <= debug:
print >>stderr, 'xref objects:', self.offsets print >>stderr, 'xref objects:', self.offsets
self.load_trailer(parser) self.load_trailer(parser)
return return
@ -178,7 +178,7 @@ class PDFXRefStream(PDFBaseXRef):
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs self.trailer = stream.attrs
if debug: if 1 <= debug:
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges)), (', '.join(map(repr, self.objid_ranges)),
self.fl1, self.fl2, self.fl3)) self.fl1, self.fl2, self.fl3))
@ -650,7 +650,7 @@ class PDFParser(PSStackParser):
objlen += len(line) objlen += len(line)
data += line data += line
self.seek(pos+objlen) self.seek(pos+objlen)
if 1 <= self.debug: if 2 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10]) (pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher) obj = PDFStream(dic, data, self.doc.decipher)

View File

@ -59,7 +59,7 @@ def main(argv):
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
rsrc = PDFResourceManager() rsrcmgr = PDFResourceManager()
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
if outfile: if outfile:
@ -74,18 +74,18 @@ def main(argv):
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir) device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir) device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else:
return usage() return usage()
for fname in args: for fname in args:
fp = file(fname, 'rb') fp = file(fname, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password) process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
fp.close() fp.close()
device.close() device.close()
outfp.close() outfp.close()