From 97848409e5eb7b34d6d4f54f9b012ea3aa846744 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 24 Apr 2010 04:32:03 +0000 Subject: [PATCH] fix xobject resources bug, thanks to Jose Maria git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@209 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 24 ++++++++++++------------ pdfminer/pdfdevice.py | 4 ++-- pdfminer/pdffont.py | 6 +++--- pdfminer/pdfinterp.py | 37 ++++++++++++++++++++++++++----------- pdfminer/pdfparser.py | 6 +++--- tools/pdf2txt.py | 12 ++++++------ 6 files changed, 52 insertions(+), 37 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index f8ed681..8fd7acf 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -14,8 +14,8 @@ from utils import enc, bbox2str, create_bmp ## class PDFPageAggregator(PDFTextDevice): - def __init__(self, rsrc, pageno=1, laparams=None): - PDFTextDevice.__init__(self, rsrc) + def __init__(self, rsrcmgr, pageno=1, laparams=None): + PDFTextDevice.__init__(self, rsrcmgr) self.laparams = laparams self.pageno = pageno self.stack = [] @@ -100,8 +100,8 @@ class PDFPageAggregator(PDFTextDevice): ## class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): - PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams) + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None): + PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) self.outfp = outfp self.codec = codec return @@ -138,9 +138,9 @@ class PDFConverter(PDFPageAggregator): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, showpageno=False): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, 
laparams=laparams) self.showpageno = showpageno return @@ -169,9 +169,9 @@ class TextConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, scale=1, showpageno=True, pagepad=50, outdir=None): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.showpageno = showpageno self.pagepad = pagepad self.outdir = outdir @@ -261,8 +261,8 @@ class HTMLConverter(PDFConverter): ## class XMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): + PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.outdir = outdir self.outfp.write('\n' % codec) self.outfp.write('\n') @@ -349,8 +349,8 @@ class XMLConverter(PDFConverter): ## class TagExtractor(PDFDevice): - def __init__(self, rsrc, outfp, codec='utf-8'): - PDFDevice.__init__(self, rsrc) + def __init__(self, rsrcmgr, outfp, codec='utf-8'): + PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec self.pageno = 0 diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index ab0b829..b0b39aa 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -11,8 +11,8 @@ class PDFDevice(object): debug = 0 - def __init__(self, rsrc): - self.rsrc = rsrc + def __init__(self, rsrcmgr): + self.rsrcmgr = rsrcmgr self.ctm = None return diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 01ac3b3..10d72db 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -419,7 +419,7 @@ class PDFSimpleFont(PDFFont): # PDFType1Font class 
PDFType1Font(PDFSimpleFont): - def __init__(self, rsrc, spec): + def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: @@ -449,7 +449,7 @@ class PDFTrueTypeFont(PDFType1Font): # PDFType3Font class PDFType3Font(PDFSimpleFont): - def __init__(self, rsrc, spec): + def __init__(self, rsrcmgr, spec): firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 0)) widths = list_value(spec.get('Widths', [0]*256)) @@ -472,7 +472,7 @@ class PDFType3Font(PDFSimpleFont): # PDFCIDFont class PDFCIDFont(PDFFont): - def __init__(self, rsrc, spec): + def __init__(self, rsrcmgr, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 0695741..2b02a8e 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -157,6 +157,8 @@ class PDFResourceManager(object): if objid and objid in self.fonts: font = self.fonts[objid] else: + if 2 <= self.debug: + print >>stderr, 'get_font: create: objid=%r, spec=%r' % (objid, spec) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') @@ -297,17 +299,18 @@ class PDFPageInterpreter(object): debug = 0 - def __init__(self, rsrc, device): - self.rsrc = rsrc + def __init__(self, rsrcmgr, device): + self.rsrcmgr = rsrcmgr self.device = device return def dup(self): - return PDFPageInterpreter(self.rsrc, self.device) + return PDFPageInterpreter(self.rsrcmgr, self.device) # init_resources(resources): # Prepare the fonts and XObjects listed in the Resource attribute. 
def init_resources(self, resources): + self.resources = resources self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() @@ -324,7 +327,7 @@ class PDFPageInterpreter(object): else: return PREDEFINED_COLORSPACE[name] for (k,v) in dict_value(resources).iteritems(): - if 1 <= self.debug: + if 2 <= self.debug: print >>stderr, 'Resource: %r: %r' % (k,v) if k == 'Font': for (fontid,spec) in dict_value(v).iteritems(): @@ -332,12 +335,12 @@ class PDFPageInterpreter(object): if isinstance(spec, PDFObjRef): objid = spec.objid spec = dict_value(spec) - self.fontmap[fontid] = self.rsrc.get_font(objid, spec) + self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) elif k == 'ColorSpace': for (csid,spec) in dict_value(v).iteritems(): self.csmap[csid] = get_colorspace(resolve1(spec)) elif k == 'ProcSet': - self.rsrc.get_procset(list_value(v)) + self.rsrcmgr.get_procset(list_value(v)) elif k == 'XObject': for (xobjid,xobjstrm) in dict_value(v).iteritems(): self.xobjmap[xobjid] = xobjstrm @@ -625,6 +628,7 @@ class PDFPageInterpreter(object): try: self.textstate.font = self.fontmap[literal_name(fontid)] except KeyError: + raise if STRICT: raise PDFInterpreterError('Undefined Font id: %r' % fontid) return @@ -669,6 +673,10 @@ class PDFPageInterpreter(object): # show-pos def do_TJ(self, seq): #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) + if self.textstate.font is None: + if STRICT: + raise PDFInterpreterError('No font specified!') + return self.device.render_string(self.textstate, seq) return # show @@ -716,8 +724,12 @@ class PDFPageInterpreter(object): interpreter = self.dup() bbox = list_value(xobj['BBox']) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) + # According to PDF reference 1.7 section 4.9.1, XObjects in + # earlier PDFs (prior to v1.2) use the page's Resources entry + # instead of having their own Resources entry. 
+ resources = dict_value(xobj.get('Resources')) or self.resources.copy() self.device.begin_figure(xobjid, bbox, matrix) - interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) + interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) @@ -749,6 +761,9 @@ class PDFPageInterpreter(object): # Render the content streams. # This method may be called recursively. def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): + if 1 <= self.debug: + print >>stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' % + (resources, streams, ctm)) self.init_resources(resources) self.init_state(ctm) self.execute(list_value(streams)) @@ -773,12 +788,12 @@ class PDFPageInterpreter(object): nargs = func.func_code.co_argcount-1 if nargs: args = self.pop(nargs) - if 1 <= self.debug: + if 2 <= self.debug: print >>stderr, 'exec: %s %r' % (name, args) if len(args) == nargs: func(*args) else: - if 1 <= self.debug: + if 2 <= self.debug: print >>stderr, 'exec: %s' % (name) func() else: @@ -793,7 +808,7 @@ class PDFPageInterpreter(object): ## class PDFTextExtractionNotAllowed(PDFInterpreterError): pass -def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''): +def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password=''): doc = PDFDocument() parser = PDFParser(fp) parser.set_document(doc) @@ -801,7 +816,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''): doc.initialize(password) if not doc.is_extractable: raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) - interpreter = PDFPageInterpreter(rsrc, device) + interpreter = PDFPageInterpreter(rsrcmgr, device) for (pageno,page) in enumerate(doc.get_pages()): if pagenos and (pageno not in pagenos): continue 
interpreter.process_page(page) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 87e6d9e..a7167a2 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -94,7 +94,7 @@ class PDFXRef(PDFBaseXRef): (pos, genno, use) = f if use != 'n': continue self.offsets[objid] = (int(genno), long(pos)) - if debug: + if 1 <= debug: print >>stderr, 'xref objects:', self.offsets self.load_trailer(parser) return @@ -178,7 +178,7 @@ class PDFXRefStream(PDFBaseXRef): self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs - if debug: + if 1 <= debug: print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % (', '.join(map(repr, self.objid_ranges)), self.fl1, self.fl2, self.fl3)) @@ -650,7 +650,7 @@ class PDFParser(PSStackParser): objlen += len(line) data += line self.seek(pos+objlen) - if 1 <= self.debug: + if 2 <= self.debug: print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 13b4758..d3ad816 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -59,7 +59,7 @@ def main(argv): PDFPageInterpreter.debug = debug PDFDevice.debug = debug # - rsrc = PDFResourceManager() + rsrcmgr = PDFResourceManager() if not outtype: outtype = 'text' if outfile: @@ -74,18 +74,18 @@ def main(argv): else: outfp = sys.stdout if outtype == 'text': - device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) + device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) elif outtype == 'xml': - device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir) + device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir) + device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, 
outdir=outdir) elif outtype == 'tag': - device = TagExtractor(rsrc, outfp, codec=codec) + device = TagExtractor(rsrcmgr, outfp, codec=codec) else: return usage() for fname in args: fp = file(fname, 'rb') - process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password) + process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password) fp.close() device.close() outfp.close()