From 94859ea42854d7c32df0d416867bc27e3772868b Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 20 Jan 2008 04:44:16 +0000 Subject: [PATCH] add non-strict mode. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@16 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfinterp.py | 103 ++++++++++++++++++++++++------------------- pdfparser.py | 120 +++++++++++++++++++++++++++++++++------------------ psparser.py | 70 ++++++++++++++++++++---------- 3 files changed, 184 insertions(+), 109 deletions(-) diff --git a/pdfinterp.py b/pdfinterp.py index 2b611ee..9f3ca8c 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -7,7 +7,7 @@ try: except ImportError: from StringIO import StringIO from psparser import PSException, PSSyntaxError, PSTypeError, \ - PSStackParser, PSLiteral, PSKeyword, \ + PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ int_value, float_value, num_value, \ @@ -84,14 +84,14 @@ class PDFFont: def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor self.widths = widths - self.fontname = descriptor['FontName'] + self.fontname = descriptor.get('FontName', 'unknown') if isinstance(self.fontname, PSLiteral): self.fontname = literal_name(self.fontname) - self.ascent = descriptor['Ascent'] - self.descent = descriptor['Descent'] + self.ascent = num_value(descriptor.get('Ascent', 0)) + self.descent = num_value(descriptor.get('Descent', 0)) self.default_width = default_width or descriptor.get('MissingWidth', 0) - self.leading = descriptor.get('Leading', 0) - self.bbox = list_value(descriptor['FontBBox']) + self.leading = num_value(descriptor.get('Leading', 0)) + self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) return def __repr__(self): @@ -155,20 +155,20 @@ class PDFSimpleFont(PDFFont): class PDFType1Font(PDFSimpleFont): def __init__(self, spec): - if 'BaseFont' not in spec: - raise PDFFontError('BaseFont is missing') - self.basefont = literal_name(spec['BaseFont']) + try: + self.basefont = literal_name(spec['BaseFont']) + except KeyError: + if STRICT: + raise PDFFontError('BaseFont is missing') + self.basefont = 'unknown' try: (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) except KeyError: - try: - descriptor = dict_value(spec['FontDescriptor']) - firstchar = int_value(spec['FirstChar']) - lastchar = int_value(spec['LastChar']) - widths = dict( (i+firstchar,w) for (i,w) - in enumerate(list_value(spec['Widths'])) ) - except KeyError, k: - raise PDFFontError('%s is missing' % k) + descriptor = dict_value(spec.get('FontDescriptor', {})) + firstchar = int_value(spec.get('FirstChar', 0)) + lastchar = int_value(spec.get('LastChar', 255)) + widths = list_value(spec.get('Widths', [0]*256)) + widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) PDFSimpleFont.__init__(self, descriptor, widths, spec) return @@ -179,13 +179,10 @@ class PDFTrueTypeFont(PDFType1Font): # PDFType3Font class PDFType3Font(PDFSimpleFont): def __init__(self, spec): - try: - firstchar = int_value(spec['FirstChar']) - lastchar = int_value(spec['LastChar']) - widths = dict( (i+firstchar,w) for (i,w) - in enumerate(list_value(spec['Widths'])) ) - except KeyError, k: - raise PDFFontError('%s is missing' % k) + firstchar = int_value(spec.get('FirstChar', 0)) + lastchar = int_value(spec.get('LastChar', 0)) + widths = list_value(spec.get('Widths', [0]*256)) + widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths)) if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) else: @@ -215,7 +212,8 @@ class TrueTypeFont: return def create_cmap(self): - if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound + if 'cmap' not in self.tables: + raise TrueTypeFont.CMapNotFound (base_offset, length) = self.tables['cmap'] fp = self.fp fp.seek(base_offset) @@ -274,15 +272,15 @@ class TrueTypeFont: class PDFCIDFont(PDFFont): def __init__(self, spec): - if 'BaseFont' not in spec: - raise PDFFontError('BaseFont is missing') try: - self.cidsysteminfo = dict_value(spec['CIDSystemInfo']) - self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'], - self.cidsysteminfo['Ordering']) + self.basefont = literal_name(spec['BaseFont']) except KeyError: - raise PDFFontError('CIDSystemInfo not properly defined.') - self.basefont = literal_name(spec['BaseFont']) + if STRICT: + raise PDFFontError('BaseFont is missing') + self.basefont = 'unknown' + self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) + self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), + self.cidsysteminfo.get('Ordering', 'unknown')) self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) descriptor = dict_value(spec['FontDescriptor']) ttf = None @@ -391,11 +389,16 @@ class PDFResourceManager: if objid and objid in self.fonts: font = self.fonts[objid] else: - assert spec['Type'] == LITERAL_FONT + if STRICT: + if spec['Type'] != LITERAL_FONT: + raise PDFFontError('Type is not /Font') # Create a Font object. - if 'Subtype' not in spec: - raise PDFFontError('Font Subtype is not specified.') - subtype = literal_name(spec['Subtype']) + if 'Subtype' in spec: + subtype = literal_name(spec['Subtype']) + else: + if STRICT: + raise PDFFontError('Font Subtype is not specified.') + subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font font = PDFType1Font(spec) @@ -411,14 +414,16 @@ class PDFResourceManager: elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) - assert len(dfonts) == 1 + assert dfonts subspec = dict_value(dfonts[0]).copy() for k in ('Encoding', 'ToUnicode'): if k in spec: subspec[k] = resolve1(spec[k]) font = self.get_font(None, subspec) else: - raise PDFFontError('Invalid Font: %r' % spec) + if STRICT: + raise PDFFontError('Invalid Font: %r' % spec) + font = PDFType1Font(spec) # this is so wrong! if objid: self.fonts[objid] = font return font @@ -480,14 +485,17 @@ class PDFContentParser(PSStackParser): objs = self.partobj (type0, self.partobj) = self.context.pop() if len(objs) % 2 != 0: - raise PSTypeError('invalid dictionary construct: %r' % objs) + if STRICT: + raise PSTypeError('invalid dictionary construct: %r' % objs) dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) pos += len('ID ') self.fp.seek(pos) - data = self.fp.read(8192) # XXX how do we know the real length other than scanning? - m = self.EOIPAT.search(data) - assert m + data = '' + while 1: + data += self.fp.read(4096) + m = self.EOIPAT.search(data) + if m: break objlen = m.start(0) obj = PDFStream(dic, data[:objlen]) self.push(obj) @@ -731,7 +739,9 @@ class PDFPageInterpreter: try: self.textstate.font = self.fontmap[literal_name(fontid)] except KeyError: - raise PDFInterpreterError('Undefined font id: %r' % fontid) + if STRICT: + raise PDFInterpreterError('Undefined font id: %r' % fontid) + return self.textstate.fontsize = fontsize return # setrendering @@ -816,7 +826,9 @@ class PDFPageInterpreter: try: xobj = stream_value(self.xobjmap[xobjid]) except KeyError: - raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) + if STRICT: + raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) + return if xobj.dic['Subtype'] == LITERAL_FORM: if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj @@ -897,7 +909,8 @@ class PDFPageInterpreter: print >>stderr, 'exec: %s' % (obj.name) func() else: - raise PDFInterpreterError('unknown operator: %r' % obj.name) + if STRICT: + raise PDFInterpreterError('unknown operator: %r' % obj.name) else: self.push(obj) return diff --git a/pdfparser.py b/pdfparser.py index 96cc542..b12742b 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -24,7 +24,7 @@ from utils import choplist, nunpack from psparser import PSException, PSSyntaxError, PSTypeError, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ - PSStackParser + PSStackParser, STRICT ## PDF Exceptions @@ -52,7 +52,8 @@ class PDFObjRef: def __init__(self, doc, objid, genno): if objid == 0: - raise PDFValueError('objid cannot be 0.') + if STRICT: + raise PDFValueError('objid cannot be 0.') self.doc = doc self.objid = objid #self.genno = genno # Never used. @@ -94,43 +95,57 @@ def resolveall(x): def int_value(x): x = resolve1(x) if not isinstance(x, int): - raise PDFTypeError('integer required: %r' % x) + if STRICT: + raise PDFTypeError('integer required: %r' % x) + return 0 return x def float_value(x): x = resolve1(x) if not isinstance(x, float): - raise PDFTypeError('float required: %r' % x) + if STRICT: + raise PDFTypeError('float required: %r' % x) + return 0.0 return x def num_value(x): x = resolve1(x) if not (isinstance(x, int) or isinstance(x, float)): - raise PDFTypeError('int or float required: %r' % x) + if STRICT: + raise PDFTypeError('int or float required: %r' % x) + return 0 return x def str_value(x): x = resolve1(x) if not isinstance(x, str): - raise PDFTypeError('string required: %r' % x) + if STRICT: + raise PDFTypeError('string required: %r' % x) + return '' return x def list_value(x): x = resolve1(x) if not (isinstance(x, list) or isinstance(x, tuple)): - raise PDFTypeError('list required: %r' % x) + if STRICT: + raise PDFTypeError('list required: %r' % x) + return [] return x def dict_value(x): x = resolve1(x) if not isinstance(x, dict): - raise PDFTypeError('dict required: %r' % x) + if STRICT: + raise PDFTypeError('dict required: %r' % x) + return {} return x def stream_value(x): x = resolve1(x) if not isinstance(x, PDFStream): - raise PDFTypeError('stream required: %r' % x) + if STRICT: + raise PDFTypeError('stream required: %r' % x) + return PDFStream({}, '') return x @@ -186,7 +201,8 @@ class PDFStream: ent0 = ent1 data = buf else: - raise PDFValueError('Invalid filter spec: %r' % f) + if STRICT: + raise PDFValueError('Invalid filter spec: %r' % f) self.data = data self.rawdata = None return @@ -235,12 +251,15 @@ class PDFXRef: while 1: (_, line) = parser.nextline() if not line: - raise PDFSyntaxError('premature eof: %r' % parser) + if STRICT: + raise PDFSyntaxError('premature eof: %r' % parser) + break line = line.strip() f = line.split(' ') if len(f) != 2: if line != 'trailer': - raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) + if STRICT: + raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) break (start, nobjs) = map(long, f) self.objid0 = start @@ -250,7 +269,9 @@ class PDFXRef: (_, line) = parser.nextline() f = line.strip().split(' ') if len(f) != 3: - raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line)) + if STRICT: + raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line)) + continue (pos, genno, use) = f self.offsets.append((int(genno), long(pos), use)) # read trailer @@ -259,10 +280,11 @@ class PDFXRef: def getpos(self, objid): if objid < self.objid0 or self.objid1 <= objid: - raise IndexError + raise IndexError(objid) (genno, pos, use) = self.offsets[objid-self.objid0] if use != 'n': - raise PDFValueError('unused objid=%r' % objid) + if STRICT: + raise PDFValueError('unused objid=%r' % objid) return (None, pos) @@ -272,7 +294,8 @@ class PDFXRefStream: def __init__(self, parser): (objid, genno, _, stream) = list_value(parser.parse()) - assert stream.dic['Type'] == LITERAL_XREF + if STRICT: + assert stream.dic['Type'] == LITERAL_XREF size = stream.dic['Size'] (start, nobjs) = stream.dic.get('Index', (0,size)) self.objid0 = start @@ -285,7 +308,7 @@ class PDFXRefStream: def getpos(self, objid): if objid < self.objid0 or self.objid1 <= objid: - raise IndexError + raise IndexError(objid) i = self.entlen * (objid-self.objid0) ent = self.data[i:i+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -334,7 +357,7 @@ class PDFDocument: return def getobj(self, objid): - assert self.xrefs + #assert self.xrefs if objid in self.objs: obj = self.objs[objid] else: @@ -345,13 +368,20 @@ class PDFDocument: except IndexError: pass else: - raise PDFValueError('Cannot locate objid=%r' % objid) + if STRICT: + raise PDFValueError('Cannot locate objid=%r' % objid) + return None if strmid: stream = stream_value(self.getobj(strmid)) if stream.dic['Type'] != LITERAL_OBJSTM: - raise PDFSyntaxError('Not a stream object: %r' % stream) - if 'N' not in stream.dic: - raise PDFSyntaxError('N is not defined: %r' % stream) + if STRICT: + raise PDFSyntaxError('Not a stream object: %r' % stream) + try: + n = stream.dic['N'] + except KeyError: + if STRICT: + raise PDFSyntaxError('N is not defined: %r' % stream) + n = 0 if strmid in self.parsed_objs: objs = self.parsed_objs[stream] else: @@ -363,8 +393,10 @@ class PDFDocument: else: prevpos = self.parser.seek(index) seq = list_value(self.parser.parse()) - if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ): - raise PDFSyntaxError('invalid stream spec: %r' % seq) + if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ): + if STRICT: + raise PDFSyntaxError('invalid stream spec: %r' % seq) + return None obj = seq[3] self.parser.seek(prevpos) if 2 <= self.debug: @@ -373,7 +405,7 @@ class PDFDocument: return obj def get_pages(self, debug=0): - assert self.xrefs + #assert self.xrefs def search(obj, parent): tree = dict_value(obj).copy() for (k,v) in parent.iteritems(): @@ -397,7 +429,8 @@ class PDFDocument: self.root = root self.catalog = dict_value(self.root) if self.catalog['Type'] != LITERAL_CATALOG: - raise PDFValueError('Catalog not found!') + if STRICT: + raise PDFValueError('Catalog not found!') self.outline = self.catalog.get('Outline') return @@ -437,24 +470,24 @@ class PDFParser(PSStackParser): # stream object (dic,) = self.pop(1) dic = dict_value(dic) - if 'Length' not in dic: - raise PDFValueError('/Length is undefined: %r' % dic) - objlen = int_value(dic['Length']) + try: + objlen = int_value(dic['Length']) + except KeyError: + if STRICT: + raise PDFValueError('/Length is undefined: %r' % dic) + objlen = 0 self.seek(pos) (_, line) = self.nextline() # 'stream' - self.fp.seek(pos+len(line)) + pos += len(line) + self.fp.seek(pos) data = self.fp.read(objlen) - self.seek(pos+len(line)+objlen) + self.seek(pos+objlen) while 1: (linepos, line) = self.nextline() - if not line: - raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' % - (linepos, line)) - if line.strip(): - if not line.startswith('endstream'): - raise PDFSyntaxError('need endstream: linepos=%d, line=%r' % - (linepos, line)) + if not line or line.startswith('endstream'): break + objlen += len(line) + data += line if 1 <= self.debug: print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10]) @@ -477,7 +510,9 @@ class PDFParser(PSStackParser): if line: prev = line else: - raise PDFSyntaxError('startxref not found!') + if STRICT: + raise PDFSyntaxError('startxref not found!') + prev = 0 if 1 <= self.debug: print >>stderr, 'xref found: pos=%r' % prev self.seek(long(prev)) @@ -495,10 +530,11 @@ class PDFParser(PSStackParser): # XRefStream: PDF-1.5 self.seek(linepos) xref = PDFXRefStream(self) - elif line.strip() != 'xref': - raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % - (linepos, line)) else: + if line.strip() != 'xref': + if STRICT: + raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % + (linepos, line)) xref = PDFXRef(self) yield xref trailer = xref.trailer diff --git a/psparser.py b/psparser.py index 181b774..9ab2f9d 100644 --- a/psparser.py +++ b/psparser.py @@ -3,6 +3,8 @@ import sys, re stderr = sys.stderr from utils import choplist +STRICT = 0 + ## PS Exceptions ## @@ -73,12 +75,18 @@ PSKeywordTable = PSSymbolTable(PSKeyword) def literal_name(x): if not isinstance(x, PSLiteral): - raise PSTypeError('literal required: %r' % x) + if STRICT: + raise PSTypeError('literal required: %r' % x) + else: + return str(x) return x.name def keyword_name(x): if not isinstance(x, PSKeyword): - raise PSTypeError('keyword required: %r' % x) + if STRICT: + raise PSTypeError('keyword required: %r' % x) + else: + return str(x) return x.name @@ -237,23 +245,30 @@ class PSBaseParser: s += s1[-1:] (linepos, line) = self.nextline() if not line: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (linepos, line)) + if STRICT: + raise PSSyntaxError('end inside string: linepos=%d, line=%r' % + (linepos, line)) + break charpos = 0 elif charpos == len(line): s += s1 (linepos, line) = self.nextline() if not line: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (linepos, line)) + if STRICT: + raise PSSyntaxError('end inside string: linepos=%d, line=%r' % + (linepos, line)) + break charpos = 0 else: s += s1 break - if line[charpos] != ')': - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (linepos, line)) - charpos += 1 + if line[charpos] == ')': + charpos += 1 + else: + if STRICT: + raise PSSyntaxError('no close paren: linepos=%d, line=%r' % + (linepos, line)) + pass def convesc(m): x = m.group(0) if x[1:].isdigit(): @@ -271,10 +286,12 @@ class PSBaseParser: # hex string object ms = self.STRING_HEX.match(line, charpos) charpos = ms.end(0) - if line[charpos] != '>': - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (linepos, line)) - charpos += 1 + if line[charpos] == '>': + charpos += 1 + else: + if STRICT: + raise PSSyntaxError('no close paren: linepos=%d, line=%r' % + (linepos, line)) def convhex(m1): return chr(int(m1.group(0), 16)) s = self.STRING_HEX_SUB.sub(convhex, ms.group(0)) @@ -341,7 +358,8 @@ class PSStackParser(PSBaseParser): Pop N objects from the stack. ''' if len(self.partobj) < n: - raise PSSyntaxError('stack too short < %d' % n) + if STRICT: + raise PSSyntaxError('stack too short < %d' % n) r = self.partobj[-n:] self.partobj = self.partobj[:-n] return r @@ -366,12 +384,18 @@ class PSStackParser(PSBaseParser): return def endobj(type1): - assert self.context + if not self.context: + if STRICT: + raise PSTypeError('stack empty.') obj = self.partobj - (type0, self.partobj) = self.context.pop() - if type0 != type1: - raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % - (type0, self.partobj, type1, obj)) + (type0, partobj) = self.context[-1] + if type0 == type1: + self.partobj = partobj + self.context.pop() + else: + if STRICT: + raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % + (type0, self.partobj, type1, obj)) return obj startobj('o') @@ -407,7 +431,8 @@ class PSStackParser(PSBaseParser): # end dictionary objs = endobj('d') if len(objs) % 2 != 0: - raise PSTypeError('invalid dictionary construct: %r' % objs) + if STRICT: + raise PSTypeError('invalid dictionary construct: %r' % objs) d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) if 2 <= self.debug: print >>stderr, 'end dict: %r' % d @@ -415,4 +440,5 @@ class PSStackParser(PSBaseParser): elif self.do_token(pos, t): break - return endobj('o') + objs = endobj('o') + return objs