From 366143361fe1a3f83932b7a7e95cd666c690ff2c Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 3 Feb 2008 09:36:34 +0000 Subject: [PATCH] Restructuring core lexical handlings. Fix several bugs. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@17 1aa58f4a-7d42-0410-adbc-911cccaed67c --- cmap.py | 83 +++--- pdf2txt.py | 12 +- pdfinterp.py | 227 ++++++++++------ pdfparser.py | 116 ++++---- psparser.py | 728 +++++++++++++++++++++++++++++++-------------------- 5 files changed, 712 insertions(+), 454 deletions(-) diff --git a/cmap.py b/cmap.py index 9d5285f..432ca64 100644 --- a/cmap.py +++ b/cmap.py @@ -3,7 +3,7 @@ import sys stderr = sys.stderr from struct import pack, unpack from utils import choplist, nunpack -from psparser import PSException, PSSyntaxError, PSTypeError, \ +from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteral, PSKeyword, literal_name, keyword_name, \ PSStackParser try: @@ -39,17 +39,17 @@ class CMap: return self def register_code2cid(self, code, cid): - assert isinstance(code, str) - assert isinstance(cid, int) - self.code2cid[code] = cid + if isinstance(code, str) and isinstance(cid, int): + self.code2cid[code] = cid return self def register_cid2code(self, cid, code): from glyphlist import charname2unicode - assert isinstance(cid, int) - if isinstance(code, PSLiteral): - code = pack('>H', charname2unicode[code.name]) - self.cid2code[cid] = code + if isinstance(cid, int): + if isinstance(code, PSLiteral): + self.cid2code[cid] = pack('>H', charname2unicode[code.name]) + elif isinstance(code, str): + self.cid2code[cid] = code return self def decode(self, bytes): @@ -195,7 +195,7 @@ class CMapDB: print >>stderr, 'Reading: CMap %r...' % fname cmap = CMap() fp = file(fname) - CMapParser(cmap, fp).parse() + CMapParser(cmap, fp, debug=klass.debug).run() fp.close() else: raise KeyError(cmapname) @@ -213,7 +213,14 @@ class CMapParser(PSStackParser): self.in_cmap = False return - def do_token(self, _, token): + def run(self): + try: + self.nextobject() + except PSEOF: + pass + return + + def do_keyword(self, pos, token): name = token.name if name == 'begincmap': self.in_cmap = True @@ -226,15 +233,15 @@ class CMapParser(PSStackParser): # if name == 'def': try: - (k,v) = self.pop(2) - self.cmap.attrs[literal_name(k)] = v + ((_,k),(_,v)) = self.pop(2) + self.cmap.attrs[str(k)] = v except PSSyntaxError: pass return if name == 'usecmap': try: - (cmapname,) = self.pop(1) + ((_,cmapname),) = self.pop(1) self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) except PSSyntaxError: pass @@ -244,8 +251,6 @@ class CMapParser(PSStackParser): self.popall() return if name == 'endcodespacerange': - if 1 <= self.debug: - print >>stderr, 'codespace: %r' % self.partobj self.popall() return @@ -253,48 +258,45 @@ class CMapParser(PSStackParser): self.popall() return if name == 'endcidrange': - for (s,e,cid) in choplist(3, self.partobj): - assert isinstance(s, str) - assert isinstance(e, str) - assert isinstance(cid, int) - assert len(s) == len(e) + objs = [ obj for (_,obj) in self.popall() ] + for (s,e,cid) in choplist(3, objs): + if (not isinstance(s, str) or not isinstance(e, str) or + not isinstance(cid, int) or len(s) != len(e)): continue sprefix = s[:-4] eprefix = e[:-4] - assert sprefix == eprefix + if sprefix != eprefix: continue svar = s[-4:] evar = e[-4:] s1 = nunpack(svar) e1 = nunpack(evar) vlen = len(svar) - assert s1 <= e1 + #assert s1 <= e1 for i in xrange(e1-s1+1): x = sprefix+pack('>L',s1+i)[-vlen:] self.cmap.register_code2cid(x, cid+i) - self.popall() return if name == 'begincidchar': self.popall() return if name == 'endcidchar': - for (cid,code) in choplist(2, self.partobj): - assert isinstance(code, str) - assert isinstance(cid, str) - self.cmap.register_code2cid(code, nunpack(cid)) - self.popall() + objs = [ obj for (_,obj) in self.popall() ] + for (cid,code) in choplist(2, objs): + if isinstance(code, str) and isinstance(cid, str): + self.cmap.register_code2cid(code, nunpack(cid)) return if name == 'beginbfrange': self.popall() return if name == 'endbfrange': - for (s,e,code) in choplist(3, self.partobj): - assert isinstance(s, str) - assert isinstance(e, str) - assert len(s) == len(e) + objs = [ obj for (_,obj) in self.popall() ] + for (s,e,code) in choplist(3, objs): + if (not isinstance(s, str) or not isinstance(e, str) or + len(s) != len(e)): continue s1 = nunpack(s) e1 = nunpack(e) - assert s1 <= e1 + #assert s1 <= e1 if isinstance(code, list): for i in xrange(e1-s1+1): self.cmap.register_cid2code(s1+i, code[i]) @@ -306,29 +308,26 @@ class CMapParser(PSStackParser): for i in xrange(e1-s1+1): x = prefix+pack('>L',base+i)[-vlen:] self.cmap.register_cid2code(s1+i, x) - self.popall() return if name == 'beginbfchar': self.popall() return if name == 'endbfchar': - for (cid,code) in choplist(2, self.partobj): - assert isinstance(cid, str) - assert isinstance(code, str) - self.cmap.register_cid2code(nunpack(cid), code) - self.popall() + objs = [ obj for (_,obj) in self.popall() ] + for (cid,code) in choplist(2, objs): + if isinstance(cid, str) and isinstance(code, str): + self.cmap.register_cid2code(nunpack(cid), code) return if name == 'beginnotdefrange': self.popall() return if name == 'endnotdefrange': - if 1 <= self.debug: - print >>stderr, 'notdefrange: %r' % self.partobj self.popall() return - + + self.push((pos, token)) return diff --git a/pdf2txt.py b/pdf2txt.py index fd61d0e..9180067 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -13,8 +13,8 @@ from cmap import CMapDB ## class TextConverter(PDFDevice): - def __init__(self, outfp, rsrc, codec): - PDFDevice.__init__(self, rsrc) + def __init__(self, outfp, rsrc, codec, debug=0): + PDFDevice.__init__(self, rsrc, debug=debug) self.outfp = outfp self.codec = codec return @@ -22,7 +22,7 @@ class TextConverter(PDFDevice): def close(self): self.outfp.write('\n') return - + def begin_page(self, page): (x0,y0,x1,y1) = page.mediabox self.outfp.write('' % @@ -42,6 +42,10 @@ class TextConverter(PDFDevice): return def handle_undefined_char(self, cidcoding, cid): + if self.debug: + print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) + #return unichr(cid) + #return unichr(cid+32) return def render_string(self, textstate, textmatrix, size, seq): @@ -81,7 +85,7 @@ class TextConverter(PDFDevice): # pdf2txt def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): - device = TextConverter(outfp, rsrc, codec) + device = TextConverter(outfp, rsrc, codec, debug=debug) outfp.write('\n') doc = PDFDocument(debug=debug) fp = file(fname) diff --git a/pdfinterp.py b/pdfinterp.py index 9f3ca8c..7aa3f22 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -6,7 +6,7 @@ try: from cStringIO import StringIO except ImportError: from StringIO import StringIO -from psparser import PSException, PSSyntaxError, PSTypeError, \ +from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ @@ -45,6 +45,8 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') +KEYWORD_BI = PSKeywordTable.intern('BI') +KEYWORD_ID = PSKeywordTable.intern('ID') KEYWORD_EI = PSKeywordTable.intern('EI') MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) @@ -134,7 +136,7 @@ class PDFSimpleFont(PDFFont): if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() PDFFont.__init__(self, descriptor, widths) return @@ -292,7 +294,7 @@ class PDFCIDFont(PDFFont): if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() elif self.cidcoding == 'Adobe-Identity': if ttf: try: @@ -433,8 +435,9 @@ class PDFResourceManager: ## class PDFDevice: - def __init__(self, rsrc): + def __init__(self, rsrc, debug=0): self.rsrc = rsrc + self.debug = debug self.ctm = None return @@ -465,47 +468,91 @@ class PDFDevice: ## class PDFContentParser(PSStackParser): - def __init__(self, fp, debug=0): - PSStackParser.__init__(self, fp, debug=debug) + def __init__(self, streams, debug=0): + self.streams = streams + self.istream = 0 + PSStackParser.__init__(self, None, debug=debug) return def __repr__(self): return '' % self.linepos - EOIPAT = re.compile(r'\nEI\W') - def do_token(self, pos, token): - name = keyword_name(token) + def fillfp(self): + if not self.fp: + if self.istream < len(self.streams): + strm = stream_value(self.streams[self.istream]) + self.istream += 1 + else: + raise PSEOF + self.fp = StringIO(strm.get_data()) + return - if name == 'BI': + def seek(self, pos): + self.fillfp() + PSStackParser.seek(self, pos) + return + + def fillbuf(self): + if self.charpos < len(self.buf): return + while 1: + self.fillfp() + self.bufpos = self.fp.tell() + self.buf = self.fp.read(self.BUFSIZ) + if self.buf: break + self.fp = None + self.charpos = 0 + return + + def get_inline_data(self, pos, target='EI '): + self.seek(pos) + i = 0 + data = '' + while i < len(target): + self.fillbuf() + if i: + c = self.buf[self.charpos] + data += c + self.charpos += 1 + if c == target[i]: + i += 1 + else: + i = 0 + else: + try: + j = self.buf.index(target[0], self.charpos) + #print 'found', (0, self.buf[j:j+10]) + data += self.buf[self.charpos:j] + self.charpos = j+1 + i = 1 + except ValueError: + data += self.buf[self.charpos:] + self.charpos = len(self.buf) + data = data[:-len(target)] # strip the last part + return (pos, data) + + def flush(self): + self.add_results(*self.popall()) + return + + def do_keyword(self, pos, token): + if token == KEYWORD_BI: # inline image within a content stream - self.context.append(('BI', self.partobj)) - self.partobj = [] - - elif name == 'ID': - objs = self.partobj - (type0, self.partobj) = self.context.pop() - if len(objs) % 2 != 0: - if STRICT: + self.start_type(pos, 'inline') + elif token == KEYWORD_ID: + try: + (_, objs) = self.end_type('inline') + if len(objs) % 2 != 0: raise PSTypeError('invalid dictionary construct: %r' % objs) - dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) - pos += len('ID ') - self.fp.seek(pos) - # XXX how do we know the real length other than scanning? - data = '' - while 1: - data += self.fp.read(4096) - m = self.EOIPAT.search(data) - if m: break - objlen = m.start(0) - obj = PDFStream(dic, data[:objlen]) - self.push(obj) - self.seek(pos+objlen+len('\nEI')) - self.push(KEYWORD_EI) - + d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) + (pos, data) = self.get_inline_data(pos+len('ID ')) + obj = PDFStream(d, data) + self.push((pos, obj)) + self.push((pos, KEYWORD_EI)) + except PSTypeError: + if STRICT: raise else: - self.push(token) - - return False + self.push((pos, token)) + return ## Interpreter @@ -542,10 +589,44 @@ class PDFPageInterpreter: self.debug = debug return - def initpage(self, ctm): + def init_resources(self, resources): self.fontmap = {} self.xobjmap = {} self.csmap = PREDEFINED_COLORSPACE.copy() + # Handle resource declarations. + def get_colorspace(spec): + if isinstance(spec, list): + name = literal_name(spec[0]) + else: + name = literal_name(spec) + if name == 'ICCBased': + return ColorSpace(name, stream_value(spec[1]).dic['N']) + elif name == 'DeviceN': + return ColorSpace(name, len(list_value(spec[1]))) + else: + return PREDEFINED_COLORSPACE[name] + if resources: + for (k,v) in dict_value(resources).iteritems(): + if 1 <= self.debug: + print >>stderr, 'Resource: %r: %r' % (k,v) + if k == 'Font': + for (fontid,spec) in dict_value(v).iteritems(): + objid = None + if isinstance(spec, PDFObjRef): + objid = spec.objid + spec = dict_value(spec) + self.fontmap[fontid] = self.rsrc.get_font(objid, spec) + elif k == 'ColorSpace': + for (csid,spec) in dict_value(v).iteritems(): + self.csmap[csid] = get_colorspace(resolve1(spec)) + elif k == 'ProcSet': + self.rsrc.get_procset(list_value(v)) + elif k == 'XObject': + for (xobjid,xobjstrm) in dict_value(v).iteritems(): + self.xobjmap[xobjid] = xobjstrm + return + + def init_state(self, ctm): # gstack: stack for graphical states. self.gstack = [] self.ctm = ctm @@ -554,8 +635,9 @@ class PDFPageInterpreter: # argstack: stack for command arguments. self.argstack = [] # set some global states. - self.scs = None - self.ncs = None + self.scs = self.ncs = None + if self.csmap: + self.scs = self.ncs = self.csmap.values()[0] return def push(self, obj): @@ -683,10 +765,22 @@ class PDFPageInterpreter: # setcolor def do_SCN(self): - self.pop(self.scs.ncomponents) + if self.scs: + n = self.scs.ncomponents + else: + if STRICT: + raise PDFInterpreterError('no colorspace specified!') + n = 1 + self.pop(n) return def do_scn(self): - self.pop(self.ncs.ncomponents) + if self.ncs: + n = self.ncs.ncomponents + else: + if STRICT: + raise PDFInterpreterError('no colorspace specified!') + n = 1 + self.pop(n) return def do_SC(self): self.do_SCN() @@ -839,8 +933,7 @@ class PDFPageInterpreter: (x1,y1) = apply_matrix(ctm, (x1,y1)) bbox = (x0,y0,x1,y1) self.device.begin_figure(xobjid, bbox) - interpreter.render_contents(xobj.dic.get('Resources'), - [xobj], ctm=ctm) + interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm) self.device.end_figure(xobjid) return @@ -853,46 +946,18 @@ class PDFPageInterpreter: return def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): - self.initpage(ctm) - # Handle resource declarations. - def get_colorspace(spec): - if isinstance(spec, list): - name = literal_name(spec[0]) - else: - name = literal_name(spec) - if name == 'ICCBased': - return ColorSpace(name, stream_value(spec[1]).dic['N']) - elif name == 'DeviceN': - return ColorSpace(name, len(list_value(cs[1]))) - else: - return PREDEFINED_COLORSPACE[name] - if resources: - for (k,v) in dict_value(resources).iteritems(): - if 1 <= self.debug: - print >>stderr, 'Resource: %r: %r' % (k,v) - if k == 'Font': - for (fontid,spec) in dict_value(v).iteritems(): - objid = None - if isinstance(spec, PDFObjRef): - objid = spec.objid - spec = dict_value(spec) - self.fontmap[fontid] = self.rsrc.get_font(objid, spec) - elif k == 'ColorSpace': - for (csid,spec) in dict_value(v).iteritems(): - self.csmap[csid] = get_colorspace(resolve1(spec)) - elif k == 'ProcSet': - self.rsrc.get_procset(list_value(v)) - elif k == 'XObject': - for (xobjid,xobjstrm) in dict_value(v).iteritems(): - self.xobjmap[xobjid] = xobjstrm - data = ''.join( stream_value(stream).get_data() - for stream in list_value(contents) ) - self.execute(data) + self.init_resources(resources) + self.init_state(ctm) + self.execute(list_value(contents)) return - def execute(self, data): - parser = PDFContentParser(StringIO(data), debug=self.debug) - for obj in parser.parse(): + def execute(self, streams): + parser = PDFContentParser(streams, debug=self.debug) + while 1: + try: + (_,obj) = parser.nextobject() + except PSEOF: + break if isinstance(obj, PSKeyword): name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q') if hasattr(self, name): diff --git a/pdfparser.py b/pdfparser.py index b12742b..e0c9df0 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -14,14 +14,10 @@ # - Linearized PDF. # - Encryption? -import sys, re -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO +import sys stderr = sys.stderr from utils import choplist, nunpack -from psparser import PSException, PSSyntaxError, PSTypeError, \ +from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ PSStackParser, STRICT @@ -43,14 +39,19 @@ LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') +KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_OBJ = PSKeywordTable.intern('obj') +KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') +KEYWORD_STREAM = PSKeywordTable.intern('stream') +KEYWORD_XREF = PSKeywordTable.intern('xref') +KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') ## PDFObjRef ## class PDFObjRef: - def __init__(self, doc, objid, genno): + def __init__(self, doc, objid, _): if objid == 0: if STRICT: raise PDFValueError('objid cannot be 0.') @@ -275,7 +276,8 @@ class PDFXRef: (pos, genno, use) = f self.offsets.append((int(genno), long(pos), use)) # read trailer - self.trailer = dict_value(parser.parse()[0]) + (_, dic) = parser.nextobject() + self.trailer = dict_value(dic) return def getpos(self, objid): @@ -293,9 +295,13 @@ class PDFXRef: class PDFXRefStream: def __init__(self, parser): - (objid, genno, _, stream) = list_value(parser.parse()) + (_,objid) = parser.nextobject() + (_,genno) = parser.nextobject() + parser.nextobject() + (_,stream) = parser.nextobject() if STRICT: - assert stream.dic['Type'] == LITERAL_XREF + if stream.dic['Type'] != LITERAL_XREF: + raise PDFSyntaxError('invalid stream spec.') size = stream.dic['Size'] (start, nobjs) = stream.dic.get('Index', (0,size)) self.objid0 = start @@ -385,20 +391,24 @@ class PDFDocument: if strmid in self.parsed_objs: objs = self.parsed_objs[stream] else: - parser = PDFParser(self, StringIO(stream.get_data()), - debug=self.debug) - objs = list(parser.parse()) + parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug) + objs = [] + try: + while 1: + (_,obj) = parser.nextobject() + objs.append(obj) + except PSEOF: + pass self.parsed_objs[stream] = objs obj = objs[stream.dic['N']*2+index] else: - prevpos = self.parser.seek(index) - seq = list_value(self.parser.parse()) - if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ): - if STRICT: - raise PDFSyntaxError('invalid stream spec: %r' % seq) - return None - obj = seq[3] - self.parser.seek(prevpos) + self.parser.seek(index) + (_,objid1) = self.parser.nextobject() # objid + (_,genno1) = self.parser.nextobject() # genno + (_,kwd) = self.parser.nextobject() + if kwd != KEYWORD_OBJ: + raise PDFSyntaxError('invalid obj spec: offset=%r' % index) + (_,obj) = self.parser.nextobject() if 2 <= self.debug: print >>stderr, 'register: objid=%r: %r' % (objid, obj) self.objs[objid] = obj @@ -446,29 +456,30 @@ class PDFParser(PSStackParser): return def __repr__(self): - return '' % self.linepos + return '' - EOIPAT = re.compile(r'\nEI\W') - def do_token(self, pos, token): - name = keyword_name(token) - if name in ('xref', 'trailer', 'startxref', 'endobj'): - return True - - if name == 'R': + def do_keyword(self, pos, token): + if token in (KEYWORD_XREF, KEYWORD_STARTXREF): + self.add_results(*self.pop(1)) + return + if token == KEYWORD_ENDOBJ: + self.add_results(*self.pop(4)) + return + + if token == KEYWORD_R: # reference to indirect object try: - (objid, genno) = self.pop(2) + ((_,objid), (_,genno)) = self.pop(2) (objid, genno) = (int(objid), int(genno)) obj = PDFObjRef(self.doc, objid, genno) - self.push(obj) - if 2 <= self.debug: - print >>stderr, 'refer obj: %r' % obj + self.push((pos, obj)) except PSSyntaxError: pass + return - elif name == 'stream': + if token == KEYWORD_STREAM: # stream object - (dic,) = self.pop(1) + ((_,dic),) = self.pop(1) dic = dict_value(dic) try: objlen = int_value(dic['Length']) @@ -484,20 +495,19 @@ class PDFParser(PSStackParser): self.seek(pos+objlen) while 1: (linepos, line) = self.nextline() - if not line or line.startswith('endstream'): - break + if line.startswith('endstream'): break objlen += len(line) data += line if 1 <= self.debug: print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10]) obj = PDFStream(dic, data, self.doc.decipher) - self.push(obj) - - else: - self.push(token) - - return False + self.push((pos, obj)) + return + + # others + self.push((pos, token)) + return def find_xref(self): # find the first xref table @@ -505,7 +515,7 @@ class PDFParser(PSStackParser): for line in self.revreadlines(): line = line.strip() if 2 <= self.debug: - print >>stderr, 'line: %r' % line + print >>stderr, 'find_xref: %r' % line if line == 'startxref': break if line: prev = line @@ -525,10 +535,11 @@ class PDFParser(PSStackParser): # read xref table (linepos, line) = self.nextline() if 2 <= self.debug: - print >>stderr, 'line: %r' % line + print >>stderr, 'read_xref: %r' % line if line[0].isdigit(): # XRefStream: PDF-1.5 self.seek(linepos) + self.reset() xref = PDFXRefStream(self) else: if line.strip() != 'xref': @@ -551,3 +562,18 @@ class PDFParser(PSStackParser): else: break return + +## PDFObjStrmParser +## +class PDFObjStrmParser(PDFParser): + def __init__(self, doc, data, debug=0): + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + PDFParser.__init__(self, doc, StringIO(data), debug=debug) + return + + def flush(self): + self.add_results(*self.popall()) + return diff --git a/psparser.py b/psparser.py index 9ab2f9d..49893be 100644 --- a/psparser.py +++ b/psparser.py @@ -3,12 +3,13 @@ import sys, re stderr = sys.stderr from utils import choplist -STRICT = 0 +STRICT = 1 ## PS Exceptions ## class PSException(Exception): pass +class PSEOF(PSException): pass class PSSyntaxError(PSException): pass class PSTypeError(PSException): pass class PSValueError(PSException): pass @@ -71,6 +72,14 @@ class PSSymbolTable: PSLiteralTable = PSSymbolTable(PSLiteral) PSKeywordTable = PSSymbolTable(PSKeyword) +LIT = PSLiteralTable.intern +KWD = PSKeywordTable.intern +KEYWORD_BRACE_BEGIN = KWD('{') +KEYWORD_BRACE_END = KWD('}') +KEYWORD_ARRAY_BEGIN = KWD('[') +KEYWORD_ARRAY_END = KWD(']') +KEYWORD_DICT_BEGIN = KWD('<<') +KEYWORD_DICT_END = KWD('>>') def literal_name(x): @@ -92,72 +101,288 @@ def keyword_name(x): ## PSBaseParser ## +EOL = re.compile(r'[\r\n]') +SPC = re.compile(r'\s') +NONSPC = re.compile(r'\S') +HEX = re.compile(r'[0-9a-fA-F]') +END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]') +END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]') +HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.') +END_NUMBER = re.compile(r'[^0-9]') +END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') +END_STRING = re.compile(r'[()\134]') +OCT_STRING = re.compile(r'[0-7]') +ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } class PSBaseParser: ''' Most basic PostScript parser that performs only basic tokenization. ''' - + BUFSIZ = 4096 + def __init__(self, fp, debug=0): self.fp = fp self.debug = debug - self.bufsize = 4096 - self.strfilter = None self.seek(0) return def __repr__(self): - return '' % (self.fp,) + return '' % (self.fp, self.bufpos) + + def tell(self): + return self.fp.tell() + + def poll(self, pos=None, n=80): + pos0 = self.fp.tell() + if not pos: + pos = self.bufpos+self.charpos + self.fp.seek(pos) + print >>stderr, 'poll(%d): %r' % (pos, self.fp.read(n)) + self.fp.seek(pos0) + return def seek(self, pos): ''' Seeks the parser to the given position. ''' if 2 <= self.debug: - print >>stderr, 'seek:', pos - prevpos = self.fp.tell() + print >>stderr, 'seek: %r' % pos self.fp.seek(pos) - self.linebuf = None # line buffer. - self.curpos = 0 # current position in the buffer. - self.linepos = pos # the beginning of the current line. - self.go = False - return prevpos + # reset the status for nextline() + self.bufpos = pos + self.buf = '' + self.charpos = 0 + # reset the status for nexttoken() + self.parse1 = self.parse_main + self.tokens = [] + return + + def fillbuf(self): + if self.charpos < len(self.buf): return + # fetch next chunk. + self.bufpos = self.fp.tell() + self.buf = self.fp.read(self.BUFSIZ) + if not self.buf: + raise PSEOF + self.charpos = 0 + return - EOLCHAR = re.compile(r'[\r\n]') + def parse_main(self, s, i): + m = NONSPC.search(s, i) + if not m: + return (self.parse_main, len(s)) + j = m.start(0) + c = s[j] + self.tokenstart = self.bufpos+j + if c == '%': + self.token = '%' + return (self.parse_comment, j+1) + if c == '/': + self.token = '' + return (self.parse_literal, j+1) + if c in '-+' or c.isdigit(): + self.token = c + return (self.parse_number, j+1) + if c == '.': + self.token = c + return (self.parse_float, j+1) + if c.isalpha(): + self.token = c + return (self.parse_keyword, j+1) + if c == '(': + self.token = '' + self.paren = 1 + return (self.parse_string, j+1) + if c == '<': + self.token = '' + return (self.parse_wopen, j+1) + if c == '>': + self.token = '' + return (self.parse_wclose, j+1) + self.add_token(KWD(c)) + return (self.parse_main, j+1) + + def add_token(self, obj): + self.tokens.append((self.tokenstart, obj)) + return + + def parse_comment(self, s, i): + m = EOL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_comment, len(s)) + j = m.start(0) + self.token += s[i:j] + # We ignore comments. + #self.tokens.append(self.token) + return (self.parse_main, j) + + def parse_literal(self, s, i): + m = END_LITERAL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_literal, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '#': + self.hex = '' + return (self.parse_literal_hex, j+1) + self.add_token(LIT(self.token)) + return (self.parse_main, j) + + def parse_literal_hex(self, s, i): + c = s[i] + if HEX.match(c) and len(self.hex) < 2: + self.hex += c + return (self.parse_literal_hex, i+1) + if self.hex: + self.token += chr(int(self.hex, 16)) + return (self.parse_literal, i) + + def parse_number(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_number, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '.': + self.token += c + return (self.parse_float, j+1) + try: + self.add_token(int(self.token)) + except ValueError: + pass + return (self.parse_main, j) + def parse_float(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_float, len(s)) + j = m.start(0) + self.token += s[i:j] + self.add_token(float(self.token)) + return (self.parse_main, j) + + def parse_keyword(self, s, i): + m = END_KEYWORD.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_keyword, len(s)) + j = m.start(0) + self.token += s[i:j] + if self.token == 'true': + token = True + elif self.token == 'false': + token = False + else: + token = KWD(self.token) + self.add_token(token) + return (self.parse_main, j) + + def parse_string(self, s, i): + m = END_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_string, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '\\': + self.oct = '' + return (self.parse_string_1, j+1) + if c == '(': + self.paren += 1 + self.token += c + return (self.parse_string, j+1) + if c == ')': + self.paren -= 1 + if self.paren: # WTF, they said balanced parens need no special treatment. + self.token += c + return (self.parse_string, j+1) + self.add_token(self.token) + return (self.parse_main, j+1) + def parse_string_1(self, s, i): + c = s[i] + if OCT_STRING.match(c) and len(self.oct) < 3: + self.oct += c + return (self.parse_string_1, i+1) + if self.oct: + self.token += chr(int(self.oct, 8)) + return (self.parse_string, i) + if c in ESC_STRING: + self.token += chr(ESC_STRING[c]) + return (self.parse_string, i+1) + + def parse_wopen(self, s, i): + c = s[i] + if c.isspace() or HEX.match(c): + return (self.parse_hexstring, i) + if c == '<': + self.add_token(KEYWORD_DICT_BEGIN) + i += 1 + return (self.parse_main, i) + + def parse_wclose(self, s, i): + c = s[i] + if c == '>': + self.add_token(KEYWORD_DICT_END) + i += 1 + return (self.parse_main, i) + + def parse_hexstring(self, s, i): + m = END_HEX_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_hexstring, len(s)) + j = m.start(0) + self.token += s[i:j] + token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), + SPC.sub('', self.token)) + self.add_token(token) + return (self.parse_main, j) + + def nexttoken(self): + while not self.tokens: + self.fillbuf() + (self.parse1, self.charpos) = self.parse1(self.buf, self.charpos) + token = self.tokens.pop(0) + if 2 <= self.debug: + print >>stderr, 'nexttoken: %r' % (token,) + return token + def nextline(self): ''' Fetches a next line that ends either with \\r or \\n. ''' - line = '' - eol = None + linebuf = '' + linepos = self.bufpos + self.charpos + eol = False while 1: - if not self.linebuf or len(self.linebuf) <= self.curpos: - # fetch next chunk. - self.linebuf = self.fp.read(self.bufsize) - if not self.linebuf: - # at EOF. - break - self.curpos = 0 + self.fillbuf() if eol: - c = self.linebuf[self.curpos] + c = self.buf[self.charpos] # handle '\r\n' - if (eol == '\r' and c == '\n'): - line += c - self.curpos += 1 + if c == '\n': + linebuf += c + self.charpos += 1 break - m = self.EOLCHAR.search(self.linebuf, self.curpos) + m = EOL.search(self.buf, self.charpos) if m: - i = m.end(0) - line += self.linebuf[self.curpos:i] - eol = self.linebuf[i-1] - self.curpos = i + linebuf += self.buf[self.charpos:m.end(0)] + self.charpos = m.end(0) + if linebuf[-1] == '\r': + eol = True + else: + break else: - # fetch further - line += self.linebuf[self.curpos:] - self.linebuf = None - linepos = self.linepos - self.linepos += len(line) - return (linepos, line) + linebuf += self.buf[self.charpos:] + self.charpos = len(self.buf) + if 2 <= self.debug: + print >>stderr, 'nextline: %r' % ((linepos, linebuf),) + return (linepos, linebuf) def revreadlines(self): ''' @@ -168,9 +393,9 @@ class PSBaseParser: pos = self.fp.tell() buf = '' while 0 < pos: - pos = max(0, pos-self.bufsize) + pos = max(0, pos-self.BUFSIZ) self.fp.seek(pos) - s = self.fp.read(self.bufsize) + s = self.fp.read(self.BUFSIZ) if not s: break while 1: n = max(s.rfind('\r'), s.rfind('\n')) @@ -182,263 +407,202 @@ class PSBaseParser: buf = '' return - # regex patterns for basic lexical scanning. - SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' - TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') - LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') - NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$') - STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+') - STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.') - STRING_HEX = re.compile(r'[\s0-9a-fA-F]+') - STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}') - - def parse(self): - ''' - Yields a list of tuples (pos, token) of the following: - keywords, literals, strings, numbers and parentheses. - Comments are skipped. - Nested objects (i.e. arrays and dictionaries) are not handled here. - ''' - while 1: - # do not strip line! we need to distinguish last '\n' or '\r' - (linepos, line) = self.nextline() - if not line: break - if 2 <= self.debug: - print >>stderr, 'line: (%d) %r' % (linepos, line) - # do this before removing comment - if line.startswith('%%EOF'): break - charpos = 0 - - # tokenize - self.go = True - while self.go: - m = self.TOKEN.search(line, charpos) - if not m: break - t = m.group(0) - pos = linepos + m.start(0) - charpos = m.end(0) - - if t == '%': - # skip comment - if 2 <= self.debug: - print >>stderr, 'comment: %r' % line[charpos:] - break - - elif t == '/': - # literal object - mn = self.LITERAL.match(line, m.start(0)+1) - lit = PSLiteralTable.intern(mn.group(0)) - yield (pos, lit) - charpos = mn.end(0) - if 2 <= self.debug: - print >>stderr, 'name: %r' % lit - - elif t == '(': - # normal string object - s = '' - while 1: - ms = self.STRING_NORM.match(line, charpos) - if not ms: break - s1 = ms.group(0) - charpos = ms.end(0) - if len(s1) == 1 and s1[-1] == '\\': - s += s1[-1:] - (linepos, line) = self.nextline() - if not line: - if STRICT: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (linepos, line)) - break - charpos = 0 - elif charpos == len(line): - s += s1 - (linepos, line) = self.nextline() - if not line: - if STRICT: - raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (linepos, line)) - break - charpos = 0 - else: - s += s1 - break - if line[charpos] == ')': - charpos += 1 - else: - if STRICT: - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (linepos, line)) - pass - def convesc(m): - x = m.group(0) - if x[1:].isdigit(): - return chr(int(x[1:], 8)) - else: - return x[1] - s = self.STRING_NORM_SUB.sub(convesc, s) - if self.strfilter: - s = self.strfilter(s) - if 2 <= self.debug: - print >>stderr, 'str: %r' % s - yield (pos, s) - - elif t == '<': - # hex string object - ms = self.STRING_HEX.match(line, charpos) - charpos = ms.end(0) - if line[charpos] == '>': - charpos += 1 - else: - if STRICT: - raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (linepos, line)) - def convhex(m1): - return chr(int(m1.group(0), 16)) - s = self.STRING_HEX_SUB.sub(convhex, ms.group(0)) - if 2 <= self.debug: - print >>stderr, 'str: %r' % s - yield (pos, s) - - elif self.NUMBER.match(t): - # number - if '.' in t: - n = float(t) - else: - n = int(t) - if 2 <= self.debug: - print >>stderr, 'number: %r' % n - yield (pos, n) - - elif t in ('true', 'false'): - # boolean - if 2 <= self.debug: - print >>stderr, 'boolean: %r' % t - yield (pos, (t == 'true')) - - else: - # other token - if 2 <= self.debug: - print >>stderr, 'keyword: %r' % t - yield (pos, PSKeywordTable.intern(t)) - - return - ## PSStackParser ## class PSStackParser(PSBaseParser): - ''' - PostScript parser that recognizes compound objects - such as arrays and dictionaries. - ''' - def __init__(self, fp, debug=0): - PSBaseParser.__init__(self, fp, debug=debug) - self.context = [] - self.partobj = None + PSBaseParser.__init__(self,fp, debug=debug) + self.reset() return - - def do_token(self, pos, token): - ''' - Handles special tokens. - Returns true if the token denotes the end of an object. - ''' - return False - - def push(self, obj): - ''' - Push an object to the stack. - ''' - self.partobj.append(obj) - return - - def pop(self, n): - ''' - Pop N objects from the stack. - ''' - if len(self.partobj) < n: - if STRICT: - raise PSSyntaxError('stack too short < %d' % n) - r = self.partobj[-n:] - self.partobj = self.partobj[:-n] - return r - def popall(self): - ''' - Discards all the objects on the stack. - ''' - self.partobj = [] + def reset(self): + self.context = [] + self.curtype = None + self.curstack = [] + self.results = [] return - def parse(self): + def push(self, *objs): + self.curstack.extend(objs) + return + def pop(self, n): + objs = self.curstack[-n:] + self.curstack[-n:] = [] + return objs + def popall(self): + objs = self.curstack + self.curstack = [] + return objs + def add_results(self, *objs): + if 2 <= self.debug: + print >>stderr, 'add_results: %r' % (objs,) + self.results.extend(objs) + return + + def start_type(self, pos, type): + self.context.append((pos, self.curtype, self.curstack)) + (self.curtype, self.curstack) = (type, []) + if 2 <= self.debug: + print >>stderr, 'start_type: pos=%r, type=%r' % (pos, type) + return + def end_type(self, type): + if self.curtype != type: + raise PSTypeError('type mismatch: %r != %r' % (self.curtype, type)) + objs = [ obj for (_,obj) in self.curstack ] + (pos, self.curtype, self.curstack) = self.context.pop() + if 2 <= self.debug: + print >>stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs) + return (pos, objs) + + def do_keyword(self, pos, token): + return + def flush(self): + return + + def nextobject(self): ''' Yields a list of objects: keywords, literals, strings, numbers, arrays and dictionaries. Arrays and dictionaries are represented as Python sequence and dictionaries. ''' - - def startobj(type): - self.context.append((type, self.partobj)) - self.partobj = [] - return - - def endobj(type1): - if not self.context: - if STRICT: - raise PSTypeError('stack empty.') - obj = self.partobj - (type0, partobj) = self.context[-1] - if type0 == type1: - self.partobj = partobj - self.context.pop() - else: - if STRICT: - raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % - (type0, self.partobj, type1, obj)) - return obj - - startobj('o') - - for (pos,t) in PSBaseParser.parse(self): - if isinstance(t, int) or isinstance(t, float): - self.push(t) - elif isinstance(t, str): - self.push(t) - elif isinstance(t, PSLiteral): - self.push(t) - else: - c = keyword_name(t) - if c == '{' or c == '}': - self.push(t) - elif c == '[': - # begin array - if 2 <= self.debug: - print >>stderr, 'start array' - startobj('a') - elif c == ']': - # end array - a = endobj('a') - if 2 <= self.debug: - print >>stderr, 'end array: %r' % a - self.push(a) - elif c == '<<': - # begin dictionary - if 2 <= self.debug: - print >>stderr, 'start dict' - startobj('d') - elif c == '>>': - # end dictionary - objs = endobj('d') + while not self.results: + (pos, token) = self.nexttoken() + #print (pos,token), (self.curtype, self.curstack) + if (isinstance(token, int) or + isinstance(token, float) or + isinstance(token, bool) or + isinstance(token, str) or + isinstance(token, PSLiteral)): + # normal token + self.push((pos, token)) + elif token == KEYWORD_ARRAY_BEGIN: + # begin array + self.start_type(pos, 'a') + elif token == KEYWORD_ARRAY_END: + # end array + try: + self.push(self.end_type('a')) + except PSTypeError: + if STRICT: raise + elif token == KEYWORD_DICT_BEGIN: + # begin dictionary + self.start_type(pos, 'd') + elif token == KEYWORD_DICT_END: + # end dictionary + try: + (pos, objs) = self.end_type('d') if len(objs) % 2 != 0: - if STRICT: - raise PSTypeError('invalid dictionary construct: %r' % objs) - d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) - if 2 <= self.debug: - print >>stderr, 'end dict: %r' % d - self.push(d) - elif self.do_token(pos, t): - break + raise PSSyntaxError('invalid dictionary construct: %r' % objs) + d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs)) + self.push((pos, d)) + except PSTypeError: + if STRICT: raise + else: + if 2 <= self.debug: + print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \ + (pos, token, self.curstack) + self.do_keyword(pos, token) + if self.context: + continue + else: + self.flush() + obj = self.results.pop(0) + if 2 <= self.debug: + print >>stderr, 'nextobject: %r' % (obj,) + return obj - objs = endobj('o') - return objs + +## Simplistic Test cases +## +import unittest +class TestPSBaseParser(unittest.TestCase): + + TESTDATA = r'''%!PS +begin end + " @ # +/a/BCD /Some_Name /foo#5f#xbaa +0 +1 -2 .5 1.234 +(abc) () (abc ( def ) ghi) +(def\040\0\0404ghi) (bach\\slask) (foo\nbaa) +(this % is not a comment.) +(foo +baa) +(foo\ +baa) +<20> < 40 4020 > + +func/a/b{(c)do*}def +[ 1 (z) ! ] +<< /foo (bar) >> +''' + + TOKENS = [ + (5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')), + (21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), + (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), + (65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'), + (98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'), + (143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'), + (191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'), + (223, KWD('func')), (227, LIT('a')), (229, LIT('b')), + (231, KWD('{')), (232, 'c'), (235, KWD('do*')), (238, KWD('}')), + (239, KWD('def')), (243, KWD('[')), (245, 1), (247, 'z'), (251, KWD('!')), + (253, KWD(']')), (255, KWD('<<')), (258, LIT('foo')), (263, 'bar'), + (269, KWD('>>')) + ] + + OBJS = [ + (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), + (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), + (65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'), + (98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'), + (143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'), + (191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'), + (227, LIT('a')), (229, LIT('b')), (232, 'c'), (243, [1, 'z']), + (255, {'foo': 'bar'}), + ] + + def get_tokens(self, s): + import StringIO + class MyParser(PSBaseParser): + def flush(self): + self.add_results(*self.popall()) + parser = MyParser(StringIO.StringIO(s), debug=1) + r = [] + try: + while 1: + r.append(parser.nexttoken()) + except PSEOF: + pass + return r + + def get_objects(self, s): + import StringIO + class MyParser(PSStackParser): + def flush(self): + self.add_results(*self.popall()) + parser = MyParser(StringIO.StringIO(s), debug=1) + r = [] + try: + while 1: + r.append(parser.nextobject()) + except PSEOF: + pass + return r + + def test_1(self): + tokens = self.get_tokens(self.TESTDATA) + print tokens + self.assertEqual(tokens, self.TOKENS) + return + + def test_2(self): + objs = self.get_objects(self.TESTDATA) + print objs + self.assertEqual(objs, self.OBJS) + return + +if __name__ == '__main__': unittest.main()