Restructuring core lexical handling.

Fix several bugs.


git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@17 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-02-03 09:36:34 +00:00
parent 94859ea428
commit 366143361f
5 changed files with 712 additions and 454 deletions

83
cmap.py
View File

@ -3,7 +3,7 @@ import sys
stderr = sys.stderr
from struct import pack, unpack
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:
@ -39,17 +39,17 @@ class CMap:
return self
def register_code2cid(self, code, cid):
assert isinstance(code, str)
assert isinstance(cid, int)
self.code2cid[code] = cid
if isinstance(code, str) and isinstance(cid, int):
self.code2cid[code] = cid
return self
def register_cid2code(self, cid, code):
from glyphlist import charname2unicode
assert isinstance(cid, int)
if isinstance(code, PSLiteral):
code = pack('>H', charname2unicode[code.name])
self.cid2code[cid] = code
if isinstance(cid, int):
if isinstance(code, PSLiteral):
self.cid2code[cid] = pack('>H', charname2unicode[code.name])
elif isinstance(code, str):
self.cid2code[cid] = code
return self
def decode(self, bytes):
@ -195,7 +195,7 @@ class CMapDB:
print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap()
fp = file(fname)
CMapParser(cmap, fp).parse()
CMapParser(cmap, fp, debug=klass.debug).run()
fp.close()
else:
raise KeyError(cmapname)
@ -213,7 +213,14 @@ class CMapParser(PSStackParser):
self.in_cmap = False
return
def do_token(self, _, token):
def run(self):
try:
self.nextobject()
except PSEOF:
pass
return
def do_keyword(self, pos, token):
name = token.name
if name == 'begincmap':
self.in_cmap = True
@ -226,15 +233,15 @@ class CMapParser(PSStackParser):
#
if name == 'def':
try:
(k,v) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v
((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[str(k)] = v
except PSSyntaxError:
pass
return
if name == 'usecmap':
try:
(cmapname,) = self.pop(1)
((_,cmapname),) = self.pop(1)
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
@ -244,8 +251,6 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcodespacerange':
if 1 <= self.debug:
print >>stderr, 'codespace: %r' % self.partobj
self.popall()
return
@ -253,48 +258,45 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidrange':
for (s,e,cid) in choplist(3, self.partobj):
assert isinstance(s, str)
assert isinstance(e, str)
assert isinstance(cid, int)
assert len(s) == len(e)
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
sprefix = s[:-4]
eprefix = e[:-4]
assert sprefix == eprefix
if sprefix != eprefix: continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
assert s1 <= e1
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i)
self.popall()
return
if name == 'begincidchar':
self.popall()
return
if name == 'endcidchar':
for (cid,code) in choplist(2, self.partobj):
assert isinstance(code, str)
assert isinstance(cid, str)
self.cmap.register_code2cid(code, nunpack(cid))
self.popall()
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid))
return
if name == 'beginbfrange':
self.popall()
return
if name == 'endbfrange':
for (s,e,code) in choplist(3, self.partobj):
assert isinstance(s, str)
assert isinstance(e, str)
assert len(s) == len(e)
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
s1 = nunpack(s)
e1 = nunpack(e)
assert s1 <= e1
#assert s1 <= e1
if isinstance(code, list):
for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i])
@ -306,29 +308,26 @@ class CMapParser(PSStackParser):
for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x)
self.popall()
return
if name == 'beginbfchar':
self.popall()
return
if name == 'endbfchar':
for (cid,code) in choplist(2, self.partobj):
assert isinstance(cid, str)
assert isinstance(code, str)
self.cmap.register_cid2code(nunpack(cid), code)
self.popall()
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code)
return
if name == 'beginnotdefrange':
self.popall()
return
if name == 'endnotdefrange':
if 1 <= self.debug:
print >>stderr, 'notdefrange: %r' % self.partobj
self.popall()
return
self.push((pos, token))
return

View File

@ -13,8 +13,8 @@ from cmap import CMapDB
##
class TextConverter(PDFDevice):
def __init__(self, outfp, rsrc, codec):
PDFDevice.__init__(self, rsrc)
def __init__(self, outfp, rsrc, codec, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug)
self.outfp = outfp
self.codec = codec
return
@ -22,7 +22,7 @@ class TextConverter(PDFDevice):
def close(self):
self.outfp.write('\n')
return
def begin_page(self, page):
(x0,y0,x1,y1) = page.mediabox
self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
@ -42,6 +42,10 @@ class TextConverter(PDFDevice):
return
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
#return unichr(cid)
#return unichr(cid+32)
return
def render_string(self, textstate, textmatrix, size, seq):
@ -81,7 +85,7 @@ class TextConverter(PDFDevice):
# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
device = TextConverter(outfp, rsrc, codec)
device = TextConverter(outfp, rsrc, codec, debug=debug)
outfp.write('<document>\n')
doc = PDFDocument(debug=debug)
fp = file(fname)

View File

@ -6,7 +6,7 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSException, PSSyntaxError, PSTypeError, \
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
@ -45,6 +45,8 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
KEYWORD_BI = PSKeywordTable.intern('BI')
KEYWORD_ID = PSKeywordTable.intern('ID')
KEYWORD_EI = PSKeywordTable.intern('EI')
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
@ -134,7 +136,7 @@ class PDFSimpleFont(PDFFont):
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
PDFFont.__init__(self, descriptor, widths)
return
@ -292,7 +294,7 @@ class PDFCIDFont(PDFFont):
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
elif self.cidcoding == 'Adobe-Identity':
if ttf:
try:
@ -433,8 +435,9 @@ class PDFResourceManager:
##
class PDFDevice:
def __init__(self, rsrc):
def __init__(self, rsrc, debug=0):
self.rsrc = rsrc
self.debug = debug
self.ctm = None
return
@ -465,47 +468,91 @@ class PDFDevice:
##
class PDFContentParser(PSStackParser):
def __init__(self, fp, debug=0):
PSStackParser.__init__(self, fp, debug=debug)
def __init__(self, streams, debug=0):
self.streams = streams
self.istream = 0
PSStackParser.__init__(self, None, debug=debug)
return
def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos
EOIPAT = re.compile(r'\nEI\W')
def do_token(self, pos, token):
name = keyword_name(token)
def fillfp(self):
if not self.fp:
if self.istream < len(self.streams):
strm = stream_value(self.streams[self.istream])
self.istream += 1
else:
raise PSEOF
self.fp = StringIO(strm.get_data())
return
if name == 'BI':
def seek(self, pos):
self.fillfp()
PSStackParser.seek(self, pos)
return
def fillbuf(self):
if self.charpos < len(self.buf): return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break
self.fp = None
self.charpos = 0
return
def get_inline_data(self, pos, target='EI '):
self.seek(pos)
i = 0
data = ''
while i < len(target):
self.fillbuf()
if i:
c = self.buf[self.charpos]
data += c
self.charpos += 1
if c == target[i]:
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
#print 'found', (0, self.buf[j:j+10])
data += self.buf[self.charpos:j]
self.charpos = j+1
i = 1
except ValueError:
data += self.buf[self.charpos:]
self.charpos = len(self.buf)
data = data[:-len(target)] # strip the last part
return (pos, data)
def flush(self):
self.add_results(*self.popall())
return
def do_keyword(self, pos, token):
if token == KEYWORD_BI:
# inline image within a content stream
self.context.append(('BI', self.partobj))
self.partobj = []
elif name == 'ID':
objs = self.partobj
(type0, self.partobj) = self.context.pop()
if len(objs) % 2 != 0:
if STRICT:
self.start_type(pos, 'inline')
elif token == KEYWORD_ID:
try:
(_, objs) = self.end_type('inline')
if len(objs) % 2 != 0:
raise PSTypeError('invalid dictionary construct: %r' % objs)
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
pos += len('ID ')
self.fp.seek(pos)
# XXX how do we know the real length other than scanning?
data = ''
while 1:
data += self.fp.read(4096)
m = self.EOIPAT.search(data)
if m: break
objlen = m.start(0)
obj = PDFStream(dic, data[:objlen])
self.push(obj)
self.seek(pos+objlen+len('\nEI'))
self.push(KEYWORD_EI)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
(pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, KEYWORD_EI))
except PSTypeError:
if STRICT: raise
else:
self.push(token)
return False
self.push((pos, token))
return
## Interpreter
@ -542,10 +589,44 @@ class PDFPageInterpreter:
self.debug = debug
return
def initpage(self, ctm):
def init_resources(self, resources):
self.fontmap = {}
self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy()
# Handle resource declarations.
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased':
return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN':
return ColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE[name]
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
return
def init_state(self, ctm):
# gstack: stack for graphical states.
self.gstack = []
self.ctm = ctm
@ -554,8 +635,9 @@ class PDFPageInterpreter:
# argstack: stack for command arguments.
self.argstack = []
# set some global states.
self.scs = None
self.ncs = None
self.scs = self.ncs = None
if self.csmap:
self.scs = self.ncs = self.csmap.values()[0]
return
def push(self, obj):
@ -683,10 +765,22 @@ class PDFPageInterpreter:
# setcolor
def do_SCN(self):
self.pop(self.scs.ncomponents)
if self.scs:
n = self.scs.ncomponents
else:
if STRICT:
raise PDFInterpreterError('no colorspace specified!')
n = 1
self.pop(n)
return
def do_scn(self):
self.pop(self.ncs.ncomponents)
if self.ncs:
n = self.ncs.ncomponents
else:
if STRICT:
raise PDFInterpreterError('no colorspace specified!')
n = 1
self.pop(n)
return
def do_SC(self):
self.do_SCN()
@ -839,8 +933,7 @@ class PDFPageInterpreter:
(x1,y1) = apply_matrix(ctm, (x1,y1))
bbox = (x0,y0,x1,y1)
self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(xobj.dic.get('Resources'),
[xobj], ctm=ctm)
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
self.device.end_figure(xobjid)
return
@ -853,46 +946,18 @@ class PDFPageInterpreter:
return
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
self.initpage(ctm)
# Handle resource declarations.
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased':
return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN':
return ColorSpace(name, len(list_value(cs[1])))
else:
return PREDEFINED_COLORSPACE[name]
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
data = ''.join( stream_value(stream).get_data()
for stream in list_value(contents) )
self.execute(data)
self.init_resources(resources)
self.init_state(ctm)
self.execute(list_value(contents))
return
def execute(self, data):
parser = PDFContentParser(StringIO(data), debug=self.debug)
for obj in parser.parse():
def execute(self, streams):
parser = PDFContentParser(streams, debug=self.debug)
while 1:
try:
(_,obj) = parser.nextobject()
except PSEOF:
break
if isinstance(obj, PSKeyword):
name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
if hasattr(self, name):

View File

@ -14,14 +14,10 @@
# - Linearized PDF.
# - Encryption?
import sys, re
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import sys
stderr = sys.stderr
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \
PSStackParser, STRICT
@ -43,14 +39,19 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
## PDFObjRef
##
class PDFObjRef:
def __init__(self, doc, objid, genno):
def __init__(self, doc, objid, _):
if objid == 0:
if STRICT:
raise PDFValueError('objid cannot be 0.')
@ -275,7 +276,8 @@ class PDFXRef:
(pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use))
# read trailer
self.trailer = dict_value(parser.parse()[0])
(_, dic) = parser.nextobject()
self.trailer = dict_value(dic)
return
def getpos(self, objid):
@ -293,9 +295,13 @@ class PDFXRef:
class PDFXRefStream:
def __init__(self, parser):
(objid, genno, _, stream) = list_value(parser.parse())
(_,objid) = parser.nextobject()
(_,genno) = parser.nextobject()
parser.nextobject()
(_,stream) = parser.nextobject()
if STRICT:
assert stream.dic['Type'] == LITERAL_XREF
if stream.dic['Type'] != LITERAL_XREF:
raise PDFSyntaxError('invalid stream spec.')
size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size))
self.objid0 = start
@ -385,20 +391,24 @@ class PDFDocument:
if strmid in self.parsed_objs:
objs = self.parsed_objs[stream]
else:
parser = PDFParser(self, StringIO(stream.get_data()),
debug=self.debug)
objs = list(parser.parse())
parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index]
else:
prevpos = self.parser.seek(index)
seq = list_value(self.parser.parse())
if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
if STRICT:
raise PDFSyntaxError('invalid stream spec: %r' % seq)
return None
obj = seq[3]
self.parser.seek(prevpos)
self.parser.seek(index)
(_,objid1) = self.parser.nextobject() # objid
(_,genno1) = self.parser.nextobject() # genno
(_,kwd) = self.parser.nextobject()
if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
(_,obj) = self.parser.nextobject()
if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj
@ -446,29 +456,30 @@ class PDFParser(PSStackParser):
return
def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos
return '<PDFParser>'
EOIPAT = re.compile(r'\nEI\W')
def do_token(self, pos, token):
name = keyword_name(token)
if name in ('xref', 'trailer', 'startxref', 'endobj'):
return True
if name == 'R':
def do_keyword(self, pos, token):
if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
self.add_results(*self.pop(1))
return
if token == KEYWORD_ENDOBJ:
self.add_results(*self.pop(4))
return
if token == KEYWORD_R:
# reference to indirect object
try:
(objid, genno) = self.pop(2)
((_,objid), (_,genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push(obj)
if 2 <= self.debug:
print >>stderr, 'refer obj: %r' % obj
self.push((pos, obj))
except PSSyntaxError:
pass
return
elif name == 'stream':
if token == KEYWORD_STREAM:
# stream object
(dic,) = self.pop(1)
((_,dic),) = self.pop(1)
dic = dict_value(dic)
try:
objlen = int_value(dic['Length'])
@ -484,20 +495,19 @@ class PDFParser(PSStackParser):
self.seek(pos+objlen)
while 1:
(linepos, line) = self.nextline()
if not line or line.startswith('endstream'):
break
if line.startswith('endstream'): break
objlen += len(line)
data += line
if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher)
self.push(obj)
else:
self.push(token)
return False
self.push((pos, obj))
return
# others
self.push((pos, token))
return
def find_xref(self):
# find the first xref table
@ -505,7 +515,7 @@ class PDFParser(PSStackParser):
for line in self.revreadlines():
line = line.strip()
if 2 <= self.debug:
print >>stderr, 'line: %r' % line
print >>stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line:
prev = line
@ -525,10 +535,11 @@ class PDFParser(PSStackParser):
# read xref table
(linepos, line) = self.nextline()
if 2 <= self.debug:
print >>stderr, 'line: %r' % line
print >>stderr, 'read_xref: %r' % line
if line[0].isdigit():
# XRefStream: PDF-1.5
self.seek(linepos)
self.reset()
xref = PDFXRefStream(self)
else:
if line.strip() != 'xref':
@ -551,3 +562,18 @@ class PDFParser(PSStackParser):
else:
break
return
## PDFObjStrmParser
##
class PDFObjStrmParser(PDFParser):
def __init__(self, doc, data, debug=0):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
PDFParser.__init__(self, doc, StringIO(data), debug=debug)
return
def flush(self):
self.add_results(*self.popall())
return

View File

@ -3,12 +3,13 @@ import sys, re
stderr = sys.stderr
from utils import choplist
STRICT = 0
STRICT = 1
## PS Exceptions
##
class PSException(Exception): pass
class PSEOF(PSException): pass
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass
@ -71,6 +72,14 @@ class PSSymbolTable:
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
KEYWORD_BRACE_BEGIN = KWD('{')
KEYWORD_BRACE_END = KWD('}')
KEYWORD_ARRAY_BEGIN = KWD('[')
KEYWORD_ARRAY_END = KWD(']')
KEYWORD_DICT_BEGIN = KWD('<<')
KEYWORD_DICT_END = KWD('>>')
def literal_name(x):
@ -92,72 +101,288 @@ def keyword_name(x):
## PSBaseParser
##
EOL = re.compile(r'[\r\n]')
SPC = re.compile(r'\s')
NONSPC = re.compile(r'\S')
HEX = re.compile(r'[0-9a-fA-F]')
END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]')
END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]')
HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(r'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
class PSBaseParser:
'''
Most basic PostScript parser that performs only basic tokenization.
'''
BUFSIZ = 4096
def __init__(self, fp, debug=0):
self.fp = fp
self.debug = debug
self.bufsize = 4096
self.strfilter = None
self.seek(0)
return
def __repr__(self):
return '<PSBaseParser: %r>' % (self.fp,)
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
def tell(self):
return self.fp.tell()
def poll(self, pos=None, n=80):
pos0 = self.fp.tell()
if not pos:
pos = self.bufpos+self.charpos
self.fp.seek(pos)
print >>stderr, 'poll(%d): %r' % (pos, self.fp.read(n))
self.fp.seek(pos0)
return
def seek(self, pos):
'''
Seeks the parser to the given position.
'''
if 2 <= self.debug:
print >>stderr, 'seek:', pos
prevpos = self.fp.tell()
print >>stderr, 'seek: %r' % pos
self.fp.seek(pos)
self.linebuf = None # line buffer.
self.curpos = 0 # current position in the buffer.
self.linepos = pos # the beginning of the current line.
self.go = False
return prevpos
# reset the status for nextline()
self.bufpos = pos
self.buf = ''
self.charpos = 0
# reset the status for nexttoken()
self.parse1 = self.parse_main
self.tokens = []
return
def fillbuf(self):
if self.charpos < len(self.buf): return
# fetch next chunk.
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if not self.buf:
raise PSEOF
self.charpos = 0
return
EOLCHAR = re.compile(r'[\r\n]')
def parse_main(self, s, i):
m = NONSPC.search(s, i)
if not m:
return (self.parse_main, len(s))
j = m.start(0)
c = s[j]
self.tokenstart = self.bufpos+j
if c == '%':
self.token = '%'
return (self.parse_comment, j+1)
if c == '/':
self.token = ''
return (self.parse_literal, j+1)
if c in '-+' or c.isdigit():
self.token = c
return (self.parse_number, j+1)
if c == '.':
self.token = c
return (self.parse_float, j+1)
if c.isalpha():
self.token = c
return (self.parse_keyword, j+1)
if c == '(':
self.token = ''
self.paren = 1
return (self.parse_string, j+1)
if c == '<':
self.token = ''
return (self.parse_wopen, j+1)
if c == '>':
self.token = ''
return (self.parse_wclose, j+1)
self.add_token(KWD(c))
return (self.parse_main, j+1)
def add_token(self, obj):
self.tokens.append((self.tokenstart, obj))
return
def parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_comment, len(s))
j = m.start(0)
self.token += s[i:j]
# We ignore comments.
#self.tokens.append(self.token)
return (self.parse_main, j)
def parse_literal(self, s, i):
m = END_LITERAL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_literal, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '#':
self.hex = ''
return (self.parse_literal_hex, j+1)
self.add_token(LIT(self.token))
return (self.parse_main, j)
def parse_literal_hex(self, s, i):
c = s[i]
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self.parse_literal_hex, i+1)
if self.hex:
self.token += chr(int(self.hex, 16))
return (self.parse_literal, i)
def parse_number(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_number, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '.':
self.token += c
return (self.parse_float, j+1)
try:
self.add_token(int(self.token))
except ValueError:
pass
return (self.parse_main, j)
def parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_float, len(s))
j = m.start(0)
self.token += s[i:j]
self.add_token(float(self.token))
return (self.parse_main, j)
def parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_keyword, len(s))
j = m.start(0)
self.token += s[i:j]
if self.token == 'true':
token = True
elif self.token == 'false':
token = False
else:
token = KWD(self.token)
self.add_token(token)
return (self.parse_main, j)
def parse_string(self, s, i):
m = END_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_string, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '\\':
self.oct = ''
return (self.parse_string_1, j+1)
if c == '(':
self.paren += 1
self.token += c
return (self.parse_string, j+1)
if c == ')':
self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment.
self.token += c
return (self.parse_string, j+1)
self.add_token(self.token)
return (self.parse_main, j+1)
def parse_string_1(self, s, i):
c = s[i]
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self.parse_string_1, i+1)
if self.oct:
self.token += chr(int(self.oct, 8))
return (self.parse_string, i)
if c in ESC_STRING:
self.token += chr(ESC_STRING[c])
return (self.parse_string, i+1)
def parse_wopen(self, s, i):
c = s[i]
if c.isspace() or HEX.match(c):
return (self.parse_hexstring, i)
if c == '<':
self.add_token(KEYWORD_DICT_BEGIN)
i += 1
return (self.parse_main, i)
def parse_wclose(self, s, i):
c = s[i]
if c == '>':
self.add_token(KEYWORD_DICT_END)
i += 1
return (self.parse_main, i)
def parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_hexstring, len(s))
j = m.start(0)
self.token += s[i:j]
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
SPC.sub('', self.token))
self.add_token(token)
return (self.parse_main, j)
def nexttoken(self):
while not self.tokens:
self.fillbuf()
(self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
token = self.tokens.pop(0)
if 2 <= self.debug:
print >>stderr, 'nexttoken: %r' % (token,)
return token
def nextline(self):
'''
Fetches a next line that ends either with \\r or \\n.
'''
line = ''
eol = None
linebuf = ''
linepos = self.bufpos + self.charpos
eol = False
while 1:
if not self.linebuf or len(self.linebuf) <= self.curpos:
# fetch next chunk.
self.linebuf = self.fp.read(self.bufsize)
if not self.linebuf:
# at EOF.
break
self.curpos = 0
self.fillbuf()
if eol:
c = self.linebuf[self.curpos]
c = self.buf[self.charpos]
# handle '\r\n'
if (eol == '\r' and c == '\n'):
line += c
self.curpos += 1
if c == '\n':
linebuf += c
self.charpos += 1
break
m = self.EOLCHAR.search(self.linebuf, self.curpos)
m = EOL.search(self.buf, self.charpos)
if m:
i = m.end(0)
line += self.linebuf[self.curpos:i]
eol = self.linebuf[i-1]
self.curpos = i
linebuf += self.buf[self.charpos:m.end(0)]
self.charpos = m.end(0)
if linebuf[-1] == '\r':
eol = True
else:
break
else:
# fetch further
line += self.linebuf[self.curpos:]
self.linebuf = None
linepos = self.linepos
self.linepos += len(line)
return (linepos, line)
linebuf += self.buf[self.charpos:]
self.charpos = len(self.buf)
if 2 <= self.debug:
print >>stderr, 'nextline: %r' % ((linepos, linebuf),)
return (linepos, linebuf)
def revreadlines(self):
'''
@ -168,9 +393,9 @@ class PSBaseParser:
pos = self.fp.tell()
buf = ''
while 0 < pos:
pos = max(0, pos-self.bufsize)
pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos)
s = self.fp.read(self.bufsize)
s = self.fp.read(self.BUFSIZ)
if not s: break
while 1:
n = max(s.rfind('\r'), s.rfind('\n'))
@ -182,263 +407,202 @@ class PSBaseParser:
buf = ''
return
# regex patterns for basic lexical scanning.
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
def parse(self):
'''
Yields a list of tuples (pos, token) of the following:
keywords, literals, strings, numbers and parentheses.
Comments are skipped.
Nested objects (i.e. arrays and dictionaries) are not handled here.
'''
while 1:
# do not strip line! we need to distinguish last '\n' or '\r'
(linepos, line) = self.nextline()
if not line: break
if 2 <= self.debug:
print >>stderr, 'line: (%d) %r' % (linepos, line)
# do this before removing comment
if line.startswith('%%EOF'): break
charpos = 0
# tokenize
self.go = True
while self.go:
m = self.TOKEN.search(line, charpos)
if not m: break
t = m.group(0)
pos = linepos + m.start(0)
charpos = m.end(0)
if t == '%':
# skip comment
if 2 <= self.debug:
print >>stderr, 'comment: %r' % line[charpos:]
break
elif t == '/':
# literal object
mn = self.LITERAL.match(line, m.start(0)+1)
lit = PSLiteralTable.intern(mn.group(0))
yield (pos, lit)
charpos = mn.end(0)
if 2 <= self.debug:
print >>stderr, 'name: %r' % lit
elif t == '(':
# normal string object
s = ''
while 1:
ms = self.STRING_NORM.match(line, charpos)
if not ms: break
s1 = ms.group(0)
charpos = ms.end(0)
if len(s1) == 1 and s1[-1] == '\\':
s += s1[-1:]
(linepos, line) = self.nextline()
if not line:
if STRICT:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(linepos, line))
break
charpos = 0
elif charpos == len(line):
s += s1
(linepos, line) = self.nextline()
if not line:
if STRICT:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(linepos, line))
break
charpos = 0
else:
s += s1
break
if line[charpos] == ')':
charpos += 1
else:
if STRICT:
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(linepos, line))
pass
def convesc(m):
x = m.group(0)
if x[1:].isdigit():
return chr(int(x[1:], 8))
else:
return x[1]
s = self.STRING_NORM_SUB.sub(convesc, s)
if self.strfilter:
s = self.strfilter(s)
if 2 <= self.debug:
print >>stderr, 'str: %r' % s
yield (pos, s)
elif t == '<':
# hex string object
ms = self.STRING_HEX.match(line, charpos)
charpos = ms.end(0)
if line[charpos] == '>':
charpos += 1
else:
if STRICT:
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(linepos, line))
def convhex(m1):
return chr(int(m1.group(0), 16))
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
if 2 <= self.debug:
print >>stderr, 'str: %r' % s
yield (pos, s)
elif self.NUMBER.match(t):
# number
if '.' in t:
n = float(t)
else:
n = int(t)
if 2 <= self.debug:
print >>stderr, 'number: %r' % n
yield (pos, n)
elif t in ('true', 'false'):
# boolean
if 2 <= self.debug:
print >>stderr, 'boolean: %r' % t
yield (pos, (t == 'true'))
else:
# other token
if 2 <= self.debug:
print >>stderr, 'keyword: %r' % t
yield (pos, PSKeywordTable.intern(t))
return
##  PSStackParser
##
class PSStackParser(PSBaseParser):

    '''
    PostScript parser that recognizes compound objects
    such as arrays and dictionaries.

    Tokens obtained from PSBaseParser.nexttoken() are accumulated on a
    stack (curstack) as (position, object) pairs.  The array and
    dictionary delimiter keywords open and close nested contexts; every
    other executable keyword is dispatched to do_keyword(), which
    subclasses override.  When a top-level object is complete, flush()
    is called so that the subclass can move it (via add_results) into
    the result queue consumed by nextobject().
    '''

    def __init__(self, fp, debug=0):
        PSBaseParser.__init__(self, fp, debug=debug)
        self.reset()
        return

    def reset(self):
        '''Reset all parsing state.'''
        # context: saved (pos, type, stack) frames of the enclosing
        # compound objects currently under construction.
        self.context = []
        # curtype/curstack: type marker ('a' for array, 'd' for dict)
        # and partial contents of the innermost compound object.
        self.curtype = None
        self.curstack = []
        # results: finished objects waiting to be handed out.
        self.results = []
        return

    def push(self, *objs):
        '''Push one or more (pos, obj) pairs onto the current stack.'''
        self.curstack.extend(objs)
        return

    def pop(self, n):
        '''Pop (at most) N objects from the current stack and return them.'''
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        '''Remove and return all the objects on the current stack.'''
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        '''Queue finished objects to be returned by nextobject().'''
        if 2 <= self.debug:
            print >>stderr, 'add_results: %r' % (objs,)
        self.results.extend(objs)
        return

    def start_type(self, pos, type):
        '''Open a new compound object of the given type starting at POS.'''
        # Save the current frame and continue with a fresh stack.
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        if 2 <= self.debug:
            print >>stderr, 'start_type: pos=%r, type=%r' % (pos, type)
        return

    def end_type(self, type):
        '''Close the current compound object, which must be of TYPE.

        Returns (pos, objs) where pos is where the object started and
        objs are its contents with the positions stripped off.
        Raises PSTypeError if the closing delimiter does not match the
        innermost open object.
        '''
        if self.curtype != type:
            raise PSTypeError('type mismatch: %r != %r' % (self.curtype, type))
        objs = [ obj for (_,obj) in self.curstack ]
        (pos, self.curtype, self.curstack) = self.context.pop()
        if 2 <= self.debug:
            print >>stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        '''Handle an executable keyword; meant to be overridden by subclasses.'''
        return

    def flush(self):
        '''Called whenever a top-level object is complete; subclasses
        typically feed popall() into add_results() here.'''
        return

    def nextobject(self):
        '''
        Returns the next complete object: keywords, literals, strings,
        numbers, arrays and dictionaries.  Arrays and dictionaries
        are represented as Python lists and dictionaries.
        '''
        while not self.results:
            (pos, token) = self.nexttoken()
            if (isinstance(token, int) or
                isinstance(token, float) or
                isinstance(token, bool) or
                isinstance(token, str) or
                isinstance(token, PSLiteral)):
                # normal token: becomes part of the current object.
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError:
                    if STRICT: raise
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary: pair up the collected objects as
                # name/value entries.
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        raise PSSyntaxError('invalid dictionary construct: %r' % objs)
                    d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
                    self.push((pos, d))
                except PSTypeError:
                    if STRICT: raise
            else:
                # executable keyword: delegate to the subclass hook.
                if 2 <= self.debug:
                    print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
                          (pos, token, self.curstack)
                self.do_keyword(pos, token)
            if self.context:
                # still inside a compound object; keep reading tokens.
                continue
            else:
                # a top-level object is complete; let the subclass
                # collect it into self.results.
                self.flush()
        obj = self.results.pop(0)
        if 2 <= self.debug:
            print >>stderr, 'nextobject: %r' % (obj,)
        return obj
## Simplistic Test cases
##
import unittest
class TestPSBaseParser(unittest.TestCase):

    '''Exercises PSBaseParser (tokenization) and PSStackParser
    (compound-object construction) against one fixed PostScript snippet.'''

    # Raw input fed to both parsers.
    # NOTE(review): the byte offsets hard-coded in TOKENS/OBJS below
    # depend on the exact whitespace of this literal; confirm that the
    # offsets still match the data if this block is ever reformatted.
    TESTDATA = r'''%!PS
begin end
" @ #
/a/BCD /Some_Name /foo#5f#xbaa
0 +1 -2 .5 1.234
(abc) () (abc ( def ) ghi)
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
(this % is not a comment.)
(foo
baa)
(foo\
baa)
<20> < 40 4020 >
<abcd00
12345>
func/a/b{(c)do*}def
[ 1 (z) ! ]
<< /foo (bar) >>
'''
    # Expected (byte offset, token) pairs from PSBaseParser.nexttoken().
    TOKENS = [
      (5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')),
      (21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
      (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
      (65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
      (98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
      (143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
      (191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
      (223, KWD('func')), (227, LIT('a')), (229, LIT('b')),
      (231, KWD('{')), (232, 'c'), (235, KWD('do*')), (238, KWD('}')),
      (239, KWD('def')), (243, KWD('[')), (245, 1), (247, 'z'), (251, KWD('!')),
      (253, KWD(']')), (255, KWD('<<')), (258, LIT('foo')), (263, 'bar'),
      (269, KWD('>>'))
    ]
    # Expected (byte offset, object) pairs from PSStackParser.nextobject():
    # arrays/dicts are folded into Python lists/dicts at their start offset.
    OBJS = [
      (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
      (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
      (65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
      (98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
      (143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
      (191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
      (227, LIT('a')), (229, LIT('b')), (232, 'c'), (243, [1, 'z']),
      (255, {'foo': 'bar'}),
    ]

    def get_tokens(self, s):
        '''Tokenize S and return the full list of (pos, token) pairs.'''
        import StringIO
        # Subclass whose flush() moves every parsed token into the
        # result queue so nexttoken() can hand them out.
        class MyParser(PSBaseParser):
            def flush(self):
                self.add_results(*self.popall())
        parser = MyParser(StringIO.StringIO(s), debug=1)
        r = []
        try:
            while 1:
                r.append(parser.nexttoken())
        except PSEOF:
            # normal termination: input exhausted
            pass
        return r

    def get_objects(self, s):
        '''Parse S and return the full list of (pos, object) pairs.'''
        import StringIO
        class MyParser(PSStackParser):
            def flush(self):
                self.add_results(*self.popall())
        parser = MyParser(StringIO.StringIO(s), debug=1)
        r = []
        try:
            while 1:
                r.append(parser.nextobject())
        except PSEOF:
            # normal termination: input exhausted
            pass
        return r

    def test_1(self):
        '''Token stream must match TOKENS exactly.'''
        tokens = self.get_tokens(self.TESTDATA)
        print tokens
        self.assertEqual(tokens, self.TOKENS)
        return

    def test_2(self):
        '''Object stream must match OBJS exactly.'''
        objs = self.get_objects(self.TESTDATA)
        print objs
        self.assertEqual(objs, self.OBJS)
        return
# Run the unit tests when executed as a script.
if __name__ == '__main__':
    unittest.main()