Restructuring core lexical handlings.
Fix several bugs. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@17 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
94859ea428
commit
366143361f
71
cmap.py
71
cmap.py
|
@ -3,7 +3,7 @@ import sys
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser
|
||||||
try:
|
try:
|
||||||
|
@ -39,16 +39,16 @@ class CMap:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def register_code2cid(self, code, cid):
|
def register_code2cid(self, code, cid):
|
||||||
assert isinstance(code, str)
|
if isinstance(code, str) and isinstance(cid, int):
|
||||||
assert isinstance(cid, int)
|
|
||||||
self.code2cid[code] = cid
|
self.code2cid[code] = cid
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def register_cid2code(self, cid, code):
|
def register_cid2code(self, cid, code):
|
||||||
from glyphlist import charname2unicode
|
from glyphlist import charname2unicode
|
||||||
assert isinstance(cid, int)
|
if isinstance(cid, int):
|
||||||
if isinstance(code, PSLiteral):
|
if isinstance(code, PSLiteral):
|
||||||
code = pack('>H', charname2unicode[code.name])
|
self.cid2code[cid] = pack('>H', charname2unicode[code.name])
|
||||||
|
elif isinstance(code, str):
|
||||||
self.cid2code[cid] = code
|
self.cid2code[cid] = code
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -195,7 +195,7 @@ class CMapDB:
|
||||||
print >>stderr, 'Reading: CMap %r...' % fname
|
print >>stderr, 'Reading: CMap %r...' % fname
|
||||||
cmap = CMap()
|
cmap = CMap()
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
CMapParser(cmap, fp).parse()
|
CMapParser(cmap, fp, debug=klass.debug).run()
|
||||||
fp.close()
|
fp.close()
|
||||||
else:
|
else:
|
||||||
raise KeyError(cmapname)
|
raise KeyError(cmapname)
|
||||||
|
@ -213,7 +213,14 @@ class CMapParser(PSStackParser):
|
||||||
self.in_cmap = False
|
self.in_cmap = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_token(self, _, token):
|
def run(self):
|
||||||
|
try:
|
||||||
|
self.nextobject()
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
|
def do_keyword(self, pos, token):
|
||||||
name = token.name
|
name = token.name
|
||||||
if name == 'begincmap':
|
if name == 'begincmap':
|
||||||
self.in_cmap = True
|
self.in_cmap = True
|
||||||
|
@ -226,15 +233,15 @@ class CMapParser(PSStackParser):
|
||||||
#
|
#
|
||||||
if name == 'def':
|
if name == 'def':
|
||||||
try:
|
try:
|
||||||
(k,v) = self.pop(2)
|
((_,k),(_,v)) = self.pop(2)
|
||||||
self.cmap.attrs[literal_name(k)] = v
|
self.cmap.attrs[str(k)] = v
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'usecmap':
|
if name == 'usecmap':
|
||||||
try:
|
try:
|
||||||
(cmapname,) = self.pop(1)
|
((_,cmapname),) = self.pop(1)
|
||||||
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
|
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
|
@ -244,8 +251,6 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endcodespacerange':
|
if name == 'endcodespacerange':
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>stderr, 'codespace: %r' % self.partobj
|
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -253,48 +258,45 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endcidrange':
|
if name == 'endcidrange':
|
||||||
for (s,e,cid) in choplist(3, self.partobj):
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
assert isinstance(s, str)
|
for (s,e,cid) in choplist(3, objs):
|
||||||
assert isinstance(e, str)
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
assert isinstance(cid, int)
|
not isinstance(cid, int) or len(s) != len(e)): continue
|
||||||
assert len(s) == len(e)
|
|
||||||
sprefix = s[:-4]
|
sprefix = s[:-4]
|
||||||
eprefix = e[:-4]
|
eprefix = e[:-4]
|
||||||
assert sprefix == eprefix
|
if sprefix != eprefix: continue
|
||||||
svar = s[-4:]
|
svar = s[-4:]
|
||||||
evar = e[-4:]
|
evar = e[-4:]
|
||||||
s1 = nunpack(svar)
|
s1 = nunpack(svar)
|
||||||
e1 = nunpack(evar)
|
e1 = nunpack(evar)
|
||||||
vlen = len(svar)
|
vlen = len(svar)
|
||||||
assert s1 <= e1
|
#assert s1 <= e1
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = sprefix+pack('>L',s1+i)[-vlen:]
|
x = sprefix+pack('>L',s1+i)[-vlen:]
|
||||||
self.cmap.register_code2cid(x, cid+i)
|
self.cmap.register_code2cid(x, cid+i)
|
||||||
self.popall()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'begincidchar':
|
if name == 'begincidchar':
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endcidchar':
|
if name == 'endcidchar':
|
||||||
for (cid,code) in choplist(2, self.partobj):
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
assert isinstance(code, str)
|
for (cid,code) in choplist(2, objs):
|
||||||
assert isinstance(cid, str)
|
if isinstance(code, str) and isinstance(cid, str):
|
||||||
self.cmap.register_code2cid(code, nunpack(cid))
|
self.cmap.register_code2cid(code, nunpack(cid))
|
||||||
self.popall()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginbfrange':
|
if name == 'beginbfrange':
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endbfrange':
|
if name == 'endbfrange':
|
||||||
for (s,e,code) in choplist(3, self.partobj):
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
assert isinstance(s, str)
|
for (s,e,code) in choplist(3, objs):
|
||||||
assert isinstance(e, str)
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
assert len(s) == len(e)
|
len(s) != len(e)): continue
|
||||||
s1 = nunpack(s)
|
s1 = nunpack(s)
|
||||||
e1 = nunpack(e)
|
e1 = nunpack(e)
|
||||||
assert s1 <= e1
|
#assert s1 <= e1
|
||||||
if isinstance(code, list):
|
if isinstance(code, list):
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
self.cmap.register_cid2code(s1+i, code[i])
|
self.cmap.register_cid2code(s1+i, code[i])
|
||||||
|
@ -306,29 +308,26 @@ class CMapParser(PSStackParser):
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = prefix+pack('>L',base+i)[-vlen:]
|
x = prefix+pack('>L',base+i)[-vlen:]
|
||||||
self.cmap.register_cid2code(s1+i, x)
|
self.cmap.register_cid2code(s1+i, x)
|
||||||
self.popall()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginbfchar':
|
if name == 'beginbfchar':
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endbfchar':
|
if name == 'endbfchar':
|
||||||
for (cid,code) in choplist(2, self.partobj):
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
assert isinstance(cid, str)
|
for (cid,code) in choplist(2, objs):
|
||||||
assert isinstance(code, str)
|
if isinstance(cid, str) and isinstance(code, str):
|
||||||
self.cmap.register_cid2code(nunpack(cid), code)
|
self.cmap.register_cid2code(nunpack(cid), code)
|
||||||
self.popall()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'beginnotdefrange':
|
if name == 'beginnotdefrange':
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endnotdefrange':
|
if name == 'endnotdefrange':
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>stderr, 'notdefrange: %r' % self.partobj
|
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
self.push((pos, token))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
10
pdf2txt.py
10
pdf2txt.py
|
@ -13,8 +13,8 @@ from cmap import CMapDB
|
||||||
##
|
##
|
||||||
class TextConverter(PDFDevice):
|
class TextConverter(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, outfp, rsrc, codec):
|
def __init__(self, outfp, rsrc, codec, debug=0):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
@ -42,6 +42,10 @@ class TextConverter(PDFDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
|
if self.debug:
|
||||||
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
|
#return unichr(cid)
|
||||||
|
#return unichr(cid+32)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, size, seq):
|
||||||
|
@ -81,7 +85,7 @@ class TextConverter(PDFDevice):
|
||||||
|
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
device = TextConverter(outfp, rsrc, codec)
|
device = TextConverter(outfp, rsrc, codec, debug=debug)
|
||||||
outfp.write('<document>\n')
|
outfp.write('<document>\n')
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
|
|
231
pdfinterp.py
231
pdfinterp.py
|
@ -6,7 +6,7 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||||
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
||||||
|
@ -45,6 +45,8 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||||
|
KEYWORD_BI = PSKeywordTable.intern('BI')
|
||||||
|
KEYWORD_ID = PSKeywordTable.intern('ID')
|
||||||
KEYWORD_EI = PSKeywordTable.intern('EI')
|
KEYWORD_EI = PSKeywordTable.intern('EI')
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
|
@ -134,7 +136,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.ucs2_cmap = CMap()
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||||
PDFFont.__init__(self, descriptor, widths)
|
PDFFont.__init__(self, descriptor, widths)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -292,7 +294,7 @@ class PDFCIDFont(PDFFont):
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.ucs2_cmap = CMap()
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||||
elif self.cidcoding == 'Adobe-Identity':
|
elif self.cidcoding == 'Adobe-Identity':
|
||||||
if ttf:
|
if ttf:
|
||||||
try:
|
try:
|
||||||
|
@ -433,8 +435,9 @@ class PDFResourceManager:
|
||||||
##
|
##
|
||||||
class PDFDevice:
|
class PDFDevice:
|
||||||
|
|
||||||
def __init__(self, rsrc):
|
def __init__(self, rsrc, debug=0):
|
||||||
self.rsrc = rsrc
|
self.rsrc = rsrc
|
||||||
|
self.debug = debug
|
||||||
self.ctm = None
|
self.ctm = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -465,47 +468,91 @@ class PDFDevice:
|
||||||
##
|
##
|
||||||
class PDFContentParser(PSStackParser):
|
class PDFContentParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, streams, debug=0):
|
||||||
PSStackParser.__init__(self, fp, debug=debug)
|
self.streams = streams
|
||||||
|
self.istream = 0
|
||||||
|
PSStackParser.__init__(self, None, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFParser: linepos=%d>' % self.linepos
|
return '<PDFParser: linepos=%d>' % self.linepos
|
||||||
|
|
||||||
EOIPAT = re.compile(r'\nEI\W')
|
def fillfp(self):
|
||||||
def do_token(self, pos, token):
|
if not self.fp:
|
||||||
name = keyword_name(token)
|
if self.istream < len(self.streams):
|
||||||
|
strm = stream_value(self.streams[self.istream])
|
||||||
if name == 'BI':
|
self.istream += 1
|
||||||
# inline image within a content stream
|
|
||||||
self.context.append(('BI', self.partobj))
|
|
||||||
self.partobj = []
|
|
||||||
|
|
||||||
elif name == 'ID':
|
|
||||||
objs = self.partobj
|
|
||||||
(type0, self.partobj) = self.context.pop()
|
|
||||||
if len(objs) % 2 != 0:
|
|
||||||
if STRICT:
|
|
||||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
|
||||||
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
|
||||||
pos += len('ID ')
|
|
||||||
self.fp.seek(pos)
|
|
||||||
# XXX how do we know the real length other than scanning?
|
|
||||||
data = ''
|
|
||||||
while 1:
|
|
||||||
data += self.fp.read(4096)
|
|
||||||
m = self.EOIPAT.search(data)
|
|
||||||
if m: break
|
|
||||||
objlen = m.start(0)
|
|
||||||
obj = PDFStream(dic, data[:objlen])
|
|
||||||
self.push(obj)
|
|
||||||
self.seek(pos+objlen+len('\nEI'))
|
|
||||||
self.push(KEYWORD_EI)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.push(token)
|
raise PSEOF
|
||||||
|
self.fp = StringIO(strm.get_data())
|
||||||
|
return
|
||||||
|
|
||||||
return False
|
def seek(self, pos):
|
||||||
|
self.fillfp()
|
||||||
|
PSStackParser.seek(self, pos)
|
||||||
|
return
|
||||||
|
|
||||||
|
def fillbuf(self):
|
||||||
|
if self.charpos < len(self.buf): return
|
||||||
|
while 1:
|
||||||
|
self.fillfp()
|
||||||
|
self.bufpos = self.fp.tell()
|
||||||
|
self.buf = self.fp.read(self.BUFSIZ)
|
||||||
|
if self.buf: break
|
||||||
|
self.fp = None
|
||||||
|
self.charpos = 0
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_inline_data(self, pos, target='EI '):
|
||||||
|
self.seek(pos)
|
||||||
|
i = 0
|
||||||
|
data = ''
|
||||||
|
while i < len(target):
|
||||||
|
self.fillbuf()
|
||||||
|
if i:
|
||||||
|
c = self.buf[self.charpos]
|
||||||
|
data += c
|
||||||
|
self.charpos += 1
|
||||||
|
if c == target[i]:
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
i = 0
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
j = self.buf.index(target[0], self.charpos)
|
||||||
|
#print 'found', (0, self.buf[j:j+10])
|
||||||
|
data += self.buf[self.charpos:j]
|
||||||
|
self.charpos = j+1
|
||||||
|
i = 1
|
||||||
|
except ValueError:
|
||||||
|
data += self.buf[self.charpos:]
|
||||||
|
self.charpos = len(self.buf)
|
||||||
|
data = data[:-len(target)] # strip the last part
|
||||||
|
return (pos, data)
|
||||||
|
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
return
|
||||||
|
|
||||||
|
def do_keyword(self, pos, token):
|
||||||
|
if token == KEYWORD_BI:
|
||||||
|
# inline image within a content stream
|
||||||
|
self.start_type(pos, 'inline')
|
||||||
|
elif token == KEYWORD_ID:
|
||||||
|
try:
|
||||||
|
(_, objs) = self.end_type('inline')
|
||||||
|
if len(objs) % 2 != 0:
|
||||||
|
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||||
|
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||||
|
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||||
|
obj = PDFStream(d, data)
|
||||||
|
self.push((pos, obj))
|
||||||
|
self.push((pos, KEYWORD_EI))
|
||||||
|
except PSTypeError:
|
||||||
|
if STRICT: raise
|
||||||
|
else:
|
||||||
|
self.push((pos, token))
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## Interpreter
|
## Interpreter
|
||||||
|
@ -542,10 +589,44 @@ class PDFPageInterpreter:
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
return
|
return
|
||||||
|
|
||||||
def initpage(self, ctm):
|
def init_resources(self, resources):
|
||||||
self.fontmap = {}
|
self.fontmap = {}
|
||||||
self.xobjmap = {}
|
self.xobjmap = {}
|
||||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||||
|
# Handle resource declarations.
|
||||||
|
def get_colorspace(spec):
|
||||||
|
if isinstance(spec, list):
|
||||||
|
name = literal_name(spec[0])
|
||||||
|
else:
|
||||||
|
name = literal_name(spec)
|
||||||
|
if name == 'ICCBased':
|
||||||
|
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||||
|
elif name == 'DeviceN':
|
||||||
|
return ColorSpace(name, len(list_value(spec[1])))
|
||||||
|
else:
|
||||||
|
return PREDEFINED_COLORSPACE[name]
|
||||||
|
if resources:
|
||||||
|
for (k,v) in dict_value(resources).iteritems():
|
||||||
|
if 1 <= self.debug:
|
||||||
|
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||||
|
if k == 'Font':
|
||||||
|
for (fontid,spec) in dict_value(v).iteritems():
|
||||||
|
objid = None
|
||||||
|
if isinstance(spec, PDFObjRef):
|
||||||
|
objid = spec.objid
|
||||||
|
spec = dict_value(spec)
|
||||||
|
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||||
|
elif k == 'ColorSpace':
|
||||||
|
for (csid,spec) in dict_value(v).iteritems():
|
||||||
|
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||||
|
elif k == 'ProcSet':
|
||||||
|
self.rsrc.get_procset(list_value(v))
|
||||||
|
elif k == 'XObject':
|
||||||
|
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||||
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
|
return
|
||||||
|
|
||||||
|
def init_state(self, ctm):
|
||||||
# gstack: stack for graphical states.
|
# gstack: stack for graphical states.
|
||||||
self.gstack = []
|
self.gstack = []
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
|
@ -554,8 +635,9 @@ class PDFPageInterpreter:
|
||||||
# argstack: stack for command arguments.
|
# argstack: stack for command arguments.
|
||||||
self.argstack = []
|
self.argstack = []
|
||||||
# set some global states.
|
# set some global states.
|
||||||
self.scs = None
|
self.scs = self.ncs = None
|
||||||
self.ncs = None
|
if self.csmap:
|
||||||
|
self.scs = self.ncs = self.csmap.values()[0]
|
||||||
return
|
return
|
||||||
|
|
||||||
def push(self, obj):
|
def push(self, obj):
|
||||||
|
@ -683,10 +765,22 @@ class PDFPageInterpreter:
|
||||||
|
|
||||||
# setcolor
|
# setcolor
|
||||||
def do_SCN(self):
|
def do_SCN(self):
|
||||||
self.pop(self.scs.ncomponents)
|
if self.scs:
|
||||||
|
n = self.scs.ncomponents
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFInterpreterError('no colorspace specified!')
|
||||||
|
n = 1
|
||||||
|
self.pop(n)
|
||||||
return
|
return
|
||||||
def do_scn(self):
|
def do_scn(self):
|
||||||
self.pop(self.ncs.ncomponents)
|
if self.ncs:
|
||||||
|
n = self.ncs.ncomponents
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFInterpreterError('no colorspace specified!')
|
||||||
|
n = 1
|
||||||
|
self.pop(n)
|
||||||
return
|
return
|
||||||
def do_SC(self):
|
def do_SC(self):
|
||||||
self.do_SCN()
|
self.do_SCN()
|
||||||
|
@ -839,8 +933,7 @@ class PDFPageInterpreter:
|
||||||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||||
bbox = (x0,y0,x1,y1)
|
bbox = (x0,y0,x1,y1)
|
||||||
self.device.begin_figure(xobjid, bbox)
|
self.device.begin_figure(xobjid, bbox)
|
||||||
interpreter.render_contents(xobj.dic.get('Resources'),
|
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
|
||||||
[xobj], ctm=ctm)
|
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -853,46 +946,18 @@ class PDFPageInterpreter:
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||||
self.initpage(ctm)
|
self.init_resources(resources)
|
||||||
# Handle resource declarations.
|
self.init_state(ctm)
|
||||||
def get_colorspace(spec):
|
self.execute(list_value(contents))
|
||||||
if isinstance(spec, list):
|
|
||||||
name = literal_name(spec[0])
|
|
||||||
else:
|
|
||||||
name = literal_name(spec)
|
|
||||||
if name == 'ICCBased':
|
|
||||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
|
||||||
elif name == 'DeviceN':
|
|
||||||
return ColorSpace(name, len(list_value(cs[1])))
|
|
||||||
else:
|
|
||||||
return PREDEFINED_COLORSPACE[name]
|
|
||||||
if resources:
|
|
||||||
for (k,v) in dict_value(resources).iteritems():
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
|
||||||
if k == 'Font':
|
|
||||||
for (fontid,spec) in dict_value(v).iteritems():
|
|
||||||
objid = None
|
|
||||||
if isinstance(spec, PDFObjRef):
|
|
||||||
objid = spec.objid
|
|
||||||
spec = dict_value(spec)
|
|
||||||
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
|
||||||
elif k == 'ColorSpace':
|
|
||||||
for (csid,spec) in dict_value(v).iteritems():
|
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
|
||||||
elif k == 'ProcSet':
|
|
||||||
self.rsrc.get_procset(list_value(v))
|
|
||||||
elif k == 'XObject':
|
|
||||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
|
||||||
data = ''.join( stream_value(stream).get_data()
|
|
||||||
for stream in list_value(contents) )
|
|
||||||
self.execute(data)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def execute(self, data):
|
def execute(self, streams):
|
||||||
parser = PDFContentParser(StringIO(data), debug=self.debug)
|
parser = PDFContentParser(streams, debug=self.debug)
|
||||||
for obj in parser.parse():
|
while 1:
|
||||||
|
try:
|
||||||
|
(_,obj) = parser.nextobject()
|
||||||
|
except PSEOF:
|
||||||
|
break
|
||||||
if isinstance(obj, PSKeyword):
|
if isinstance(obj, PSKeyword):
|
||||||
name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
|
name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
|
||||||
if hasattr(self, name):
|
if hasattr(self, name):
|
||||||
|
|
112
pdfparser.py
112
pdfparser.py
|
@ -14,14 +14,10 @@
|
||||||
# - Linearized PDF.
|
# - Linearized PDF.
|
||||||
# - Encryption?
|
# - Encryption?
|
||||||
|
|
||||||
import sys, re
|
import sys
|
||||||
try:
|
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
from StringIO import StringIO
|
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
literal_name, keyword_name, \
|
literal_name, keyword_name, \
|
||||||
PSStackParser, STRICT
|
PSStackParser, STRICT
|
||||||
|
@ -43,14 +39,19 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
|
||||||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||||
|
KEYWORD_R = PSKeywordTable.intern('R')
|
||||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||||
|
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||||
|
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||||
|
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||||
|
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
##
|
##
|
||||||
class PDFObjRef:
|
class PDFObjRef:
|
||||||
|
|
||||||
def __init__(self, doc, objid, genno):
|
def __init__(self, doc, objid, _):
|
||||||
if objid == 0:
|
if objid == 0:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('objid cannot be 0.')
|
raise PDFValueError('objid cannot be 0.')
|
||||||
|
@ -275,7 +276,8 @@ class PDFXRef:
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
self.offsets.append((int(genno), long(pos), use))
|
self.offsets.append((int(genno), long(pos), use))
|
||||||
# read trailer
|
# read trailer
|
||||||
self.trailer = dict_value(parser.parse()[0])
|
(_, dic) = parser.nextobject()
|
||||||
|
self.trailer = dict_value(dic)
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
|
@ -293,9 +295,13 @@ class PDFXRef:
|
||||||
class PDFXRefStream:
|
class PDFXRefStream:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
(objid, genno, _, stream) = list_value(parser.parse())
|
(_,objid) = parser.nextobject()
|
||||||
|
(_,genno) = parser.nextobject()
|
||||||
|
parser.nextobject()
|
||||||
|
(_,stream) = parser.nextobject()
|
||||||
if STRICT:
|
if STRICT:
|
||||||
assert stream.dic['Type'] == LITERAL_XREF
|
if stream.dic['Type'] != LITERAL_XREF:
|
||||||
|
raise PDFSyntaxError('invalid stream spec.')
|
||||||
size = stream.dic['Size']
|
size = stream.dic['Size']
|
||||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||||
self.objid0 = start
|
self.objid0 = start
|
||||||
|
@ -385,20 +391,24 @@ class PDFDocument:
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self.parsed_objs:
|
||||||
objs = self.parsed_objs[stream]
|
objs = self.parsed_objs[stream]
|
||||||
else:
|
else:
|
||||||
parser = PDFParser(self, StringIO(stream.get_data()),
|
parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
|
||||||
debug=self.debug)
|
objs = []
|
||||||
objs = list(parser.parse())
|
try:
|
||||||
|
while 1:
|
||||||
|
(_,obj) = parser.nextobject()
|
||||||
|
objs.append(obj)
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
self.parsed_objs[stream] = objs
|
self.parsed_objs[stream] = objs
|
||||||
obj = objs[stream.dic['N']*2+index]
|
obj = objs[stream.dic['N']*2+index]
|
||||||
else:
|
else:
|
||||||
prevpos = self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
seq = list_value(self.parser.parse())
|
(_,objid1) = self.parser.nextobject() # objid
|
||||||
if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
(_,genno1) = self.parser.nextobject() # genno
|
||||||
if STRICT:
|
(_,kwd) = self.parser.nextobject()
|
||||||
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
if kwd != KEYWORD_OBJ:
|
||||||
return None
|
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||||
obj = seq[3]
|
(_,obj) = self.parser.nextobject()
|
||||||
self.parser.seek(prevpos)
|
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
self.objs[objid] = obj
|
self.objs[objid] = obj
|
||||||
|
@ -446,29 +456,30 @@ class PDFParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFParser: linepos=%d>' % self.linepos
|
return '<PDFParser>'
|
||||||
|
|
||||||
EOIPAT = re.compile(r'\nEI\W')
|
def do_keyword(self, pos, token):
|
||||||
def do_token(self, pos, token):
|
if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
|
||||||
name = keyword_name(token)
|
self.add_results(*self.pop(1))
|
||||||
if name in ('xref', 'trailer', 'startxref', 'endobj'):
|
return
|
||||||
return True
|
if token == KEYWORD_ENDOBJ:
|
||||||
|
self.add_results(*self.pop(4))
|
||||||
|
return
|
||||||
|
|
||||||
if name == 'R':
|
if token == KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
(objid, genno) = self.pop(2)
|
((_,objid), (_,genno)) = self.pop(2)
|
||||||
(objid, genno) = (int(objid), int(genno))
|
(objid, genno) = (int(objid), int(genno))
|
||||||
obj = PDFObjRef(self.doc, objid, genno)
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
self.push(obj)
|
self.push((pos, obj))
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'refer obj: %r' % obj
|
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
elif name == 'stream':
|
if token == KEYWORD_STREAM:
|
||||||
# stream object
|
# stream object
|
||||||
(dic,) = self.pop(1)
|
((_,dic),) = self.pop(1)
|
||||||
dic = dict_value(dic)
|
dic = dict_value(dic)
|
||||||
try:
|
try:
|
||||||
objlen = int_value(dic['Length'])
|
objlen = int_value(dic['Length'])
|
||||||
|
@ -484,20 +495,19 @@ class PDFParser(PSStackParser):
|
||||||
self.seek(pos+objlen)
|
self.seek(pos+objlen)
|
||||||
while 1:
|
while 1:
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not line or line.startswith('endstream'):
|
if line.startswith('endstream'): break
|
||||||
break
|
|
||||||
objlen += len(line)
|
objlen += len(line)
|
||||||
data += line
|
data += line
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||||
(pos, objlen, dic, data[:10])
|
(pos, objlen, dic, data[:10])
|
||||||
obj = PDFStream(dic, data, self.doc.decipher)
|
obj = PDFStream(dic, data, self.doc.decipher)
|
||||||
self.push(obj)
|
self.push((pos, obj))
|
||||||
|
return
|
||||||
|
|
||||||
else:
|
# others
|
||||||
self.push(token)
|
self.push((pos, token))
|
||||||
|
return
|
||||||
return False
|
|
||||||
|
|
||||||
def find_xref(self):
|
def find_xref(self):
|
||||||
# find the first xref table
|
# find the first xref table
|
||||||
|
@ -505,7 +515,7 @@ class PDFParser(PSStackParser):
|
||||||
for line in self.revreadlines():
|
for line in self.revreadlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'line: %r' % line
|
print >>stderr, 'find_xref: %r' % line
|
||||||
if line == 'startxref': break
|
if line == 'startxref': break
|
||||||
if line:
|
if line:
|
||||||
prev = line
|
prev = line
|
||||||
|
@ -525,10 +535,11 @@ class PDFParser(PSStackParser):
|
||||||
# read xref table
|
# read xref table
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'line: %r' % line
|
print >>stderr, 'read_xref: %r' % line
|
||||||
if line[0].isdigit():
|
if line[0].isdigit():
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(linepos)
|
self.seek(linepos)
|
||||||
|
self.reset()
|
||||||
xref = PDFXRefStream(self)
|
xref = PDFXRefStream(self)
|
||||||
else:
|
else:
|
||||||
if line.strip() != 'xref':
|
if line.strip() != 'xref':
|
||||||
|
@ -551,3 +562,18 @@ class PDFParser(PSStackParser):
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return
|
return
|
||||||
|
|
||||||
|
## PDFObjStrmParser
|
||||||
|
##
|
||||||
|
class PDFObjStrmParser(PDFParser):
|
||||||
|
def __init__(self, doc, data, debug=0):
|
||||||
|
try:
|
||||||
|
from cStringIO import StringIO
|
||||||
|
except ImportError:
|
||||||
|
from StringIO import StringIO
|
||||||
|
PDFParser.__init__(self, doc, StringIO(data), debug=debug)
|
||||||
|
return
|
||||||
|
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
return
|
||||||
|
|
712
psparser.py
712
psparser.py
|
@ -3,12 +3,13 @@ import sys, re
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
|
|
||||||
STRICT = 0
|
STRICT = 1
|
||||||
|
|
||||||
|
|
||||||
## PS Exceptions
|
## PS Exceptions
|
||||||
##
|
##
|
||||||
class PSException(Exception): pass
|
class PSException(Exception): pass
|
||||||
|
class PSEOF(PSException): pass
|
||||||
class PSSyntaxError(PSException): pass
|
class PSSyntaxError(PSException): pass
|
||||||
class PSTypeError(PSException): pass
|
class PSTypeError(PSException): pass
|
||||||
class PSValueError(PSException): pass
|
class PSValueError(PSException): pass
|
||||||
|
@ -71,6 +72,14 @@ class PSSymbolTable:
|
||||||
|
|
||||||
PSLiteralTable = PSSymbolTable(PSLiteral)
|
PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||||
|
LIT = PSLiteralTable.intern
|
||||||
|
KWD = PSKeywordTable.intern
|
||||||
|
KEYWORD_BRACE_BEGIN = KWD('{')
|
||||||
|
KEYWORD_BRACE_END = KWD('}')
|
||||||
|
KEYWORD_ARRAY_BEGIN = KWD('[')
|
||||||
|
KEYWORD_ARRAY_END = KWD(']')
|
||||||
|
KEYWORD_DICT_BEGIN = KWD('<<')
|
||||||
|
KEYWORD_DICT_END = KWD('>>')
|
||||||
|
|
||||||
|
|
||||||
def literal_name(x):
|
def literal_name(x):
|
||||||
|
@ -92,72 +101,288 @@ def keyword_name(x):
|
||||||
|
|
||||||
## PSBaseParser
|
## PSBaseParser
|
||||||
##
|
##
|
||||||
|
EOL = re.compile(r'[\r\n]')
|
||||||
|
SPC = re.compile(r'\s')
|
||||||
|
NONSPC = re.compile(r'\S')
|
||||||
|
HEX = re.compile(r'[0-9a-fA-F]')
|
||||||
|
END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||||
|
END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]')
|
||||||
|
HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
|
||||||
|
END_NUMBER = re.compile(r'[^0-9]')
|
||||||
|
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||||
|
END_STRING = re.compile(r'[()\134]')
|
||||||
|
OCT_STRING = re.compile(r'[0-7]')
|
||||||
|
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
||||||
class PSBaseParser:
|
class PSBaseParser:
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Most basic PostScript parser that performs only basic tokenization.
|
Most basic PostScript parser that performs only basic tokenization.
|
||||||
'''
|
'''
|
||||||
|
BUFSIZ = 4096
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp, debug=0):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
self.bufsize = 4096
|
|
||||||
self.strfilter = None
|
|
||||||
self.seek(0)
|
self.seek(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PSBaseParser: %r>' % (self.fp,)
|
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
|
||||||
|
|
||||||
|
def tell(self):
|
||||||
|
return self.fp.tell()
|
||||||
|
|
||||||
|
def poll(self, pos=None, n=80):
|
||||||
|
pos0 = self.fp.tell()
|
||||||
|
if not pos:
|
||||||
|
pos = self.bufpos+self.charpos
|
||||||
|
self.fp.seek(pos)
|
||||||
|
print >>stderr, 'poll(%d): %r' % (pos, self.fp.read(n))
|
||||||
|
self.fp.seek(pos0)
|
||||||
|
return
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos):
|
||||||
'''
|
'''
|
||||||
Seeks the parser to the given position.
|
Seeks the parser to the given position.
|
||||||
'''
|
'''
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'seek:', pos
|
print >>stderr, 'seek: %r' % pos
|
||||||
prevpos = self.fp.tell()
|
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
self.linebuf = None # line buffer.
|
# reset the status for nextline()
|
||||||
self.curpos = 0 # current position in the buffer.
|
self.bufpos = pos
|
||||||
self.linepos = pos # the beginning of the current line.
|
self.buf = ''
|
||||||
self.go = False
|
self.charpos = 0
|
||||||
return prevpos
|
# reset the status for nexttoken()
|
||||||
|
self.parse1 = self.parse_main
|
||||||
|
self.tokens = []
|
||||||
|
return
|
||||||
|
|
||||||
|
def fillbuf(self):
|
||||||
|
if self.charpos < len(self.buf): return
|
||||||
|
# fetch next chunk.
|
||||||
|
self.bufpos = self.fp.tell()
|
||||||
|
self.buf = self.fp.read(self.BUFSIZ)
|
||||||
|
if not self.buf:
|
||||||
|
raise PSEOF
|
||||||
|
self.charpos = 0
|
||||||
|
return
|
||||||
|
|
||||||
|
def parse_main(self, s, i):
|
||||||
|
m = NONSPC.search(s, i)
|
||||||
|
if not m:
|
||||||
|
return (self.parse_main, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
c = s[j]
|
||||||
|
self.tokenstart = self.bufpos+j
|
||||||
|
if c == '%':
|
||||||
|
self.token = '%'
|
||||||
|
return (self.parse_comment, j+1)
|
||||||
|
if c == '/':
|
||||||
|
self.token = ''
|
||||||
|
return (self.parse_literal, j+1)
|
||||||
|
if c in '-+' or c.isdigit():
|
||||||
|
self.token = c
|
||||||
|
return (self.parse_number, j+1)
|
||||||
|
if c == '.':
|
||||||
|
self.token = c
|
||||||
|
return (self.parse_float, j+1)
|
||||||
|
if c.isalpha():
|
||||||
|
self.token = c
|
||||||
|
return (self.parse_keyword, j+1)
|
||||||
|
if c == '(':
|
||||||
|
self.token = ''
|
||||||
|
self.paren = 1
|
||||||
|
return (self.parse_string, j+1)
|
||||||
|
if c == '<':
|
||||||
|
self.token = ''
|
||||||
|
return (self.parse_wopen, j+1)
|
||||||
|
if c == '>':
|
||||||
|
self.token = ''
|
||||||
|
return (self.parse_wclose, j+1)
|
||||||
|
self.add_token(KWD(c))
|
||||||
|
return (self.parse_main, j+1)
|
||||||
|
|
||||||
|
def add_token(self, obj):
|
||||||
|
self.tokens.append((self.tokenstart, obj))
|
||||||
|
return
|
||||||
|
|
||||||
|
def parse_comment(self, s, i):
|
||||||
|
m = EOL.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_comment, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
# We ignore comments.
|
||||||
|
#self.tokens.append(self.token)
|
||||||
|
return (self.parse_main, j)
|
||||||
|
|
||||||
|
def parse_literal(self, s, i):
|
||||||
|
m = END_LITERAL.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_literal, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '#':
|
||||||
|
self.hex = ''
|
||||||
|
return (self.parse_literal_hex, j+1)
|
||||||
|
self.add_token(LIT(self.token))
|
||||||
|
return (self.parse_main, j)
|
||||||
|
|
||||||
|
def parse_literal_hex(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if HEX.match(c) and len(self.hex) < 2:
|
||||||
|
self.hex += c
|
||||||
|
return (self.parse_literal_hex, i+1)
|
||||||
|
if self.hex:
|
||||||
|
self.token += chr(int(self.hex, 16))
|
||||||
|
return (self.parse_literal, i)
|
||||||
|
|
||||||
|
def parse_number(self, s, i):
|
||||||
|
m = END_NUMBER.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_number, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '.':
|
||||||
|
self.token += c
|
||||||
|
return (self.parse_float, j+1)
|
||||||
|
try:
|
||||||
|
self.add_token(int(self.token))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return (self.parse_main, j)
|
||||||
|
def parse_float(self, s, i):
|
||||||
|
m = END_NUMBER.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_float, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
self.add_token(float(self.token))
|
||||||
|
return (self.parse_main, j)
|
||||||
|
|
||||||
|
def parse_keyword(self, s, i):
|
||||||
|
m = END_KEYWORD.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_keyword, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
if self.token == 'true':
|
||||||
|
token = True
|
||||||
|
elif self.token == 'false':
|
||||||
|
token = False
|
||||||
|
else:
|
||||||
|
token = KWD(self.token)
|
||||||
|
self.add_token(token)
|
||||||
|
return (self.parse_main, j)
|
||||||
|
|
||||||
|
def parse_string(self, s, i):
|
||||||
|
m = END_STRING.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_string, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '\\':
|
||||||
|
self.oct = ''
|
||||||
|
return (self.parse_string_1, j+1)
|
||||||
|
if c == '(':
|
||||||
|
self.paren += 1
|
||||||
|
self.token += c
|
||||||
|
return (self.parse_string, j+1)
|
||||||
|
if c == ')':
|
||||||
|
self.paren -= 1
|
||||||
|
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||||
|
self.token += c
|
||||||
|
return (self.parse_string, j+1)
|
||||||
|
self.add_token(self.token)
|
||||||
|
return (self.parse_main, j+1)
|
||||||
|
def parse_string_1(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||||
|
self.oct += c
|
||||||
|
return (self.parse_string_1, i+1)
|
||||||
|
if self.oct:
|
||||||
|
self.token += chr(int(self.oct, 8))
|
||||||
|
return (self.parse_string, i)
|
||||||
|
if c in ESC_STRING:
|
||||||
|
self.token += chr(ESC_STRING[c])
|
||||||
|
return (self.parse_string, i+1)
|
||||||
|
|
||||||
|
def parse_wopen(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if c.isspace() or HEX.match(c):
|
||||||
|
return (self.parse_hexstring, i)
|
||||||
|
if c == '<':
|
||||||
|
self.add_token(KEYWORD_DICT_BEGIN)
|
||||||
|
i += 1
|
||||||
|
return (self.parse_main, i)
|
||||||
|
|
||||||
|
def parse_wclose(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if c == '>':
|
||||||
|
self.add_token(KEYWORD_DICT_END)
|
||||||
|
i += 1
|
||||||
|
return (self.parse_main, i)
|
||||||
|
|
||||||
|
def parse_hexstring(self, s, i):
|
||||||
|
m = END_HEX_STRING.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self.token += s[i:]
|
||||||
|
return (self.parse_hexstring, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self.token += s[i:j]
|
||||||
|
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
|
||||||
|
SPC.sub('', self.token))
|
||||||
|
self.add_token(token)
|
||||||
|
return (self.parse_main, j)
|
||||||
|
|
||||||
|
def nexttoken(self):
|
||||||
|
while not self.tokens:
|
||||||
|
self.fillbuf()
|
||||||
|
(self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
|
||||||
|
token = self.tokens.pop(0)
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'nexttoken: %r' % (token,)
|
||||||
|
return token
|
||||||
|
|
||||||
EOLCHAR = re.compile(r'[\r\n]')
|
|
||||||
def nextline(self):
|
def nextline(self):
|
||||||
'''
|
'''
|
||||||
Fetches a next line that ends either with \\r or \\n.
|
Fetches a next line that ends either with \\r or \\n.
|
||||||
'''
|
'''
|
||||||
line = ''
|
linebuf = ''
|
||||||
eol = None
|
linepos = self.bufpos + self.charpos
|
||||||
|
eol = False
|
||||||
while 1:
|
while 1:
|
||||||
if not self.linebuf or len(self.linebuf) <= self.curpos:
|
self.fillbuf()
|
||||||
# fetch next chunk.
|
|
||||||
self.linebuf = self.fp.read(self.bufsize)
|
|
||||||
if not self.linebuf:
|
|
||||||
# at EOF.
|
|
||||||
break
|
|
||||||
self.curpos = 0
|
|
||||||
if eol:
|
if eol:
|
||||||
c = self.linebuf[self.curpos]
|
c = self.buf[self.charpos]
|
||||||
# handle '\r\n'
|
# handle '\r\n'
|
||||||
if (eol == '\r' and c == '\n'):
|
if c == '\n':
|
||||||
line += c
|
linebuf += c
|
||||||
self.curpos += 1
|
self.charpos += 1
|
||||||
break
|
break
|
||||||
m = self.EOLCHAR.search(self.linebuf, self.curpos)
|
m = EOL.search(self.buf, self.charpos)
|
||||||
if m:
|
if m:
|
||||||
i = m.end(0)
|
linebuf += self.buf[self.charpos:m.end(0)]
|
||||||
line += self.linebuf[self.curpos:i]
|
self.charpos = m.end(0)
|
||||||
eol = self.linebuf[i-1]
|
if linebuf[-1] == '\r':
|
||||||
self.curpos = i
|
eol = True
|
||||||
else:
|
else:
|
||||||
# fetch further
|
break
|
||||||
line += self.linebuf[self.curpos:]
|
else:
|
||||||
self.linebuf = None
|
linebuf += self.buf[self.charpos:]
|
||||||
linepos = self.linepos
|
self.charpos = len(self.buf)
|
||||||
self.linepos += len(line)
|
if 2 <= self.debug:
|
||||||
return (linepos, line)
|
print >>stderr, 'nextline: %r' % ((linepos, linebuf),)
|
||||||
|
return (linepos, linebuf)
|
||||||
|
|
||||||
def revreadlines(self):
|
def revreadlines(self):
|
||||||
'''
|
'''
|
||||||
|
@ -168,9 +393,9 @@ class PSBaseParser:
|
||||||
pos = self.fp.tell()
|
pos = self.fp.tell()
|
||||||
buf = ''
|
buf = ''
|
||||||
while 0 < pos:
|
while 0 < pos:
|
||||||
pos = max(0, pos-self.bufsize)
|
pos = max(0, pos-self.BUFSIZ)
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
s = self.fp.read(self.bufsize)
|
s = self.fp.read(self.BUFSIZ)
|
||||||
if not s: break
|
if not s: break
|
||||||
while 1:
|
while 1:
|
||||||
n = max(s.rfind('\r'), s.rfind('\n'))
|
n = max(s.rfind('\r'), s.rfind('\n'))
|
||||||
|
@ -182,263 +407,202 @@ class PSBaseParser:
|
||||||
buf = ''
|
buf = ''
|
||||||
return
|
return
|
||||||
|
|
||||||
# regex patterns for basic lexical scanning.
|
|
||||||
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
|
|
||||||
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
|
|
||||||
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
|
|
||||||
NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
|
|
||||||
STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
|
|
||||||
STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
|
|
||||||
STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
|
|
||||||
STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
|
|
||||||
|
|
||||||
def parse(self):
|
|
||||||
'''
|
|
||||||
Yields a list of tuples (pos, token) of the following:
|
|
||||||
keywords, literals, strings, numbers and parentheses.
|
|
||||||
Comments are skipped.
|
|
||||||
Nested objects (i.e. arrays and dictionaries) are not handled here.
|
|
||||||
'''
|
|
||||||
while 1:
|
|
||||||
# do not strip line! we need to distinguish last '\n' or '\r'
|
|
||||||
(linepos, line) = self.nextline()
|
|
||||||
if not line: break
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'line: (%d) %r' % (linepos, line)
|
|
||||||
# do this before removing comment
|
|
||||||
if line.startswith('%%EOF'): break
|
|
||||||
charpos = 0
|
|
||||||
|
|
||||||
# tokenize
|
|
||||||
self.go = True
|
|
||||||
while self.go:
|
|
||||||
m = self.TOKEN.search(line, charpos)
|
|
||||||
if not m: break
|
|
||||||
t = m.group(0)
|
|
||||||
pos = linepos + m.start(0)
|
|
||||||
charpos = m.end(0)
|
|
||||||
|
|
||||||
if t == '%':
|
|
||||||
# skip comment
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'comment: %r' % line[charpos:]
|
|
||||||
break
|
|
||||||
|
|
||||||
elif t == '/':
|
|
||||||
# literal object
|
|
||||||
mn = self.LITERAL.match(line, m.start(0)+1)
|
|
||||||
lit = PSLiteralTable.intern(mn.group(0))
|
|
||||||
yield (pos, lit)
|
|
||||||
charpos = mn.end(0)
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'name: %r' % lit
|
|
||||||
|
|
||||||
elif t == '(':
|
|
||||||
# normal string object
|
|
||||||
s = ''
|
|
||||||
while 1:
|
|
||||||
ms = self.STRING_NORM.match(line, charpos)
|
|
||||||
if not ms: break
|
|
||||||
s1 = ms.group(0)
|
|
||||||
charpos = ms.end(0)
|
|
||||||
if len(s1) == 1 and s1[-1] == '\\':
|
|
||||||
s += s1[-1:]
|
|
||||||
(linepos, line) = self.nextline()
|
|
||||||
if not line:
|
|
||||||
if STRICT:
|
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
break
|
|
||||||
charpos = 0
|
|
||||||
elif charpos == len(line):
|
|
||||||
s += s1
|
|
||||||
(linepos, line) = self.nextline()
|
|
||||||
if not line:
|
|
||||||
if STRICT:
|
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
break
|
|
||||||
charpos = 0
|
|
||||||
else:
|
|
||||||
s += s1
|
|
||||||
break
|
|
||||||
if line[charpos] == ')':
|
|
||||||
charpos += 1
|
|
||||||
else:
|
|
||||||
if STRICT:
|
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
pass
|
|
||||||
def convesc(m):
|
|
||||||
x = m.group(0)
|
|
||||||
if x[1:].isdigit():
|
|
||||||
return chr(int(x[1:], 8))
|
|
||||||
else:
|
|
||||||
return x[1]
|
|
||||||
s = self.STRING_NORM_SUB.sub(convesc, s)
|
|
||||||
if self.strfilter:
|
|
||||||
s = self.strfilter(s)
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'str: %r' % s
|
|
||||||
yield (pos, s)
|
|
||||||
|
|
||||||
elif t == '<':
|
|
||||||
# hex string object
|
|
||||||
ms = self.STRING_HEX.match(line, charpos)
|
|
||||||
charpos = ms.end(0)
|
|
||||||
if line[charpos] == '>':
|
|
||||||
charpos += 1
|
|
||||||
else:
|
|
||||||
if STRICT:
|
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
def convhex(m1):
|
|
||||||
return chr(int(m1.group(0), 16))
|
|
||||||
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'str: %r' % s
|
|
||||||
yield (pos, s)
|
|
||||||
|
|
||||||
elif self.NUMBER.match(t):
|
|
||||||
# number
|
|
||||||
if '.' in t:
|
|
||||||
n = float(t)
|
|
||||||
else:
|
|
||||||
n = int(t)
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'number: %r' % n
|
|
||||||
yield (pos, n)
|
|
||||||
|
|
||||||
elif t in ('true', 'false'):
|
|
||||||
# boolean
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'boolean: %r' % t
|
|
||||||
yield (pos, (t == 'true'))
|
|
||||||
|
|
||||||
else:
|
|
||||||
# other token
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'keyword: %r' % t
|
|
||||||
yield (pos, PSKeywordTable.intern(t))
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## PSStackParser
|
## PSStackParser
|
||||||
##
|
##
|
||||||
class PSStackParser(PSBaseParser):
|
class PSStackParser(PSBaseParser):
|
||||||
|
|
||||||
'''
|
|
||||||
PostScript parser that recognizes compound objects
|
|
||||||
such as arrays and dictionaries.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp, debug=0):
|
||||||
PSBaseParser.__init__(self, fp, debug=debug)
|
PSBaseParser.__init__(self,fp, debug=debug)
|
||||||
|
self.reset()
|
||||||
|
return
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
self.context = []
|
self.context = []
|
||||||
self.partobj = None
|
self.curtype = None
|
||||||
|
self.curstack = []
|
||||||
|
self.results = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_token(self, pos, token):
|
def push(self, *objs):
|
||||||
'''
|
self.curstack.extend(objs)
|
||||||
Handles special tokens.
|
|
||||||
Returns true if the token denotes the end of an object.
|
|
||||||
'''
|
|
||||||
return False
|
|
||||||
|
|
||||||
def push(self, obj):
|
|
||||||
'''
|
|
||||||
Push an object to the stack.
|
|
||||||
'''
|
|
||||||
self.partobj.append(obj)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def pop(self, n):
|
def pop(self, n):
|
||||||
'''
|
objs = self.curstack[-n:]
|
||||||
Pop N objects from the stack.
|
self.curstack[-n:] = []
|
||||||
'''
|
return objs
|
||||||
if len(self.partobj) < n:
|
|
||||||
if STRICT:
|
|
||||||
raise PSSyntaxError('stack too short < %d' % n)
|
|
||||||
r = self.partobj[-n:]
|
|
||||||
self.partobj = self.partobj[:-n]
|
|
||||||
return r
|
|
||||||
|
|
||||||
def popall(self):
|
def popall(self):
|
||||||
'''
|
objs = self.curstack
|
||||||
Discards all the objects on the stack.
|
self.curstack = []
|
||||||
'''
|
return objs
|
||||||
self.partobj = []
|
def add_results(self, *objs):
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'add_results: %r' % (objs,)
|
||||||
|
self.results.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
def parse(self):
|
def start_type(self, pos, type):
|
||||||
|
self.context.append((pos, self.curtype, self.curstack))
|
||||||
|
(self.curtype, self.curstack) = (type, [])
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'start_type: pos=%r, type=%r' % (pos, type)
|
||||||
|
return
|
||||||
|
def end_type(self, type):
|
||||||
|
if self.curtype != type:
|
||||||
|
raise PSTypeError('type mismatch: %r != %r' % (self.curtype, type))
|
||||||
|
objs = [ obj for (_,obj) in self.curstack ]
|
||||||
|
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
|
||||||
|
return (pos, objs)
|
||||||
|
|
||||||
|
def do_keyword(self, pos, token):
|
||||||
|
return
|
||||||
|
def flush(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def nextobject(self):
|
||||||
'''
|
'''
|
||||||
Yields a list of objects: keywords, literals, strings,
|
Yields a list of objects: keywords, literals, strings,
|
||||||
numbers, arrays and dictionaries. Arrays and dictionaries
|
numbers, arrays and dictionaries. Arrays and dictionaries
|
||||||
are represented as Python sequence and dictionaries.
|
are represented as Python sequence and dictionaries.
|
||||||
'''
|
'''
|
||||||
|
while not self.results:
|
||||||
def startobj(type):
|
(pos, token) = self.nexttoken()
|
||||||
self.context.append((type, self.partobj))
|
#print (pos,token), (self.curtype, self.curstack)
|
||||||
self.partobj = []
|
if (isinstance(token, int) or
|
||||||
return
|
isinstance(token, float) or
|
||||||
|
isinstance(token, bool) or
|
||||||
def endobj(type1):
|
isinstance(token, str) or
|
||||||
if not self.context:
|
isinstance(token, PSLiteral)):
|
||||||
if STRICT:
|
# normal token
|
||||||
raise PSTypeError('stack empty.')
|
self.push((pos, token))
|
||||||
obj = self.partobj
|
elif token == KEYWORD_ARRAY_BEGIN:
|
||||||
(type0, partobj) = self.context[-1]
|
# begin array
|
||||||
if type0 == type1:
|
self.start_type(pos, 'a')
|
||||||
self.partobj = partobj
|
elif token == KEYWORD_ARRAY_END:
|
||||||
self.context.pop()
|
# end array
|
||||||
|
try:
|
||||||
|
self.push(self.end_type('a'))
|
||||||
|
except PSTypeError:
|
||||||
|
if STRICT: raise
|
||||||
|
elif token == KEYWORD_DICT_BEGIN:
|
||||||
|
# begin dictionary
|
||||||
|
self.start_type(pos, 'd')
|
||||||
|
elif token == KEYWORD_DICT_END:
|
||||||
|
# end dictionary
|
||||||
|
try:
|
||||||
|
(pos, objs) = self.end_type('d')
|
||||||
|
if len(objs) % 2 != 0:
|
||||||
|
raise PSSyntaxError('invalid dictionary construct: %r' % objs)
|
||||||
|
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs))
|
||||||
|
self.push((pos, d))
|
||||||
|
except PSTypeError:
|
||||||
|
if STRICT: raise
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if 2 <= self.debug:
|
||||||
raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
|
print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||||
(type0, self.partobj, type1, obj))
|
(pos, token, self.curstack)
|
||||||
|
self.do_keyword(pos, token)
|
||||||
|
if self.context:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
self.flush()
|
||||||
|
obj = self.results.pop(0)
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'nextobject: %r' % (obj,)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
startobj('o')
|
|
||||||
|
|
||||||
for (pos,t) in PSBaseParser.parse(self):
|
## Simplistic Test cases
|
||||||
if isinstance(t, int) or isinstance(t, float):
|
##
|
||||||
self.push(t)
|
import unittest
|
||||||
elif isinstance(t, str):
|
class TestPSBaseParser(unittest.TestCase):
|
||||||
self.push(t)
|
|
||||||
elif isinstance(t, PSLiteral):
|
|
||||||
self.push(t)
|
|
||||||
else:
|
|
||||||
c = keyword_name(t)
|
|
||||||
if c == '{' or c == '}':
|
|
||||||
self.push(t)
|
|
||||||
elif c == '[':
|
|
||||||
# begin array
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'start array'
|
|
||||||
startobj('a')
|
|
||||||
elif c == ']':
|
|
||||||
# end array
|
|
||||||
a = endobj('a')
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'end array: %r' % a
|
|
||||||
self.push(a)
|
|
||||||
elif c == '<<':
|
|
||||||
# begin dictionary
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'start dict'
|
|
||||||
startobj('d')
|
|
||||||
elif c == '>>':
|
|
||||||
# end dictionary
|
|
||||||
objs = endobj('d')
|
|
||||||
if len(objs) % 2 != 0:
|
|
||||||
if STRICT:
|
|
||||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
|
||||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'end dict: %r' % d
|
|
||||||
self.push(d)
|
|
||||||
elif self.do_token(pos, t):
|
|
||||||
break
|
|
||||||
|
|
||||||
objs = endobj('o')
|
TESTDATA = r'''%!PS
|
||||||
return objs
|
begin end
|
||||||
|
" @ #
|
||||||
|
/a/BCD /Some_Name /foo#5f#xbaa
|
||||||
|
0 +1 -2 .5 1.234
|
||||||
|
(abc) () (abc ( def ) ghi)
|
||||||
|
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
|
||||||
|
(this % is not a comment.)
|
||||||
|
(foo
|
||||||
|
baa)
|
||||||
|
(foo\
|
||||||
|
baa)
|
||||||
|
<20> < 40 4020 >
|
||||||
|
<abcd00
|
||||||
|
12345>
|
||||||
|
func/a/b{(c)do*}def
|
||||||
|
[ 1 (z) ! ]
|
||||||
|
<< /foo (bar) >>
|
||||||
|
'''
|
||||||
|
|
||||||
|
TOKENS = [
|
||||||
|
(5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')),
|
||||||
|
(21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||||
|
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||||
|
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
||||||
|
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
||||||
|
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
||||||
|
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
|
||||||
|
(223, KWD('func')), (227, LIT('a')), (229, LIT('b')),
|
||||||
|
(231, KWD('{')), (232, 'c'), (235, KWD('do*')), (238, KWD('}')),
|
||||||
|
(239, KWD('def')), (243, KWD('[')), (245, 1), (247, 'z'), (251, KWD('!')),
|
||||||
|
(253, KWD(']')), (255, KWD('<<')), (258, LIT('foo')), (263, 'bar'),
|
||||||
|
(269, KWD('>>'))
|
||||||
|
]
|
||||||
|
|
||||||
|
OBJS = [
|
||||||
|
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||||
|
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||||
|
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
||||||
|
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
||||||
|
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
||||||
|
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
|
||||||
|
(227, LIT('a')), (229, LIT('b')), (232, 'c'), (243, [1, 'z']),
|
||||||
|
(255, {'foo': 'bar'}),
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_tokens(self, s):
|
||||||
|
import StringIO
|
||||||
|
class MyParser(PSBaseParser):
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
parser = MyParser(StringIO.StringIO(s), debug=1)
|
||||||
|
r = []
|
||||||
|
try:
|
||||||
|
while 1:
|
||||||
|
r.append(parser.nexttoken())
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
|
return r
|
||||||
|
|
||||||
|
def get_objects(self, s):
|
||||||
|
import StringIO
|
||||||
|
class MyParser(PSStackParser):
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
parser = MyParser(StringIO.StringIO(s), debug=1)
|
||||||
|
r = []
|
||||||
|
try:
|
||||||
|
while 1:
|
||||||
|
r.append(parser.nextobject())
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
|
return r
|
||||||
|
|
||||||
|
def test_1(self):
|
||||||
|
tokens = self.get_tokens(self.TESTDATA)
|
||||||
|
print tokens
|
||||||
|
self.assertEqual(tokens, self.TOKENS)
|
||||||
|
return
|
||||||
|
|
||||||
|
def test_2(self):
|
||||||
|
objs = self.get_objects(self.TESTDATA)
|
||||||
|
print objs
|
||||||
|
self.assertEqual(objs, self.OBJS)
|
||||||
|
return
|
||||||
|
|
||||||
|
if __name__ == '__main__': unittest.main()
|
||||||
|
|
Loading…
Reference in New Issue