Restructuring core lexical handling.

Fix several bugs.


git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@17 1aa58f4a-7d42-0410-adbc-911cccaed67c
yusuke.shinyama.dummy 2008-02-03 09:36:34 +00:00
parent 94859ea428
commit 366143361f
5 changed files with 712 additions and 454 deletions
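
The central change visible below is that the parsers no longer expose a generator-style parse(); instead PSBaseParser.nexttoken() and PSStackParser.nextobject() hand back one (pos, obj) pair per call and raise PSEOF when the input is exhausted. A minimal sketch of the new calling pattern, following the subclassing convention used by the unit tests added at the end of psparser.py (MyParser and parse_all are illustrative names, not part of this commit):

    from StringIO import StringIO
    from psparser import PSStackParser, PSEOF

    class MyParser(PSStackParser):
        # The base class leaves flush() empty; a subclass decides when the
        # objects collected on the stack become results of nextobject().
        def flush(self):
            self.add_results(*self.popall())

    def parse_all(data):
        parser = MyParser(StringIO(data))
        objs = []
        while 1:
            try:
                objs.append(parser.nextobject())   # (pos, obj) pairs
            except PSEOF:
                break
        return objs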

cmap.py

@ -3,7 +3,7 @@ import sys
stderr = sys.stderr stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
from utils import choplist, nunpack from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \ PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser PSStackParser
try: try:
@ -39,16 +39,16 @@ class CMap:
return self return self
def register_code2cid(self, code, cid): def register_code2cid(self, code, cid):
assert isinstance(code, str) if isinstance(code, str) and isinstance(cid, int):
assert isinstance(cid, int)
self.code2cid[code] = cid self.code2cid[code] = cid
return self return self
def register_cid2code(self, cid, code): def register_cid2code(self, cid, code):
from glyphlist import charname2unicode from glyphlist import charname2unicode
assert isinstance(cid, int) if isinstance(cid, int):
if isinstance(code, PSLiteral): if isinstance(code, PSLiteral):
code = pack('>H', charname2unicode[code.name]) self.cid2code[cid] = pack('>H', charname2unicode[code.name])
elif isinstance(code, str):
self.cid2code[cid] = code self.cid2code[cid] = code
return self return self
@ -195,7 +195,7 @@ class CMapDB:
print >>stderr, 'Reading: CMap %r...' % fname print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap() cmap = CMap()
fp = file(fname) fp = file(fname)
CMapParser(cmap, fp).parse() CMapParser(cmap, fp, debug=klass.debug).run()
fp.close() fp.close()
else: else:
raise KeyError(cmapname) raise KeyError(cmapname)
@ -213,7 +213,14 @@ class CMapParser(PSStackParser):
self.in_cmap = False self.in_cmap = False
return return
def do_token(self, _, token): def run(self):
try:
self.nextobject()
except PSEOF:
pass
return
def do_keyword(self, pos, token):
name = token.name name = token.name
if name == 'begincmap': if name == 'begincmap':
self.in_cmap = True self.in_cmap = True
@ -226,15 +233,15 @@ class CMapParser(PSStackParser):
# #
if name == 'def': if name == 'def':
try: try:
(k,v) = self.pop(2) ((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v self.cmap.attrs[str(k)] = v
except PSSyntaxError: except PSSyntaxError:
pass pass
return return
if name == 'usecmap': if name == 'usecmap':
try: try:
(cmapname,) = self.pop(1) ((_,cmapname),) = self.pop(1)
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError: except PSSyntaxError:
pass pass
@ -244,8 +251,6 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endcodespacerange': if name == 'endcodespacerange':
if 1 <= self.debug:
print >>stderr, 'codespace: %r' % self.partobj
self.popall() self.popall()
return return
@ -253,48 +258,45 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endcidrange': if name == 'endcidrange':
for (s,e,cid) in choplist(3, self.partobj): objs = [ obj for (_,obj) in self.popall() ]
assert isinstance(s, str) for (s,e,cid) in choplist(3, objs):
assert isinstance(e, str) if (not isinstance(s, str) or not isinstance(e, str) or
assert isinstance(cid, int) not isinstance(cid, int) or len(s) != len(e)): continue
assert len(s) == len(e)
sprefix = s[:-4] sprefix = s[:-4]
eprefix = e[:-4] eprefix = e[:-4]
assert sprefix == eprefix if sprefix != eprefix: continue
svar = s[-4:] svar = s[-4:]
evar = e[-4:] evar = e[-4:]
s1 = nunpack(svar) s1 = nunpack(svar)
e1 = nunpack(evar) e1 = nunpack(evar)
vlen = len(svar) vlen = len(svar)
assert s1 <= e1 #assert s1 <= e1
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:] x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i) self.cmap.register_code2cid(x, cid+i)
self.popall()
return return
if name == 'begincidchar': if name == 'begincidchar':
self.popall() self.popall()
return return
if name == 'endcidchar': if name == 'endcidchar':
for (cid,code) in choplist(2, self.partobj): objs = [ obj for (_,obj) in self.popall() ]
assert isinstance(code, str) for (cid,code) in choplist(2, objs):
assert isinstance(cid, str) if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid)) self.cmap.register_code2cid(code, nunpack(cid))
self.popall()
return return
if name == 'beginbfrange': if name == 'beginbfrange':
self.popall() self.popall()
return return
if name == 'endbfrange': if name == 'endbfrange':
for (s,e,code) in choplist(3, self.partobj): objs = [ obj for (_,obj) in self.popall() ]
assert isinstance(s, str) for (s,e,code) in choplist(3, objs):
assert isinstance(e, str) if (not isinstance(s, str) or not isinstance(e, str) or
assert len(s) == len(e) len(s) != len(e)): continue
s1 = nunpack(s) s1 = nunpack(s)
e1 = nunpack(e) e1 = nunpack(e)
assert s1 <= e1 #assert s1 <= e1
if isinstance(code, list): if isinstance(code, list):
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i]) self.cmap.register_cid2code(s1+i, code[i])
@ -306,29 +308,26 @@ class CMapParser(PSStackParser):
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:] x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x) self.cmap.register_cid2code(s1+i, x)
self.popall()
return return
if name == 'beginbfchar': if name == 'beginbfchar':
self.popall() self.popall()
return return
if name == 'endbfchar': if name == 'endbfchar':
for (cid,code) in choplist(2, self.partobj): objs = [ obj for (_,obj) in self.popall() ]
assert isinstance(cid, str) for (cid,code) in choplist(2, objs):
assert isinstance(code, str) if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code) self.cmap.register_cid2code(nunpack(cid), code)
self.popall()
return return
if name == 'beginnotdefrange': if name == 'beginnotdefrange':
self.popall() self.popall()
return return
if name == 'endnotdefrange': if name == 'endnotdefrange':
if 1 <= self.debug:
print >>stderr, 'notdefrange: %r' % self.partobj
self.popall() self.popall()
return return
self.push((pos, token))
return return
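
For reference, the endcidrange handler above turns each (start, end, cid) triple into individual code-to-CID entries: the codes share a fixed prefix and their trailing (at most four) bytes act as a big-endian counter. A standalone sketch of that expansion, reusing the project's nunpack helper already imported in cmap.py; expand_cidrange itself is only an illustrative name:

    from struct import pack
    from utils import nunpack   # big-endian byte string -> int, as imported above

    def expand_cidrange(s, e, cid):
        # Mirrors the loop run for 'endcidrange': keep the common prefix and
        # step the variable tail from s to e, assigning consecutive CIDs.
        if len(s) != len(e) or s[:-4] != e[:-4]:
            return {}
        (prefix, svar, evar) = (s[:-4], s[-4:], e[-4:])
        (s1, e1, vlen) = (nunpack(svar), nunpack(evar), len(svar))
        code2cid = {}
        for i in xrange(e1 - s1 + 1):
            code2cid[prefix + pack('>L', s1 + i)[-vlen:]] = cid + i
        return code2cid

    # expand_cidrange('\x00\x20', '\x00\x22', 100) maps the two-byte codes
    # 0x0020, 0x0021 and 0x0022 to CIDs 100, 101 and 102.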


@ -13,8 +13,8 @@ from cmap import CMapDB
## ##
class TextConverter(PDFDevice): class TextConverter(PDFDevice):
def __init__(self, outfp, rsrc, codec): def __init__(self, outfp, rsrc, codec, debug=0):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc, debug=debug)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
@ -42,6 +42,10 @@ class TextConverter(PDFDevice):
return return
def handle_undefined_char(self, cidcoding, cid): def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
#return unichr(cid)
#return unichr(cid+32)
return return
def render_string(self, textstate, textmatrix, size, seq): def render_string(self, textstate, textmatrix, size, seq):
@ -81,7 +85,7 @@ class TextConverter(PDFDevice):
# pdf2txt # pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
device = TextConverter(outfp, rsrc, codec) device = TextConverter(outfp, rsrc, codec, debug=debug)
outfp.write('<document>\n') outfp.write('<document>\n')
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname) fp = file(fname)


@ -6,7 +6,7 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from psparser import PSException, PSSyntaxError, PSTypeError, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
@ -45,6 +45,8 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
KEYWORD_BI = PSKeywordTable.intern('BI')
KEYWORD_ID = PSKeywordTable.intern('ID')
KEYWORD_EI = PSKeywordTable.intern('EI') KEYWORD_EI = PSKeywordTable.intern('EI')
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
@ -134,7 +136,7 @@ class PDFSimpleFont(PDFFont):
if 'ToUnicode' in spec: if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
PDFFont.__init__(self, descriptor, widths) PDFFont.__init__(self, descriptor, widths)
return return
@ -292,7 +294,7 @@ class PDFCIDFont(PDFFont):
if 'ToUnicode' in spec: if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
elif self.cidcoding == 'Adobe-Identity': elif self.cidcoding == 'Adobe-Identity':
if ttf: if ttf:
try: try:
@ -433,8 +435,9 @@ class PDFResourceManager:
## ##
class PDFDevice: class PDFDevice:
def __init__(self, rsrc): def __init__(self, rsrc, debug=0):
self.rsrc = rsrc self.rsrc = rsrc
self.debug = debug
self.ctm = None self.ctm = None
return return
@ -465,47 +468,91 @@ class PDFDevice:
## ##
class PDFContentParser(PSStackParser): class PDFContentParser(PSStackParser):
def __init__(self, fp, debug=0): def __init__(self, streams, debug=0):
PSStackParser.__init__(self, fp, debug=debug) self.streams = streams
self.istream = 0
PSStackParser.__init__(self, None, debug=debug)
return return
def __repr__(self): def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos return '<PDFParser: linepos=%d>' % self.linepos
EOIPAT = re.compile(r'\nEI\W') def fillfp(self):
def do_token(self, pos, token): if not self.fp:
name = keyword_name(token) if self.istream < len(self.streams):
strm = stream_value(self.streams[self.istream])
if name == 'BI': self.istream += 1
# inline image within a content stream
self.context.append(('BI', self.partobj))
self.partobj = []
elif name == 'ID':
objs = self.partobj
(type0, self.partobj) = self.context.pop()
if len(objs) % 2 != 0:
if STRICT:
raise PSTypeError('invalid dictionary construct: %r' % objs)
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
pos += len('ID ')
self.fp.seek(pos)
# XXX how do we know the real length other than scanning?
data = ''
while 1:
data += self.fp.read(4096)
m = self.EOIPAT.search(data)
if m: break
objlen = m.start(0)
obj = PDFStream(dic, data[:objlen])
self.push(obj)
self.seek(pos+objlen+len('\nEI'))
self.push(KEYWORD_EI)
else: else:
self.push(token) raise PSEOF
self.fp = StringIO(strm.get_data())
return
return False def seek(self, pos):
self.fillfp()
PSStackParser.seek(self, pos)
return
def fillbuf(self):
if self.charpos < len(self.buf): return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break
self.fp = None
self.charpos = 0
return
def get_inline_data(self, pos, target='EI '):
self.seek(pos)
i = 0
data = ''
while i < len(target):
self.fillbuf()
if i:
c = self.buf[self.charpos]
data += c
self.charpos += 1
if c == target[i]:
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
#print 'found', (0, self.buf[j:j+10])
data += self.buf[self.charpos:j]
self.charpos = j+1
i = 1
except ValueError:
data += self.buf[self.charpos:]
self.charpos = len(self.buf)
data = data[:-len(target)] # strip the last part
return (pos, data)
def flush(self):
self.add_results(*self.popall())
return
def do_keyword(self, pos, token):
if token == KEYWORD_BI:
# inline image within a content stream
self.start_type(pos, 'inline')
elif token == KEYWORD_ID:
try:
(_, objs) = self.end_type('inline')
if len(objs) % 2 != 0:
raise PSTypeError('invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
(pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, KEYWORD_EI))
except PSTypeError:
if STRICT: raise
else:
self.push((pos, token))
return
## Interpreter ## Interpreter
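
Inline images (the BI ... ID <raw bytes> EI construct) cannot be tokenized, so the new do_keyword collects the dictionary built between BI and ID and then lets get_inline_data() scan forward for the 'EI ' terminator, even across buffer boundaries. A simplified single-buffer sketch of that scan (find_inline_data is a made-up name, not the method added above):

    def find_inline_data(buf, start, target='EI '):
        # Simplified version of PDFContentParser.get_inline_data: everything
        # between the 'ID ' keyword and the 'EI ' terminator is raw image data.
        end = buf.find(target, start)
        if end < 0:
            end = len(buf)              # unterminated stream: take the rest
        return (end + len(target), buf[start:end])

    # content = 'q BI /W 4 /H 4 /BPC 8 ID <raw bytes> EI Q'
    # The data starts right after 'ID ' and stops just before 'EI '.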
@ -542,10 +589,44 @@ class PDFPageInterpreter:
self.debug = debug self.debug = debug
return return
def initpage(self, ctm): def init_resources(self, resources):
self.fontmap = {} self.fontmap = {}
self.xobjmap = {} self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy() self.csmap = PREDEFINED_COLORSPACE.copy()
# Handle resource declarations.
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased':
return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN':
return ColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE[name]
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
return
def init_state(self, ctm):
# gstack: stack for graphical states. # gstack: stack for graphical states.
self.gstack = [] self.gstack = []
self.ctm = ctm self.ctm = ctm
@ -554,8 +635,9 @@ class PDFPageInterpreter:
# argstack: stack for command arguments. # argstack: stack for command arguments.
self.argstack = [] self.argstack = []
# set some global states. # set some global states.
self.scs = None self.scs = self.ncs = None
self.ncs = None if self.csmap:
self.scs = self.ncs = self.csmap.values()[0]
return return
def push(self, obj): def push(self, obj):
@ -683,10 +765,22 @@ class PDFPageInterpreter:
# setcolor # setcolor
def do_SCN(self): def do_SCN(self):
self.pop(self.scs.ncomponents) if self.scs:
n = self.scs.ncomponents
else:
if STRICT:
raise PDFInterpreterError('no colorspace specified!')
n = 1
self.pop(n)
return return
def do_scn(self): def do_scn(self):
self.pop(self.ncs.ncomponents) if self.ncs:
n = self.ncs.ncomponents
else:
if STRICT:
raise PDFInterpreterError('no colorspace specified!')
n = 1
self.pop(n)
return return
def do_SC(self): def do_SC(self):
self.do_SCN() self.do_SCN()
@ -839,8 +933,7 @@ class PDFPageInterpreter:
(x1,y1) = apply_matrix(ctm, (x1,y1)) (x1,y1) = apply_matrix(ctm, (x1,y1))
bbox = (x0,y0,x1,y1) bbox = (x0,y0,x1,y1)
self.device.begin_figure(xobjid, bbox) self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(xobj.dic.get('Resources'), interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
[xobj], ctm=ctm)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
return return
@ -853,46 +946,18 @@ class PDFPageInterpreter:
return return
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
self.initpage(ctm) self.init_resources(resources)
# Handle resource declarations. self.init_state(ctm)
def get_colorspace(spec): self.execute(list_value(contents))
if isinstance(spec, list):
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased':
return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN':
return ColorSpace(name, len(list_value(cs[1])))
else:
return PREDEFINED_COLORSPACE[name]
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
data = ''.join( stream_value(stream).get_data()
for stream in list_value(contents) )
self.execute(data)
return return
def execute(self, data): def execute(self, streams):
parser = PDFContentParser(StringIO(data), debug=self.debug) parser = PDFContentParser(streams, debug=self.debug)
for obj in parser.parse(): while 1:
try:
(_,obj) = parser.nextobject()
except PSEOF:
break
if isinstance(obj, PSKeyword): if isinstance(obj, PSKeyword):
name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q') name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
if hasattr(self, name): if hasattr(self, name):
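
execute() now pulls (pos, obj) pairs out of PDFContentParser and maps each keyword to a do_* method, replacing the characters that are not legal in Python identifiers ('*' becomes '_a', '"' becomes '_w', "'" becomes '_q'). A cut-down sketch of that dispatch convention (MiniInterpreter and its operators are illustrative only):

    class MiniInterpreter:
        # Toy dispatcher using the same name mangling as PDFPageInterpreter.
        def do_q(self):
            print 'q: save graphics state'
        def do_Q(self):
            print 'Q: restore graphics state'
        def do_T_a(self):
            print 'T*: move to the start of the next line'

        def dispatch(self, op):
            name = 'do_%s' % op.replace('*', '_a').replace('"', '_w').replace("'", '_q')
            if hasattr(self, name):
                getattr(self, name)()
            else:
                print 'unknown operator: %r' % op

    # MiniInterpreter().dispatch('T*') ends up calling do_T_a.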


@ -14,14 +14,10 @@
# - Linearized PDF. # - Linearized PDF.
# - Encryption? # - Encryption?
import sys, re import sys
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \ literal_name, keyword_name, \
PSStackParser, STRICT PSStackParser, STRICT
@ -43,14 +39,19 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
## PDFObjRef ## PDFObjRef
## ##
class PDFObjRef: class PDFObjRef:
def __init__(self, doc, objid, genno): def __init__(self, doc, objid, _):
if objid == 0: if objid == 0:
if STRICT: if STRICT:
raise PDFValueError('objid cannot be 0.') raise PDFValueError('objid cannot be 0.')
@ -275,7 +276,8 @@ class PDFXRef:
(pos, genno, use) = f (pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use)) self.offsets.append((int(genno), long(pos), use))
# read trailer # read trailer
self.trailer = dict_value(parser.parse()[0]) (_, dic) = parser.nextobject()
self.trailer = dict_value(dic)
return return
def getpos(self, objid): def getpos(self, objid):
@ -293,9 +295,13 @@ class PDFXRef:
class PDFXRefStream: class PDFXRefStream:
def __init__(self, parser): def __init__(self, parser):
(objid, genno, _, stream) = list_value(parser.parse()) (_,objid) = parser.nextobject()
(_,genno) = parser.nextobject()
parser.nextobject()
(_,stream) = parser.nextobject()
if STRICT: if STRICT:
assert stream.dic['Type'] == LITERAL_XREF if stream.dic['Type'] != LITERAL_XREF:
raise PDFSyntaxError('invalid stream spec.')
size = stream.dic['Size'] size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size)) (start, nobjs) = stream.dic.get('Index', (0,size))
self.objid0 = start self.objid0 = start
@ -385,20 +391,24 @@ class PDFDocument:
if strmid in self.parsed_objs: if strmid in self.parsed_objs:
objs = self.parsed_objs[stream] objs = self.parsed_objs[stream]
else: else:
parser = PDFParser(self, StringIO(stream.get_data()), parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
debug=self.debug) objs = []
objs = list(parser.parse()) try:
while 1:
(_,obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
self.parsed_objs[stream] = objs self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index] obj = objs[stream.dic['N']*2+index]
else: else:
prevpos = self.parser.seek(index) self.parser.seek(index)
seq = list_value(self.parser.parse()) (_,objid1) = self.parser.nextobject() # objid
if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ): (_,genno1) = self.parser.nextobject() # genno
if STRICT: (_,kwd) = self.parser.nextobject()
raise PDFSyntaxError('invalid stream spec: %r' % seq) if kwd != KEYWORD_OBJ:
return None raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
obj = seq[3] (_,obj) = self.parser.nextobject()
self.parser.seek(prevpos)
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj) print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj self.objs[objid] = obj
@ -446,29 +456,30 @@ class PDFParser(PSStackParser):
return return
def __repr__(self): def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos return '<PDFParser>'
EOIPAT = re.compile(r'\nEI\W') def do_keyword(self, pos, token):
def do_token(self, pos, token): if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
name = keyword_name(token) self.add_results(*self.pop(1))
if name in ('xref', 'trailer', 'startxref', 'endobj'): return
return True if token == KEYWORD_ENDOBJ:
self.add_results(*self.pop(4))
return
if name == 'R': if token == KEYWORD_R:
# reference to indirect object # reference to indirect object
try: try:
(objid, genno) = self.pop(2) ((_,objid), (_,genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno)) (objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno) obj = PDFObjRef(self.doc, objid, genno)
self.push(obj) self.push((pos, obj))
if 2 <= self.debug:
print >>stderr, 'refer obj: %r' % obj
except PSSyntaxError: except PSSyntaxError:
pass pass
return
elif name == 'stream': if token == KEYWORD_STREAM:
# stream object # stream object
(dic,) = self.pop(1) ((_,dic),) = self.pop(1)
dic = dict_value(dic) dic = dict_value(dic)
try: try:
objlen = int_value(dic['Length']) objlen = int_value(dic['Length'])
@ -484,20 +495,19 @@ class PDFParser(PSStackParser):
self.seek(pos+objlen) self.seek(pos+objlen)
while 1: while 1:
(linepos, line) = self.nextline() (linepos, line) = self.nextline()
if not line or line.startswith('endstream'): if line.startswith('endstream'): break
break
objlen += len(line) objlen += len(line)
data += line data += line
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10]) (pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher) obj = PDFStream(dic, data, self.doc.decipher)
self.push(obj) self.push((pos, obj))
return
else: # others
self.push(token) self.push((pos, token))
return
return False
def find_xref(self): def find_xref(self):
# find the first xref table # find the first xref table
@ -505,7 +515,7 @@ class PDFParser(PSStackParser):
for line in self.revreadlines(): for line in self.revreadlines():
line = line.strip() line = line.strip()
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'line: %r' % line print >>stderr, 'find_xref: %r' % line
if line == 'startxref': break if line == 'startxref': break
if line: if line:
prev = line prev = line
@ -525,10 +535,11 @@ class PDFParser(PSStackParser):
# read xref table # read xref table
(linepos, line) = self.nextline() (linepos, line) = self.nextline()
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'line: %r' % line print >>stderr, 'read_xref: %r' % line
if line[0].isdigit(): if line[0].isdigit():
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
self.seek(linepos) self.seek(linepos)
self.reset()
xref = PDFXRefStream(self) xref = PDFXRefStream(self)
else: else:
if line.strip() != 'xref': if line.strip() != 'xref':
@ -551,3 +562,18 @@ class PDFParser(PSStackParser):
else: else:
break break
return return
## PDFObjStrmParser
##
class PDFObjStrmParser(PDFParser):
def __init__(self, doc, data, debug=0):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
PDFParser.__init__(self, doc, StringIO(data), debug=debug)
return
def flush(self):
self.add_results(*self.popall())
return
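
With the keyword handling split out into do_keyword(), reading an uncompressed indirect object becomes four nextobject() calls in a row, exactly as PDFDocument.getobj does above: object id, generation number, the 'obj' keyword, then the object itself (the results only become available once 'endobj' flushes the stack). A sketch of that sequence wrapped as a helper (read_indirect_object is an illustrative name):

    def read_indirect_object(parser, offset):
        # Follows the sequence used in PDFDocument.getobj for objects that
        # are stored directly in the file: "<objid> <genno> obj ... endobj".
        parser.seek(offset)
        (_, objid) = parser.nextobject()
        (_, genno) = parser.nextobject()
        (_, kwd) = parser.nextobject()     # expected to be the 'obj' keyword
        (_, obj) = parser.nextobject()
        return (int(objid), int(genno), obj)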


@ -3,12 +3,13 @@ import sys, re
stderr = sys.stderr stderr = sys.stderr
from utils import choplist from utils import choplist
STRICT = 0 STRICT = 1
## PS Exceptions ## PS Exceptions
## ##
class PSException(Exception): pass class PSException(Exception): pass
class PSEOF(PSException): pass
class PSSyntaxError(PSException): pass class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass class PSTypeError(PSException): pass
class PSValueError(PSException): pass class PSValueError(PSException): pass
@ -71,6 +72,14 @@ class PSSymbolTable:
PSLiteralTable = PSSymbolTable(PSLiteral) PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword) PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
KEYWORD_BRACE_BEGIN = KWD('{')
KEYWORD_BRACE_END = KWD('}')
KEYWORD_ARRAY_BEGIN = KWD('[')
KEYWORD_ARRAY_END = KWD(']')
KEYWORD_DICT_BEGIN = KWD('<<')
KEYWORD_DICT_END = KWD('>>')
def literal_name(x): def literal_name(x):
@ -92,72 +101,288 @@ def keyword_name(x):
## PSBaseParser ## PSBaseParser
## ##
EOL = re.compile(r'[\r\n]')
SPC = re.compile(r'\s')
NONSPC = re.compile(r'\S')
HEX = re.compile(r'[0-9a-fA-F]')
END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]')
END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]')
HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(r'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
class PSBaseParser: class PSBaseParser:
''' '''
Most basic PostScript parser that performs only basic tokenization. Most basic PostScript parser that performs only basic tokenization.
''' '''
BUFSIZ = 4096
def __init__(self, fp, debug=0): def __init__(self, fp, debug=0):
self.fp = fp self.fp = fp
self.debug = debug self.debug = debug
self.bufsize = 4096
self.strfilter = None
self.seek(0) self.seek(0)
return return
def __repr__(self): def __repr__(self):
return '<PSBaseParser: %r>' % (self.fp,) return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
def tell(self):
return self.fp.tell()
def poll(self, pos=None, n=80):
pos0 = self.fp.tell()
if not pos:
pos = self.bufpos+self.charpos
self.fp.seek(pos)
print >>stderr, 'poll(%d): %r' % (pos, self.fp.read(n))
self.fp.seek(pos0)
return
def seek(self, pos): def seek(self, pos):
''' '''
Seeks the parser to the given position. Seeks the parser to the given position.
''' '''
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'seek:', pos print >>stderr, 'seek: %r' % pos
prevpos = self.fp.tell()
self.fp.seek(pos) self.fp.seek(pos)
self.linebuf = None # line buffer. # reset the status for nextline()
self.curpos = 0 # current position in the buffer. self.bufpos = pos
self.linepos = pos # the beginning of the current line. self.buf = ''
self.go = False self.charpos = 0
return prevpos # reset the status for nexttoken()
self.parse1 = self.parse_main
self.tokens = []
return
def fillbuf(self):
if self.charpos < len(self.buf): return
# fetch next chunk.
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if not self.buf:
raise PSEOF
self.charpos = 0
return
def parse_main(self, s, i):
m = NONSPC.search(s, i)
if not m:
return (self.parse_main, len(s))
j = m.start(0)
c = s[j]
self.tokenstart = self.bufpos+j
if c == '%':
self.token = '%'
return (self.parse_comment, j+1)
if c == '/':
self.token = ''
return (self.parse_literal, j+1)
if c in '-+' or c.isdigit():
self.token = c
return (self.parse_number, j+1)
if c == '.':
self.token = c
return (self.parse_float, j+1)
if c.isalpha():
self.token = c
return (self.parse_keyword, j+1)
if c == '(':
self.token = ''
self.paren = 1
return (self.parse_string, j+1)
if c == '<':
self.token = ''
return (self.parse_wopen, j+1)
if c == '>':
self.token = ''
return (self.parse_wclose, j+1)
self.add_token(KWD(c))
return (self.parse_main, j+1)
def add_token(self, obj):
self.tokens.append((self.tokenstart, obj))
return
def parse_comment(self, s, i):
m = EOL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_comment, len(s))
j = m.start(0)
self.token += s[i:j]
# We ignore comments.
#self.tokens.append(self.token)
return (self.parse_main, j)
def parse_literal(self, s, i):
m = END_LITERAL.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_literal, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '#':
self.hex = ''
return (self.parse_literal_hex, j+1)
self.add_token(LIT(self.token))
return (self.parse_main, j)
def parse_literal_hex(self, s, i):
c = s[i]
if HEX.match(c) and len(self.hex) < 2:
self.hex += c
return (self.parse_literal_hex, i+1)
if self.hex:
self.token += chr(int(self.hex, 16))
return (self.parse_literal, i)
def parse_number(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_number, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '.':
self.token += c
return (self.parse_float, j+1)
try:
self.add_token(int(self.token))
except ValueError:
pass
return (self.parse_main, j)
def parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_float, len(s))
j = m.start(0)
self.token += s[i:j]
self.add_token(float(self.token))
return (self.parse_main, j)
def parse_keyword(self, s, i):
m = END_KEYWORD.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_keyword, len(s))
j = m.start(0)
self.token += s[i:j]
if self.token == 'true':
token = True
elif self.token == 'false':
token = False
else:
token = KWD(self.token)
self.add_token(token)
return (self.parse_main, j)
def parse_string(self, s, i):
m = END_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_string, len(s))
j = m.start(0)
self.token += s[i:j]
c = s[j]
if c == '\\':
self.oct = ''
return (self.parse_string_1, j+1)
if c == '(':
self.paren += 1
self.token += c
return (self.parse_string, j+1)
if c == ')':
self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment.
self.token += c
return (self.parse_string, j+1)
self.add_token(self.token)
return (self.parse_main, j+1)
def parse_string_1(self, s, i):
c = s[i]
if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c
return (self.parse_string_1, i+1)
if self.oct:
self.token += chr(int(self.oct, 8))
return (self.parse_string, i)
if c in ESC_STRING:
self.token += chr(ESC_STRING[c])
return (self.parse_string, i+1)
def parse_wopen(self, s, i):
c = s[i]
if c.isspace() or HEX.match(c):
return (self.parse_hexstring, i)
if c == '<':
self.add_token(KEYWORD_DICT_BEGIN)
i += 1
return (self.parse_main, i)
def parse_wclose(self, s, i):
c = s[i]
if c == '>':
self.add_token(KEYWORD_DICT_END)
i += 1
return (self.parse_main, i)
def parse_hexstring(self, s, i):
m = END_HEX_STRING.search(s, i)
if not m:
self.token += s[i:]
return (self.parse_hexstring, len(s))
j = m.start(0)
self.token += s[i:j]
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
SPC.sub('', self.token))
self.add_token(token)
return (self.parse_main, j)
def nexttoken(self):
while not self.tokens:
self.fillbuf()
(self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
token = self.tokens.pop(0)
if 2 <= self.debug:
print >>stderr, 'nexttoken: %r' % (token,)
return token
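
Each parse_* method above consumes as much of the current buffer as it can and returns a (next state, next index) pair, so tokens can straddle fp.read() boundaries without rescanning; nexttoken() keeps feeding buffers into whatever state is current until a token shows up in self.tokens. A toy lexer built on the same convention, independent of the pdfminer classes (whitespace-separated words instead of PostScript tokens):

    class TinyLexer:
        # Toy continuation-style lexer: every state is a method returning
        # (next_state, next_index), the convention used by the parse_* methods.
        def __init__(self, chunks):
            self.chunks = chunks     # stand-ins for successive fp.read() calls
            self.tokens = []
            self.word = ''

        def state_main(self, s, i):
            while i < len(s) and s[i].isspace():
                i += 1
            if i == len(s):
                return (self.state_main, i)
            self.word = ''
            return (self.state_word, i)

        def state_word(self, s, i):
            j = i
            while j < len(s) and not s[j].isspace():
                j += 1
            self.word += s[i:j]
            if j == len(s):
                return (self.state_word, j)   # token continues in the next chunk
            self.tokens.append(self.word)
            return (self.state_main, j)

        def run(self):
            state = self.state_main
            for s in self.chunks:
                i = 0
                while i < len(s):
                    (state, i) = state(s, i)
            return self.tokens
            # (a word ending exactly at EOF would need a final flush, omitted here)

    # TinyLexer(['foo ba', 'r  baz ']).run() == ['foo', 'bar', 'baz']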
EOLCHAR = re.compile(r'[\r\n]')
def nextline(self): def nextline(self):
''' '''
Fetches a next line that ends either with \\r or \\n. Fetches a next line that ends either with \\r or \\n.
''' '''
line = '' linebuf = ''
eol = None linepos = self.bufpos + self.charpos
eol = False
while 1: while 1:
if not self.linebuf or len(self.linebuf) <= self.curpos: self.fillbuf()
# fetch next chunk.
self.linebuf = self.fp.read(self.bufsize)
if not self.linebuf:
# at EOF.
break
self.curpos = 0
if eol: if eol:
c = self.linebuf[self.curpos] c = self.buf[self.charpos]
# handle '\r\n' # handle '\r\n'
if (eol == '\r' and c == '\n'): if c == '\n':
line += c linebuf += c
self.curpos += 1 self.charpos += 1
break break
m = self.EOLCHAR.search(self.linebuf, self.curpos) m = EOL.search(self.buf, self.charpos)
if m: if m:
i = m.end(0) linebuf += self.buf[self.charpos:m.end(0)]
line += self.linebuf[self.curpos:i] self.charpos = m.end(0)
eol = self.linebuf[i-1] if linebuf[-1] == '\r':
self.curpos = i eol = True
else: else:
# fetch further break
line += self.linebuf[self.curpos:] else:
self.linebuf = None linebuf += self.buf[self.charpos:]
linepos = self.linepos self.charpos = len(self.buf)
self.linepos += len(line) if 2 <= self.debug:
return (linepos, line) print >>stderr, 'nextline: %r' % ((linepos, linebuf),)
return (linepos, linebuf)
def revreadlines(self): def revreadlines(self):
''' '''
@ -168,9 +393,9 @@ class PSBaseParser:
pos = self.fp.tell() pos = self.fp.tell()
buf = '' buf = ''
while 0 < pos: while 0 < pos:
pos = max(0, pos-self.bufsize) pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos) self.fp.seek(pos)
s = self.fp.read(self.bufsize) s = self.fp.read(self.BUFSIZ)
if not s: break if not s: break
while 1: while 1:
n = max(s.rfind('\r'), s.rfind('\n')) n = max(s.rfind('\r'), s.rfind('\n'))
@ -182,263 +407,202 @@ class PSBaseParser:
buf = '' buf = ''
return return
# regex patterns for basic lexical scanning.
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
def parse(self):
'''
Yields a list of tuples (pos, token) of the following:
keywords, literals, strings, numbers and parentheses.
Comments are skipped.
Nested objects (i.e. arrays and dictionaries) are not handled here.
'''
while 1:
# do not strip line! we need to distinguish last '\n' or '\r'
(linepos, line) = self.nextline()
if not line: break
if 2 <= self.debug:
print >>stderr, 'line: (%d) %r' % (linepos, line)
# do this before removing comment
if line.startswith('%%EOF'): break
charpos = 0
# tokenize
self.go = True
while self.go:
m = self.TOKEN.search(line, charpos)
if not m: break
t = m.group(0)
pos = linepos + m.start(0)
charpos = m.end(0)
if t == '%':
# skip comment
if 2 <= self.debug:
print >>stderr, 'comment: %r' % line[charpos:]
break
elif t == '/':
# literal object
mn = self.LITERAL.match(line, m.start(0)+1)
lit = PSLiteralTable.intern(mn.group(0))
yield (pos, lit)
charpos = mn.end(0)
if 2 <= self.debug:
print >>stderr, 'name: %r' % lit
elif t == '(':
# normal string object
s = ''
while 1:
ms = self.STRING_NORM.match(line, charpos)
if not ms: break
s1 = ms.group(0)
charpos = ms.end(0)
if len(s1) == 1 and s1[-1] == '\\':
s += s1[-1:]
(linepos, line) = self.nextline()
if not line:
if STRICT:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(linepos, line))
break
charpos = 0
elif charpos == len(line):
s += s1
(linepos, line) = self.nextline()
if not line:
if STRICT:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(linepos, line))
break
charpos = 0
else:
s += s1
break
if line[charpos] == ')':
charpos += 1
else:
if STRICT:
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(linepos, line))
pass
def convesc(m):
x = m.group(0)
if x[1:].isdigit():
return chr(int(x[1:], 8))
else:
return x[1]
s = self.STRING_NORM_SUB.sub(convesc, s)
if self.strfilter:
s = self.strfilter(s)
if 2 <= self.debug:
print >>stderr, 'str: %r' % s
yield (pos, s)
elif t == '<':
# hex string object
ms = self.STRING_HEX.match(line, charpos)
charpos = ms.end(0)
if line[charpos] == '>':
charpos += 1
else:
if STRICT:
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(linepos, line))
def convhex(m1):
return chr(int(m1.group(0), 16))
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
if 2 <= self.debug:
print >>stderr, 'str: %r' % s
yield (pos, s)
elif self.NUMBER.match(t):
# number
if '.' in t:
n = float(t)
else:
n = int(t)
if 2 <= self.debug:
print >>stderr, 'number: %r' % n
yield (pos, n)
elif t in ('true', 'false'):
# boolean
if 2 <= self.debug:
print >>stderr, 'boolean: %r' % t
yield (pos, (t == 'true'))
else:
# other token
if 2 <= self.debug:
print >>stderr, 'keyword: %r' % t
yield (pos, PSKeywordTable.intern(t))
return
## PSStackParser ## PSStackParser
## ##
class PSStackParser(PSBaseParser): class PSStackParser(PSBaseParser):
'''
PostScript parser that recognizes compound objects
such as arrays and dictionaries.
'''
def __init__(self, fp, debug=0): def __init__(self, fp, debug=0):
PSBaseParser.__init__(self, fp, debug=debug) PSBaseParser.__init__(self,fp, debug=debug)
self.reset()
return
def reset(self):
self.context = [] self.context = []
self.partobj = None self.curtype = None
self.curstack = []
self.results = []
return return
def do_token(self, pos, token): def push(self, *objs):
''' self.curstack.extend(objs)
Handles special tokens.
Returns true if the token denotes the end of an object.
'''
return False
def push(self, obj):
'''
Push an object to the stack.
'''
self.partobj.append(obj)
return return
def pop(self, n): def pop(self, n):
''' objs = self.curstack[-n:]
Pop N objects from the stack. self.curstack[-n:] = []
''' return objs
if len(self.partobj) < n:
if STRICT:
raise PSSyntaxError('stack too short < %d' % n)
r = self.partobj[-n:]
self.partobj = self.partobj[:-n]
return r
def popall(self): def popall(self):
''' objs = self.curstack
Discards all the objects on the stack. self.curstack = []
''' return objs
self.partobj = [] def add_results(self, *objs):
if 2 <= self.debug:
print >>stderr, 'add_results: %r' % (objs,)
self.results.extend(objs)
return return
def parse(self): def start_type(self, pos, type):
self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, [])
if 2 <= self.debug:
print >>stderr, 'start_type: pos=%r, type=%r' % (pos, type)
return
def end_type(self, type):
if self.curtype != type:
raise PSTypeError('type mismatch: %r != %r' % (self.curtype, type))
objs = [ obj for (_,obj) in self.curstack ]
(pos, self.curtype, self.curstack) = self.context.pop()
if 2 <= self.debug:
print >>stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
return (pos, objs)
def do_keyword(self, pos, token):
return
def flush(self):
return
def nextobject(self):
''' '''
Yields a list of objects: keywords, literals, strings, Yields a list of objects: keywords, literals, strings,
numbers, arrays and dictionaries. Arrays and dictionaries numbers, arrays and dictionaries. Arrays and dictionaries
are represented as Python sequence and dictionaries. are represented as Python sequence and dictionaries.
''' '''
while not self.results:
def startobj(type): (pos, token) = self.nexttoken()
self.context.append((type, self.partobj)) #print (pos,token), (self.curtype, self.curstack)
self.partobj = [] if (isinstance(token, int) or
return isinstance(token, float) or
isinstance(token, bool) or
def endobj(type1): isinstance(token, str) or
if not self.context: isinstance(token, PSLiteral)):
if STRICT: # normal token
raise PSTypeError('stack empty.') self.push((pos, token))
obj = self.partobj elif token == KEYWORD_ARRAY_BEGIN:
(type0, partobj) = self.context[-1] # begin array
if type0 == type1: self.start_type(pos, 'a')
self.partobj = partobj elif token == KEYWORD_ARRAY_END:
self.context.pop() # end array
try:
self.push(self.end_type('a'))
except PSTypeError:
if STRICT: raise
elif token == KEYWORD_DICT_BEGIN:
# begin dictionary
self.start_type(pos, 'd')
elif token == KEYWORD_DICT_END:
# end dictionary
try:
(pos, objs) = self.end_type('d')
if len(objs) % 2 != 0:
raise PSSyntaxError('invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs))
self.push((pos, d))
except PSTypeError:
if STRICT: raise
else: else:
if STRICT: if 2 <= self.debug:
raise PSTypeError('type mismatch: %r(%r) != %r(%r)' % print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
(type0, self.partobj, type1, obj)) (pos, token, self.curstack)
self.do_keyword(pos, token)
if self.context:
continue
else:
self.flush()
obj = self.results.pop(0)
if 2 <= self.debug:
print >>stderr, 'nextobject: %r' % (obj,)
return obj return obj
startobj('o')
for (pos,t) in PSBaseParser.parse(self): ## Simplistic Test cases
if isinstance(t, int) or isinstance(t, float): ##
self.push(t) import unittest
elif isinstance(t, str): class TestPSBaseParser(unittest.TestCase):
self.push(t)
elif isinstance(t, PSLiteral):
self.push(t)
else:
c = keyword_name(t)
if c == '{' or c == '}':
self.push(t)
elif c == '[':
# begin array
if 2 <= self.debug:
print >>stderr, 'start array'
startobj('a')
elif c == ']':
# end array
a = endobj('a')
if 2 <= self.debug:
print >>stderr, 'end array: %r' % a
self.push(a)
elif c == '<<':
# begin dictionary
if 2 <= self.debug:
print >>stderr, 'start dict'
startobj('d')
elif c == '>>':
# end dictionary
objs = endobj('d')
if len(objs) % 2 != 0:
if STRICT:
raise PSTypeError('invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
if 2 <= self.debug:
print >>stderr, 'end dict: %r' % d
self.push(d)
elif self.do_token(pos, t):
break
objs = endobj('o') TESTDATA = r'''%!PS
return objs begin end
" @ #
/a/BCD /Some_Name /foo#5f#xbaa
0 +1 -2 .5 1.234
(abc) () (abc ( def ) ghi)
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
(this % is not a comment.)
(foo
baa)
(foo\
baa)
<20> < 40 4020 >
<abcd00
12345>
func/a/b{(c)do*}def
[ 1 (z) ! ]
<< /foo (bar) >>
'''
TOKENS = [
(5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')),
(21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
(223, KWD('func')), (227, LIT('a')), (229, LIT('b')),
(231, KWD('{')), (232, 'c'), (235, KWD('do*')), (238, KWD('}')),
(239, KWD('def')), (243, KWD('[')), (245, 1), (247, 'z'), (251, KWD('!')),
(253, KWD(']')), (255, KWD('<<')), (258, LIT('foo')), (263, 'bar'),
(269, KWD('>>'))
]
OBJS = [
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
(227, LIT('a')), (229, LIT('b')), (232, 'c'), (243, [1, 'z']),
(255, {'foo': 'bar'}),
]
def get_tokens(self, s):
import StringIO
class MyParser(PSBaseParser):
def flush(self):
self.add_results(*self.popall())
parser = MyParser(StringIO.StringIO(s), debug=1)
r = []
try:
while 1:
r.append(parser.nexttoken())
except PSEOF:
pass
return r
def get_objects(self, s):
import StringIO
class MyParser(PSStackParser):
def flush(self):
self.add_results(*self.popall())
parser = MyParser(StringIO.StringIO(s), debug=1)
r = []
try:
while 1:
r.append(parser.nextobject())
except PSEOF:
pass
return r
def test_1(self):
tokens = self.get_tokens(self.TESTDATA)
print tokens
self.assertEqual(tokens, self.TOKENS)
return
def test_2(self):
objs = self.get_objects(self.TESTDATA)
print objs
self.assertEqual(objs, self.OBJS)
return
if __name__ == '__main__': unittest.main()
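
The tests above also document the compound-object behaviour of PSStackParser.nextobject(): '[' and '<<' park the current partial object via start_type(), ']' and '>>' restore it and push the finished list or dict, and unknown keywords such as '!' are handed to do_keyword(), which the base class ignores. A short standalone driver showing that directly, assuming the same flush() override as the tests (ObjectParser is an illustrative name):

    from StringIO import StringIO
    from psparser import PSStackParser, PSEOF

    class ObjectParser(PSStackParser):
        # Same flush() override as the tests: completed toplevel objects
        # become results of nextobject().
        def flush(self):
            self.add_results(*self.popall())

    parser = ObjectParser(StringIO('[ 1 (z) ! ]  << /foo (bar) >>'))
    objs = []
    try:
        while 1:
            (pos, obj) = parser.nextobject()
            objs.append(obj)
    except PSEOF:
        pass
    # objs == [[1, 'z'], {'foo': 'bar'}]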