Restructuring core lexical handling.
Fix several bugs.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@17 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
94859ea428
commit
366143361f
71
cmap.py
71
cmap.py
|
@ -3,7 +3,7 @@ import sys
|
|||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
from utils import choplist, nunpack
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||
PSStackParser
|
||||
try:
|
||||
|
@ -39,16 +39,16 @@ class CMap:
|
|||
return self
|
||||
|
||||
def register_code2cid(self, code, cid):
|
||||
assert isinstance(code, str)
|
||||
assert isinstance(cid, int)
|
||||
if isinstance(code, str) and isinstance(cid, int):
|
||||
self.code2cid[code] = cid
|
||||
return self
|
||||
|
||||
def register_cid2code(self, cid, code):
|
||||
from glyphlist import charname2unicode
|
||||
assert isinstance(cid, int)
|
||||
if isinstance(cid, int):
|
||||
if isinstance(code, PSLiteral):
|
||||
code = pack('>H', charname2unicode[code.name])
|
||||
self.cid2code[cid] = pack('>H', charname2unicode[code.name])
|
||||
elif isinstance(code, str):
|
||||
self.cid2code[cid] = code
|
||||
return self
|
||||
|
||||
|
@ -195,7 +195,7 @@ class CMapDB:
|
|||
print >>stderr, 'Reading: CMap %r...' % fname
|
||||
cmap = CMap()
|
||||
fp = file(fname)
|
||||
CMapParser(cmap, fp).parse()
|
||||
CMapParser(cmap, fp, debug=klass.debug).run()
|
||||
fp.close()
|
||||
else:
|
||||
raise KeyError(cmapname)
|
||||
|
@ -213,7 +213,14 @@ class CMapParser(PSStackParser):
|
|||
self.in_cmap = False
|
||||
return
|
||||
|
||||
def do_token(self, _, token):
|
||||
def run(self):
|
||||
try:
|
||||
self.nextobject()
|
||||
except PSEOF:
|
||||
pass
|
||||
return
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
name = token.name
|
||||
if name == 'begincmap':
|
||||
self.in_cmap = True
|
||||
|
@ -226,15 +233,15 @@ class CMapParser(PSStackParser):
|
|||
#
|
||||
if name == 'def':
|
||||
try:
|
||||
(k,v) = self.pop(2)
|
||||
self.cmap.attrs[literal_name(k)] = v
|
||||
((_,k),(_,v)) = self.pop(2)
|
||||
self.cmap.attrs[str(k)] = v
|
||||
except PSSyntaxError:
|
||||
pass
|
||||
return
|
||||
|
||||
if name == 'usecmap':
|
||||
try:
|
||||
(cmapname,) = self.pop(1)
|
||||
((_,cmapname),) = self.pop(1)
|
||||
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||
except PSSyntaxError:
|
||||
pass
|
||||
|
@ -244,8 +251,6 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcodespacerange':
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'codespace: %r' % self.partobj
|
||||
self.popall()
|
||||
return
|
||||
|
||||
|
@ -253,48 +258,45 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcidrange':
|
||||
for (s,e,cid) in choplist(3, self.partobj):
|
||||
assert isinstance(s, str)
|
||||
assert isinstance(e, str)
|
||||
assert isinstance(cid, int)
|
||||
assert len(s) == len(e)
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (s,e,cid) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
not isinstance(cid, int) or len(s) != len(e)): continue
|
||||
sprefix = s[:-4]
|
||||
eprefix = e[:-4]
|
||||
assert sprefix == eprefix
|
||||
if sprefix != eprefix: continue
|
||||
svar = s[-4:]
|
||||
evar = e[-4:]
|
||||
s1 = nunpack(svar)
|
||||
e1 = nunpack(evar)
|
||||
vlen = len(svar)
|
||||
assert s1 <= e1
|
||||
#assert s1 <= e1
|
||||
for i in xrange(e1-s1+1):
|
||||
x = sprefix+pack('>L',s1+i)[-vlen:]
|
||||
self.cmap.register_code2cid(x, cid+i)
|
||||
self.popall()
|
||||
return
|
||||
|
||||
if name == 'begincidchar':
|
||||
self.popall()
|
||||
return
|
||||
if name == 'endcidchar':
|
||||
for (cid,code) in choplist(2, self.partobj):
|
||||
assert isinstance(code, str)
|
||||
assert isinstance(cid, str)
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (cid,code) in choplist(2, objs):
|
||||
if isinstance(code, str) and isinstance(cid, str):
|
||||
self.cmap.register_code2cid(code, nunpack(cid))
|
||||
self.popall()
|
||||
return
|
||||
|
||||
if name == 'beginbfrange':
|
||||
self.popall()
|
||||
return
|
||||
if name == 'endbfrange':
|
||||
for (s,e,code) in choplist(3, self.partobj):
|
||||
assert isinstance(s, str)
|
||||
assert isinstance(e, str)
|
||||
assert len(s) == len(e)
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (s,e,code) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
len(s) != len(e)): continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
assert s1 <= e1
|
||||
#assert s1 <= e1
|
||||
if isinstance(code, list):
|
||||
for i in xrange(e1-s1+1):
|
||||
self.cmap.register_cid2code(s1+i, code[i])
|
||||
|
@ -306,29 +308,26 @@ class CMapParser(PSStackParser):
|
|||
for i in xrange(e1-s1+1):
|
||||
x = prefix+pack('>L',base+i)[-vlen:]
|
||||
self.cmap.register_cid2code(s1+i, x)
|
||||
self.popall()
|
||||
return
|
||||
|
||||
if name == 'beginbfchar':
|
||||
self.popall()
|
||||
return
|
||||
if name == 'endbfchar':
|
||||
for (cid,code) in choplist(2, self.partobj):
|
||||
assert isinstance(cid, str)
|
||||
assert isinstance(code, str)
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (cid,code) in choplist(2, objs):
|
||||
if isinstance(cid, str) and isinstance(code, str):
|
||||
self.cmap.register_cid2code(nunpack(cid), code)
|
||||
self.popall()
|
||||
return
|
||||
|
||||
if name == 'beginnotdefrange':
|
||||
self.popall()
|
||||
return
|
||||
if name == 'endnotdefrange':
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'notdefrange: %r' % self.partobj
|
||||
self.popall()
|
||||
return
|
||||
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
||||
|
||||
|
|
10
pdf2txt.py
10
pdf2txt.py
|
@ -13,8 +13,8 @@ from cmap import CMapDB
|
|||
##
|
||||
class TextConverter(PDFDevice):
|
||||
|
||||
def __init__(self, outfp, rsrc, codec):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
def __init__(self, outfp, rsrc, codec, debug=0):
|
||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
@ -42,6 +42,10 @@ class TextConverter(PDFDevice):
|
|||
return
|
||||
|
||||
def handle_undefined_char(self, cidcoding, cid):
|
||||
if self.debug:
|
||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
#return unichr(cid)
|
||||
#return unichr(cid+32)
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, size, seq):
|
||||
|
@ -81,7 +85,7 @@ class TextConverter(PDFDevice):
|
|||
|
||||
# pdf2txt
|
||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||
device = TextConverter(outfp, rsrc, codec)
|
||||
device = TextConverter(outfp, rsrc, codec, debug=debug)
|
||||
outfp.write('<document>\n')
|
||||
doc = PDFDocument(debug=debug)
|
||||
fp = file(fname)
|
||||
|
|
231
pdfinterp.py
231
pdfinterp.py
|
@ -6,7 +6,7 @@ try:
|
|||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
||||
|
@ -45,6 +45,8 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
|||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||
KEYWORD_BI = PSKeywordTable.intern('BI')
|
||||
KEYWORD_ID = PSKeywordTable.intern('ID')
|
||||
KEYWORD_EI = PSKeywordTable.intern('EI')
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
|
@ -134,7 +136,7 @@ class PDFSimpleFont(PDFFont):
|
|||
if 'ToUnicode' in spec:
|
||||
strm = stream_value(spec['ToUnicode'])
|
||||
self.ucs2_cmap = CMap()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||
PDFFont.__init__(self, descriptor, widths)
|
||||
return
|
||||
|
||||
|
@ -292,7 +294,7 @@ class PDFCIDFont(PDFFont):
|
|||
if 'ToUnicode' in spec:
|
||||
strm = stream_value(spec['ToUnicode'])
|
||||
self.ucs2_cmap = CMap()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||
elif self.cidcoding == 'Adobe-Identity':
|
||||
if ttf:
|
||||
try:
|
||||
|
@ -433,8 +435,9 @@ class PDFResourceManager:
|
|||
##
|
||||
class PDFDevice:
|
||||
|
||||
def __init__(self, rsrc):
|
||||
def __init__(self, rsrc, debug=0):
|
||||
self.rsrc = rsrc
|
||||
self.debug = debug
|
||||
self.ctm = None
|
||||
return
|
||||
|
||||
|
@ -465,47 +468,91 @@ class PDFDevice:
|
|||
##
|
||||
class PDFContentParser(PSStackParser):
|
||||
|
||||
def __init__(self, fp, debug=0):
|
||||
PSStackParser.__init__(self, fp, debug=debug)
|
||||
def __init__(self, streams, debug=0):
|
||||
self.streams = streams
|
||||
self.istream = 0
|
||||
PSStackParser.__init__(self, None, debug=debug)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFParser: linepos=%d>' % self.linepos
|
||||
|
||||
EOIPAT = re.compile(r'\nEI\W')
|
||||
def do_token(self, pos, token):
|
||||
name = keyword_name(token)
|
||||
|
||||
if name == 'BI':
|
||||
# inline image within a content stream
|
||||
self.context.append(('BI', self.partobj))
|
||||
self.partobj = []
|
||||
|
||||
elif name == 'ID':
|
||||
objs = self.partobj
|
||||
(type0, self.partobj) = self.context.pop()
|
||||
if len(objs) % 2 != 0:
|
||||
if STRICT:
|
||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||
pos += len('ID ')
|
||||
self.fp.seek(pos)
|
||||
# XXX how do we know the real length other than scanning?
|
||||
data = ''
|
||||
while 1:
|
||||
data += self.fp.read(4096)
|
||||
m = self.EOIPAT.search(data)
|
||||
if m: break
|
||||
objlen = m.start(0)
|
||||
obj = PDFStream(dic, data[:objlen])
|
||||
self.push(obj)
|
||||
self.seek(pos+objlen+len('\nEI'))
|
||||
self.push(KEYWORD_EI)
|
||||
|
||||
def fillfp(self):
|
||||
if not self.fp:
|
||||
if self.istream < len(self.streams):
|
||||
strm = stream_value(self.streams[self.istream])
|
||||
self.istream += 1
|
||||
else:
|
||||
self.push(token)
|
||||
raise PSEOF
|
||||
self.fp = StringIO(strm.get_data())
|
||||
return
|
||||
|
||||
return False
|
||||
def seek(self, pos):
|
||||
self.fillfp()
|
||||
PSStackParser.seek(self, pos)
|
||||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
while 1:
|
||||
self.fillfp()
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if self.buf: break
|
||||
self.fp = None
|
||||
self.charpos = 0
|
||||
return
|
||||
|
||||
def get_inline_data(self, pos, target='EI '):
|
||||
self.seek(pos)
|
||||
i = 0
|
||||
data = ''
|
||||
while i < len(target):
|
||||
self.fillbuf()
|
||||
if i:
|
||||
c = self.buf[self.charpos]
|
||||
data += c
|
||||
self.charpos += 1
|
||||
if c == target[i]:
|
||||
i += 1
|
||||
else:
|
||||
i = 0
|
||||
else:
|
||||
try:
|
||||
j = self.buf.index(target[0], self.charpos)
|
||||
#print 'found', (0, self.buf[j:j+10])
|
||||
data += self.buf[self.charpos:j]
|
||||
self.charpos = j+1
|
||||
i = 1
|
||||
except ValueError:
|
||||
data += self.buf[self.charpos:]
|
||||
self.charpos = len(self.buf)
|
||||
data = data[:-len(target)] # strip the last part
|
||||
return (pos, data)
|
||||
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
return
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
if token == KEYWORD_BI:
|
||||
# inline image within a content stream
|
||||
self.start_type(pos, 'inline')
|
||||
elif token == KEYWORD_ID:
|
||||
try:
|
||||
(_, objs) = self.end_type('inline')
|
||||
if len(objs) % 2 != 0:
|
||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||
obj = PDFStream(d, data)
|
||||
self.push((pos, obj))
|
||||
self.push((pos, KEYWORD_EI))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
else:
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
||||
|
||||
## Interpreter
|
||||
|
@ -542,10 +589,44 @@ class PDFPageInterpreter:
|
|||
self.debug = debug
|
||||
return
|
||||
|
||||
def initpage(self, ctm):
|
||||
def init_resources(self, resources):
|
||||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||
# Handle resource declarations.
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased':
|
||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||
elif name == 'DeviceN':
|
||||
return ColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE[name]
|
||||
if resources:
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
if k == 'Font':
|
||||
for (fontid,spec) in dict_value(v).iteritems():
|
||||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,spec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||
elif k == 'ProcSet':
|
||||
self.rsrc.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
||||
def init_state(self, ctm):
|
||||
# gstack: stack for graphical states.
|
||||
self.gstack = []
|
||||
self.ctm = ctm
|
||||
|
@ -554,8 +635,9 @@ class PDFPageInterpreter:
|
|||
# argstack: stack for command arguments.
|
||||
self.argstack = []
|
||||
# set some global states.
|
||||
self.scs = None
|
||||
self.ncs = None
|
||||
self.scs = self.ncs = None
|
||||
if self.csmap:
|
||||
self.scs = self.ncs = self.csmap.values()[0]
|
||||
return
|
||||
|
||||
def push(self, obj):
|
||||
|
@ -683,10 +765,22 @@ class PDFPageInterpreter:
|
|||
|
||||
# setcolor
|
||||
def do_SCN(self):
|
||||
self.pop(self.scs.ncomponents)
|
||||
if self.scs:
|
||||
n = self.scs.ncomponents
|
||||
else:
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('no colorspace specified!')
|
||||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
def do_scn(self):
|
||||
self.pop(self.ncs.ncomponents)
|
||||
if self.ncs:
|
||||
n = self.ncs.ncomponents
|
||||
else:
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('no colorspace specified!')
|
||||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
def do_SC(self):
|
||||
self.do_SCN()
|
||||
|
@ -839,8 +933,7 @@ class PDFPageInterpreter:
|
|||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||
bbox = (x0,y0,x1,y1)
|
||||
self.device.begin_figure(xobjid, bbox)
|
||||
interpreter.render_contents(xobj.dic.get('Resources'),
|
||||
[xobj], ctm=ctm)
|
||||
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
|
||||
self.device.end_figure(xobjid)
|
||||
return
|
||||
|
||||
|
@ -853,46 +946,18 @@ class PDFPageInterpreter:
|
|||
return
|
||||
|
||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||
self.initpage(ctm)
|
||||
# Handle resource declarations.
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased':
|
||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||
elif name == 'DeviceN':
|
||||
return ColorSpace(name, len(list_value(cs[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE[name]
|
||||
if resources:
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
if k == 'Font':
|
||||
for (fontid,spec) in dict_value(v).iteritems():
|
||||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrc.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,spec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||
elif k == 'ProcSet':
|
||||
self.rsrc.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
data = ''.join( stream_value(stream).get_data()
|
||||
for stream in list_value(contents) )
|
||||
self.execute(data)
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(contents))
|
||||
return
|
||||
|
||||
def execute(self, data):
|
||||
parser = PDFContentParser(StringIO(data), debug=self.debug)
|
||||
for obj in parser.parse():
|
||||
def execute(self, streams):
|
||||
parser = PDFContentParser(streams, debug=self.debug)
|
||||
while 1:
|
||||
try:
|
||||
(_,obj) = parser.nextobject()
|
||||
except PSEOF:
|
||||
break
|
||||
if isinstance(obj, PSKeyword):
|
||||
name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
|
||||
if hasattr(self, name):
|
||||
|
|
112
pdfparser.py
112
pdfparser.py
|
@ -14,14 +14,10 @@
|
|||
# - Linearized PDF.
|
||||
# - Encryption?
|
||||
|
||||
import sys, re
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, \
|
||||
PSStackParser, STRICT
|
||||
|
@ -43,14 +39,19 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
|
|||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
##
|
||||
class PDFObjRef:
|
||||
|
||||
def __init__(self, doc, objid, genno):
|
||||
def __init__(self, doc, objid, _):
|
||||
if objid == 0:
|
||||
if STRICT:
|
||||
raise PDFValueError('objid cannot be 0.')
|
||||
|
@ -275,7 +276,8 @@ class PDFXRef:
|
|||
(pos, genno, use) = f
|
||||
self.offsets.append((int(genno), long(pos), use))
|
||||
# read trailer
|
||||
self.trailer = dict_value(parser.parse()[0])
|
||||
(_, dic) = parser.nextobject()
|
||||
self.trailer = dict_value(dic)
|
||||
return
|
||||
|
||||
def getpos(self, objid):
|
||||
|
@ -293,9 +295,13 @@ class PDFXRef:
|
|||
class PDFXRefStream:
|
||||
|
||||
def __init__(self, parser):
|
||||
(objid, genno, _, stream) = list_value(parser.parse())
|
||||
(_,objid) = parser.nextobject()
|
||||
(_,genno) = parser.nextobject()
|
||||
parser.nextobject()
|
||||
(_,stream) = parser.nextobject()
|
||||
if STRICT:
|
||||
assert stream.dic['Type'] == LITERAL_XREF
|
||||
if stream.dic['Type'] != LITERAL_XREF:
|
||||
raise PDFSyntaxError('invalid stream spec.')
|
||||
size = stream.dic['Size']
|
||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||
self.objid0 = start
|
||||
|
@ -385,20 +391,24 @@ class PDFDocument:
|
|||
if strmid in self.parsed_objs:
|
||||
objs = self.parsed_objs[stream]
|
||||
else:
|
||||
parser = PDFParser(self, StringIO(stream.get_data()),
|
||||
debug=self.debug)
|
||||
objs = list(parser.parse())
|
||||
parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
|
||||
objs = []
|
||||
try:
|
||||
while 1:
|
||||
(_,obj) = parser.nextobject()
|
||||
objs.append(obj)
|
||||
except PSEOF:
|
||||
pass
|
||||
self.parsed_objs[stream] = objs
|
||||
obj = objs[stream.dic['N']*2+index]
|
||||
else:
|
||||
prevpos = self.parser.seek(index)
|
||||
seq = list_value(self.parser.parse())
|
||||
if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
||||
return None
|
||||
obj = seq[3]
|
||||
self.parser.seek(prevpos)
|
||||
self.parser.seek(index)
|
||||
(_,objid1) = self.parser.nextobject() # objid
|
||||
(_,genno1) = self.parser.nextobject() # genno
|
||||
(_,kwd) = self.parser.nextobject()
|
||||
if kwd != KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||
(_,obj) = self.parser.nextobject()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||
self.objs[objid] = obj
|
||||
|
@ -446,29 +456,30 @@ class PDFParser(PSStackParser):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFParser: linepos=%d>' % self.linepos
|
||||
return '<PDFParser>'
|
||||
|
||||
EOIPAT = re.compile(r'\nEI\W')
|
||||
def do_token(self, pos, token):
|
||||
name = keyword_name(token)
|
||||
if name in ('xref', 'trailer', 'startxref', 'endobj'):
|
||||
return True
|
||||
def do_keyword(self, pos, token):
|
||||
if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
|
||||
self.add_results(*self.pop(1))
|
||||
return
|
||||
if token == KEYWORD_ENDOBJ:
|
||||
self.add_results(*self.pop(4))
|
||||
return
|
||||
|
||||
if name == 'R':
|
||||
if token == KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
(objid, genno) = self.pop(2)
|
||||
((_,objid), (_,genno)) = self.pop(2)
|
||||
(objid, genno) = (int(objid), int(genno))
|
||||
obj = PDFObjRef(self.doc, objid, genno)
|
||||
self.push(obj)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'refer obj: %r' % obj
|
||||
self.push((pos, obj))
|
||||
except PSSyntaxError:
|
||||
pass
|
||||
return
|
||||
|
||||
elif name == 'stream':
|
||||
if token == KEYWORD_STREAM:
|
||||
# stream object
|
||||
(dic,) = self.pop(1)
|
||||
((_,dic),) = self.pop(1)
|
||||
dic = dict_value(dic)
|
||||
try:
|
||||
objlen = int_value(dic['Length'])
|
||||
|
@ -484,20 +495,19 @@ class PDFParser(PSStackParser):
|
|||
self.seek(pos+objlen)
|
||||
while 1:
|
||||
(linepos, line) = self.nextline()
|
||||
if not line or line.startswith('endstream'):
|
||||
break
|
||||
if line.startswith('endstream'): break
|
||||
objlen += len(line)
|
||||
data += line
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||
(pos, objlen, dic, data[:10])
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push(obj)
|
||||
self.push((pos, obj))
|
||||
return
|
||||
|
||||
else:
|
||||
self.push(token)
|
||||
|
||||
return False
|
||||
# others
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
||||
def find_xref(self):
|
||||
# find the first xref table
|
||||
|
@ -505,7 +515,7 @@ class PDFParser(PSStackParser):
|
|||
for line in self.revreadlines():
|
||||
line = line.strip()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'line: %r' % line
|
||||
print >>stderr, 'find_xref: %r' % line
|
||||
if line == 'startxref': break
|
||||
if line:
|
||||
prev = line
|
||||
|
@ -525,10 +535,11 @@ class PDFParser(PSStackParser):
|
|||
# read xref table
|
||||
(linepos, line) = self.nextline()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'line: %r' % line
|
||||
print >>stderr, 'read_xref: %r' % line
|
||||
if line[0].isdigit():
|
||||
# XRefStream: PDF-1.5
|
||||
self.seek(linepos)
|
||||
self.reset()
|
||||
xref = PDFXRefStream(self)
|
||||
else:
|
||||
if line.strip() != 'xref':
|
||||
|
@ -551,3 +562,18 @@ class PDFParser(PSStackParser):
|
|||
else:
|
||||
break
|
||||
return
|
||||
|
||||
## PDFObjStrmParser
|
||||
##
|
||||
class PDFObjStrmParser(PDFParser):
|
||||
def __init__(self, doc, data, debug=0):
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
PDFParser.__init__(self, doc, StringIO(data), debug=debug)
|
||||
return
|
||||
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
return
|
||||
|
|
710
psparser.py
710
psparser.py
|
@ -3,12 +3,13 @@ import sys, re
|
|||
stderr = sys.stderr
|
||||
from utils import choplist
|
||||
|
||||
STRICT = 0
|
||||
STRICT = 1
|
||||
|
||||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception): pass
|
||||
class PSEOF(PSException): pass
|
||||
class PSSyntaxError(PSException): pass
|
||||
class PSTypeError(PSException): pass
|
||||
class PSValueError(PSException): pass
|
||||
|
@ -71,6 +72,14 @@ class PSSymbolTable:
|
|||
|
||||
PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||
LIT = PSLiteralTable.intern
|
||||
KWD = PSKeywordTable.intern
|
||||
KEYWORD_BRACE_BEGIN = KWD('{')
|
||||
KEYWORD_BRACE_END = KWD('}')
|
||||
KEYWORD_ARRAY_BEGIN = KWD('[')
|
||||
KEYWORD_ARRAY_END = KWD(']')
|
||||
KEYWORD_DICT_BEGIN = KWD('<<')
|
||||
KEYWORD_DICT_END = KWD('>>')
|
||||
|
||||
|
||||
def literal_name(x):
|
||||
|
@ -92,72 +101,288 @@ def keyword_name(x):
|
|||
|
||||
## PSBaseParser
|
||||
##
|
||||
EOL = re.compile(r'[\r\n]')
|
||||
SPC = re.compile(r'\s')
|
||||
NONSPC = re.compile(r'\S')
|
||||
HEX = re.compile(r'[0-9a-fA-F]')
|
||||
END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||
END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]')
|
||||
HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
|
||||
END_NUMBER = re.compile(r'[^0-9]')
|
||||
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||
END_STRING = re.compile(r'[()\134]')
|
||||
OCT_STRING = re.compile(r'[0-7]')
|
||||
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
||||
class PSBaseParser:
|
||||
|
||||
'''
|
||||
Most basic PostScript parser that performs only basic tokenization.
|
||||
'''
|
||||
BUFSIZ = 4096
|
||||
|
||||
def __init__(self, fp, debug=0):
|
||||
self.fp = fp
|
||||
self.debug = debug
|
||||
self.bufsize = 4096
|
||||
self.strfilter = None
|
||||
self.seek(0)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PSBaseParser: %r>' % (self.fp,)
|
||||
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
|
||||
|
||||
def tell(self):
|
||||
return self.fp.tell()
|
||||
|
||||
def poll(self, pos=None, n=80):
|
||||
pos0 = self.fp.tell()
|
||||
if not pos:
|
||||
pos = self.bufpos+self.charpos
|
||||
self.fp.seek(pos)
|
||||
print >>stderr, 'poll(%d): %r' % (pos, self.fp.read(n))
|
||||
self.fp.seek(pos0)
|
||||
return
|
||||
|
||||
def seek(self, pos):
|
||||
'''
|
||||
Seeks the parser to the given position.
|
||||
'''
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'seek:', pos
|
||||
prevpos = self.fp.tell()
|
||||
print >>stderr, 'seek: %r' % pos
|
||||
self.fp.seek(pos)
|
||||
self.linebuf = None # line buffer.
|
||||
self.curpos = 0 # current position in the buffer.
|
||||
self.linepos = pos # the beginning of the current line.
|
||||
self.go = False
|
||||
return prevpos
|
||||
# reset the status for nextline()
|
||||
self.bufpos = pos
|
||||
self.buf = ''
|
||||
self.charpos = 0
|
||||
# reset the status for nexttoken()
|
||||
self.parse1 = self.parse_main
|
||||
self.tokens = []
|
||||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
# fetch next chunk.
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if not self.buf:
|
||||
raise PSEOF
|
||||
self.charpos = 0
|
||||
return
|
||||
|
||||
def parse_main(self, s, i):
|
||||
m = NONSPC.search(s, i)
|
||||
if not m:
|
||||
return (self.parse_main, len(s))
|
||||
j = m.start(0)
|
||||
c = s[j]
|
||||
self.tokenstart = self.bufpos+j
|
||||
if c == '%':
|
||||
self.token = '%'
|
||||
return (self.parse_comment, j+1)
|
||||
if c == '/':
|
||||
self.token = ''
|
||||
return (self.parse_literal, j+1)
|
||||
if c in '-+' or c.isdigit():
|
||||
self.token = c
|
||||
return (self.parse_number, j+1)
|
||||
if c == '.':
|
||||
self.token = c
|
||||
return (self.parse_float, j+1)
|
||||
if c.isalpha():
|
||||
self.token = c
|
||||
return (self.parse_keyword, j+1)
|
||||
if c == '(':
|
||||
self.token = ''
|
||||
self.paren = 1
|
||||
return (self.parse_string, j+1)
|
||||
if c == '<':
|
||||
self.token = ''
|
||||
return (self.parse_wopen, j+1)
|
||||
if c == '>':
|
||||
self.token = ''
|
||||
return (self.parse_wclose, j+1)
|
||||
self.add_token(KWD(c))
|
||||
return (self.parse_main, j+1)
|
||||
|
||||
def add_token(self, obj):
|
||||
self.tokens.append((self.tokenstart, obj))
|
||||
return
|
||||
|
||||
def parse_comment(self, s, i):
|
||||
m = EOL.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_comment, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
# We ignore comments.
|
||||
#self.tokens.append(self.token)
|
||||
return (self.parse_main, j)
|
||||
|
||||
def parse_literal(self, s, i):
|
||||
m = END_LITERAL.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_literal, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
c = s[j]
|
||||
if c == '#':
|
||||
self.hex = ''
|
||||
return (self.parse_literal_hex, j+1)
|
||||
self.add_token(LIT(self.token))
|
||||
return (self.parse_main, j)
|
||||
|
||||
def parse_literal_hex(self, s, i):
|
||||
c = s[i]
|
||||
if HEX.match(c) and len(self.hex) < 2:
|
||||
self.hex += c
|
||||
return (self.parse_literal_hex, i+1)
|
||||
if self.hex:
|
||||
self.token += chr(int(self.hex, 16))
|
||||
return (self.parse_literal, i)
|
||||
|
||||
def parse_number(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_number, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
c = s[j]
|
||||
if c == '.':
|
||||
self.token += c
|
||||
return (self.parse_float, j+1)
|
||||
try:
|
||||
self.add_token(int(self.token))
|
||||
except ValueError:
|
||||
pass
|
||||
return (self.parse_main, j)
|
||||
def parse_float(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_float, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
self.add_token(float(self.token))
|
||||
return (self.parse_main, j)
|
||||
|
||||
def parse_keyword(self, s, i):
|
||||
m = END_KEYWORD.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_keyword, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
if self.token == 'true':
|
||||
token = True
|
||||
elif self.token == 'false':
|
||||
token = False
|
||||
else:
|
||||
token = KWD(self.token)
|
||||
self.add_token(token)
|
||||
return (self.parse_main, j)
|
||||
|
||||
def parse_string(self, s, i):
|
||||
m = END_STRING.search(s, i)
|
||||
if not m:
|
||||
self.token += s[i:]
|
||||
return (self.parse_string, len(s))
|
||||
j = m.start(0)
|
||||
self.token += s[i:j]
|
||||
c = s[j]
|
||||
if c == '\\':
|
||||
self.oct = ''
|
||||
return (self.parse_string_1, j+1)
|
||||
if c == '(':
|
||||
self.paren += 1
|
||||
self.token += c
|
||||
return (self.parse_string, j+1)
|
||||
if c == ')':
|
||||
self.paren -= 1
|
||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||
self.token += c
|
||||
return (self.parse_string, j+1)
|
||||
self.add_token(self.token)
|
||||
return (self.parse_main, j+1)
|
||||
def parse_string_1(self, s, i):
    '''
    Lexer state: handles the character(s) following a backslash inside a
    parenthesised string. s is the current chunk and i the offset of the
    character to examine.

    Returns a (next_state, next_offset) pair:
      * while the character matches OCT_STRING and fewer than three
        digits are collected, the digit is added to self.oct and this
        state continues at the next character;
      * once collected digits are followed by anything else, the octal
        value is appended to self.token and parse_string resumes at the
        same character (it is not consumed here);
      * otherwise, if the character is a key of ESC_STRING, the mapped
        character code is appended to self.token; any other character is
        dropped. In both cases parse_string resumes after the character.
    '''
    c = s[i]
    if OCT_STRING.match(c) and len(self.oct) < 3:
        self.oct += c
        return (self.parse_string_1, i+1)
    if self.oct:
        # Three octal digits can encode up to 0777, which does not fit in
        # one byte and would make chr() raise ValueError. Keep only the
        # low eight bits (high-order overflow is ignored).
        self.token += chr(int(self.oct, 8) & 0xff)
        return (self.parse_string, i)
    if c in ESC_STRING:
        self.token += chr(ESC_STRING[c])
    # Always hand control back to parse_string so that an unrecognised
    # escape character can never leave the lexer without a next state.
    return (self.parse_string, i+1)
|
||||
|
||||
def parse_wopen(self, s, i):
    '''
    Lexer state: examines the character s[i] that follows a '<'.

    Returns a (next_state, next_offset) pair:
      * whitespace or a character matched by HEX: parse_hexstring takes
        over at the same offset;
      * a second '<': KEYWORD_DICT_BEGIN is emitted via add_token() and
        parse_main resumes after it;
      * anything else: parse_main resumes at the same offset.
    '''
    ch = s[i]
    if ch.isspace() or HEX.match(ch):
        return (self.parse_hexstring, i)
    if ch != '<':
        return (self.parse_main, i)
    self.add_token(KEYWORD_DICT_BEGIN)
    return (self.parse_main, i+1)
|
||||
|
||||
def parse_wclose(self, s, i):
    '''
    Lexer state: examines the character s[i] that follows a '>'.

    Returns (self.parse_main, next_offset). If s[i] is a second '>',
    KEYWORD_DICT_END is emitted via add_token() and the offset moves past
    it; otherwise nothing is emitted and the offset is unchanged.
    '''
    if s[i] != '>':
        return (self.parse_main, i)
    self.add_token(KEYWORD_DICT_END)
    return (self.parse_main, i+1)
|
||||
|
||||
def parse_hexstring(self, s, i):
    '''
    Lexer state: collects the body of a hexadecimal string from s
    starting at offset i.

    Returns a (next_state, next_offset) pair. If END_HEX_STRING finds no
    terminator in s, the rest of s is buffered in self.token and this
    state resumes at len(s). Otherwise everything matched by SPC is
    removed from the buffered text, each HEX_PAIR match is replaced by
    the character with that base-16 code, the result is emitted via
    add_token(), and parse_main resumes at the terminator.
    '''
    found = END_HEX_STRING.search(s, i)
    if found is None:
        # The chunk ended inside the string; keep buffering.
        self.token += s[i:]
        return (self.parse_hexstring, len(s))
    end = found.start(0)
    self.token += s[i:end]

    def decode(pair):
        return chr(int(pair.group(0), 16))

    digits = SPC.sub('', self.token)
    self.add_token(HEX_PAIR.sub(decode, digits))
    return (self.parse_main, end)
|
||||
|
||||
def nexttoken(self):
    '''
    Returns the oldest pending token, running the lexer until one exists.

    While self.tokens is empty, fillbuf() is called and the current state
    function self.parse1 is applied to (self.buf, self.charpos); the pair
    it returns becomes the new state function and offset. The first entry
    of self.tokens is then removed and returned. With self.debug at 2 or
    more the token is also printed to stderr.
    '''
    while not self.tokens:
        self.fillbuf()
        step = self.parse1(self.buf, self.charpos)
        (self.parse1, self.charpos) = step
    head = self.tokens.pop(0)
    if self.debug >= 2:
        print >>stderr, 'nexttoken: %r' % (head,)
    return head
|
||||
|
||||
EOLCHAR = re.compile(r'[\r\n]')
|
||||
def nextline(self):
|
||||
'''
|
||||
Fetches a next line that ends either with \\r or \\n.
|
||||
'''
|
||||
line = ''
|
||||
eol = None
|
||||
linebuf = ''
|
||||
linepos = self.bufpos + self.charpos
|
||||
eol = False
|
||||
while 1:
|
||||
if not self.linebuf or len(self.linebuf) <= self.curpos:
|
||||
# fetch next chunk.
|
||||
self.linebuf = self.fp.read(self.bufsize)
|
||||
if not self.linebuf:
|
||||
# at EOF.
|
||||
break
|
||||
self.curpos = 0
|
||||
self.fillbuf()
|
||||
if eol:
|
||||
c = self.linebuf[self.curpos]
|
||||
c = self.buf[self.charpos]
|
||||
# handle '\r\n'
|
||||
if (eol == '\r' and c == '\n'):
|
||||
line += c
|
||||
self.curpos += 1
|
||||
if c == '\n':
|
||||
linebuf += c
|
||||
self.charpos += 1
|
||||
break
|
||||
m = self.EOLCHAR.search(self.linebuf, self.curpos)
|
||||
m = EOL.search(self.buf, self.charpos)
|
||||
if m:
|
||||
i = m.end(0)
|
||||
line += self.linebuf[self.curpos:i]
|
||||
eol = self.linebuf[i-1]
|
||||
self.curpos = i
|
||||
linebuf += self.buf[self.charpos:m.end(0)]
|
||||
self.charpos = m.end(0)
|
||||
if linebuf[-1] == '\r':
|
||||
eol = True
|
||||
else:
|
||||
# fetch further
|
||||
line += self.linebuf[self.curpos:]
|
||||
self.linebuf = None
|
||||
linepos = self.linepos
|
||||
self.linepos += len(line)
|
||||
return (linepos, line)
|
||||
break
|
||||
else:
|
||||
linebuf += self.buf[self.charpos:]
|
||||
self.charpos = len(self.buf)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'nextline: %r' % ((linepos, linebuf),)
|
||||
return (linepos, linebuf)
|
||||
|
||||
def revreadlines(self):
|
||||
'''
|
||||
|
@ -168,9 +393,9 @@ class PSBaseParser:
|
|||
pos = self.fp.tell()
|
||||
buf = ''
|
||||
while 0 < pos:
|
||||
pos = max(0, pos-self.bufsize)
|
||||
pos = max(0, pos-self.BUFSIZ)
|
||||
self.fp.seek(pos)
|
||||
s = self.fp.read(self.bufsize)
|
||||
s = self.fp.read(self.BUFSIZ)
|
||||
if not s: break
|
||||
while 1:
|
||||
n = max(s.rfind('\r'), s.rfind('\n'))
|
||||
|
@ -182,263 +407,202 @@ class PSBaseParser:
|
|||
buf = ''
|
||||
return
|
||||
|
||||
# regex patterns for basic lexical scanning.
|
||||
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
|
||||
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
|
||||
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
|
||||
NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
|
||||
STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
|
||||
STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
|
||||
STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
|
||||
STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
|
||||
|
||||
def parse(self):
|
||||
'''
|
||||
Yields a list of tuples (pos, token) of the following:
|
||||
keywords, literals, strings, numbers and parentheses.
|
||||
Comments are skipped.
|
||||
Nested objects (i.e. arrays and dictionaries) are not handled here.
|
||||
'''
|
||||
while 1:
|
||||
# do not strip line! we need to distinguish last '\n' or '\r'
|
||||
(linepos, line) = self.nextline()
|
||||
if not line: break
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'line: (%d) %r' % (linepos, line)
|
||||
# do this before removing comment
|
||||
if line.startswith('%%EOF'): break
|
||||
charpos = 0
|
||||
|
||||
# tokenize
|
||||
self.go = True
|
||||
while self.go:
|
||||
m = self.TOKEN.search(line, charpos)
|
||||
if not m: break
|
||||
t = m.group(0)
|
||||
pos = linepos + m.start(0)
|
||||
charpos = m.end(0)
|
||||
|
||||
if t == '%':
|
||||
# skip comment
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'comment: %r' % line[charpos:]
|
||||
break
|
||||
|
||||
elif t == '/':
|
||||
# literal object
|
||||
mn = self.LITERAL.match(line, m.start(0)+1)
|
||||
lit = PSLiteralTable.intern(mn.group(0))
|
||||
yield (pos, lit)
|
||||
charpos = mn.end(0)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'name: %r' % lit
|
||||
|
||||
elif t == '(':
|
||||
# normal string object
|
||||
s = ''
|
||||
while 1:
|
||||
ms = self.STRING_NORM.match(line, charpos)
|
||||
if not ms: break
|
||||
s1 = ms.group(0)
|
||||
charpos = ms.end(0)
|
||||
if len(s1) == 1 and s1[-1] == '\\':
|
||||
s += s1[-1:]
|
||||
(linepos, line) = self.nextline()
|
||||
if not line:
|
||||
if STRICT:
|
||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||
(linepos, line))
|
||||
break
|
||||
charpos = 0
|
||||
elif charpos == len(line):
|
||||
s += s1
|
||||
(linepos, line) = self.nextline()
|
||||
if not line:
|
||||
if STRICT:
|
||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||
(linepos, line))
|
||||
break
|
||||
charpos = 0
|
||||
else:
|
||||
s += s1
|
||||
break
|
||||
if line[charpos] == ')':
|
||||
charpos += 1
|
||||
else:
|
||||
if STRICT:
|
||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||
(linepos, line))
|
||||
pass
|
||||
def convesc(m):
|
||||
x = m.group(0)
|
||||
if x[1:].isdigit():
|
||||
return chr(int(x[1:], 8))
|
||||
else:
|
||||
return x[1]
|
||||
s = self.STRING_NORM_SUB.sub(convesc, s)
|
||||
if self.strfilter:
|
||||
s = self.strfilter(s)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'str: %r' % s
|
||||
yield (pos, s)
|
||||
|
||||
elif t == '<':
|
||||
# hex string object
|
||||
ms = self.STRING_HEX.match(line, charpos)
|
||||
charpos = ms.end(0)
|
||||
if line[charpos] == '>':
|
||||
charpos += 1
|
||||
else:
|
||||
if STRICT:
|
||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||
(linepos, line))
|
||||
def convhex(m1):
|
||||
return chr(int(m1.group(0), 16))
|
||||
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'str: %r' % s
|
||||
yield (pos, s)
|
||||
|
||||
elif self.NUMBER.match(t):
|
||||
# number
|
||||
if '.' in t:
|
||||
n = float(t)
|
||||
else:
|
||||
n = int(t)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'number: %r' % n
|
||||
yield (pos, n)
|
||||
|
||||
elif t in ('true', 'false'):
|
||||
# boolean
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'boolean: %r' % t
|
||||
yield (pos, (t == 'true'))
|
||||
|
||||
else:
|
||||
# other token
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'keyword: %r' % t
|
||||
yield (pos, PSKeywordTable.intern(t))
|
||||
|
||||
return
|
||||
|
||||
|
||||
## PSStackParser
|
||||
##
|
||||
class PSStackParser(PSBaseParser):
|
||||
|
||||
'''
|
||||
PostScript parser that recognizes compound objects
|
||||
such as arrays and dictionaries.
|
||||
'''
|
||||
|
||||
def __init__(self, fp, debug=0):
|
||||
PSBaseParser.__init__(self,fp, debug=debug)
|
||||
self.reset()
|
||||
return
|
||||
|
||||
def reset(self):
|
||||
self.context = []
|
||||
self.partobj = None
|
||||
self.curtype = None
|
||||
self.curstack = []
|
||||
self.results = []
|
||||
return
|
||||
|
||||
def do_token(self, pos, token):
|
||||
'''
|
||||
Handles special tokens.
|
||||
Returns true if the token denotes the end of an object.
|
||||
'''
|
||||
return False
|
||||
|
||||
def push(self, obj):
|
||||
'''
|
||||
Push an object to the stack.
|
||||
'''
|
||||
self.partobj.append(obj)
|
||||
def push(self, *objs):
|
||||
self.curstack.extend(objs)
|
||||
return
|
||||
|
||||
def pop(self, n):
|
||||
'''
|
||||
Pop N objects from the stack.
|
||||
'''
|
||||
if len(self.partobj) < n:
|
||||
if STRICT:
|
||||
raise PSSyntaxError('stack too short < %d' % n)
|
||||
r = self.partobj[-n:]
|
||||
self.partobj = self.partobj[:-n]
|
||||
return r
|
||||
|
||||
objs = self.curstack[-n:]
|
||||
self.curstack[-n:] = []
|
||||
return objs
|
||||
def popall(self):
|
||||
'''
|
||||
Discards all the objects on the stack.
|
||||
'''
|
||||
self.partobj = []
|
||||
objs = self.curstack
|
||||
self.curstack = []
|
||||
return objs
|
||||
def add_results(self, *objs):
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'add_results: %r' % (objs,)
|
||||
self.results.extend(objs)
|
||||
return
|
||||
|
||||
def parse(self):
|
||||
def start_type(self, pos, type):
|
||||
self.context.append((pos, self.curtype, self.curstack))
|
||||
(self.curtype, self.curstack) = (type, [])
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'start_type: pos=%r, type=%r' % (pos, type)
|
||||
return
|
||||
def end_type(self, type):
|
||||
if self.curtype != type:
|
||||
raise PSTypeError('type mismatch: %r != %r' % (self.curtype, type))
|
||||
objs = [ obj for (_,obj) in self.curstack ]
|
||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
|
||||
return (pos, objs)
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
return
|
||||
def flush(self):
|
||||
return
|
||||
|
||||
def nextobject(self):
|
||||
'''
|
||||
Yields a list of objects: keywords, literals, strings,
|
||||
numbers, arrays and dictionaries. Arrays and dictionaries
|
||||
are represented as Python sequence and dictionaries.
|
||||
'''
|
||||
|
||||
def startobj(type):
|
||||
self.context.append((type, self.partobj))
|
||||
self.partobj = []
|
||||
return
|
||||
|
||||
def endobj(type1):
|
||||
if not self.context:
|
||||
if STRICT:
|
||||
raise PSTypeError('stack empty.')
|
||||
obj = self.partobj
|
||||
(type0, partobj) = self.context[-1]
|
||||
if type0 == type1:
|
||||
self.partobj = partobj
|
||||
self.context.pop()
|
||||
while not self.results:
|
||||
(pos, token) = self.nexttoken()
|
||||
#print (pos,token), (self.curtype, self.curstack)
|
||||
if (isinstance(token, int) or
|
||||
isinstance(token, float) or
|
||||
isinstance(token, bool) or
|
||||
isinstance(token, str) or
|
||||
isinstance(token, PSLiteral)):
|
||||
# normal token
|
||||
self.push((pos, token))
|
||||
elif token == KEYWORD_ARRAY_BEGIN:
|
||||
# begin array
|
||||
self.start_type(pos, 'a')
|
||||
elif token == KEYWORD_ARRAY_END:
|
||||
# end array
|
||||
try:
|
||||
self.push(self.end_type('a'))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
elif token == KEYWORD_DICT_BEGIN:
|
||||
# begin dictionary
|
||||
self.start_type(pos, 'd')
|
||||
elif token == KEYWORD_DICT_END:
|
||||
# end dictionary
|
||||
try:
|
||||
(pos, objs) = self.end_type('d')
|
||||
if len(objs) % 2 != 0:
|
||||
raise PSSyntaxError('invalid dictionary construct: %r' % objs)
|
||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs))
|
||||
self.push((pos, d))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
else:
|
||||
if STRICT:
|
||||
raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
|
||||
(type0, self.partobj, type1, obj))
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||
(pos, token, self.curstack)
|
||||
self.do_keyword(pos, token)
|
||||
if self.context:
|
||||
continue
|
||||
else:
|
||||
self.flush()
|
||||
obj = self.results.pop(0)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'nextobject: %r' % (obj,)
|
||||
return obj
|
||||
|
||||
startobj('o')
|
||||
|
||||
for (pos,t) in PSBaseParser.parse(self):
|
||||
if isinstance(t, int) or isinstance(t, float):
|
||||
self.push(t)
|
||||
elif isinstance(t, str):
|
||||
self.push(t)
|
||||
elif isinstance(t, PSLiteral):
|
||||
self.push(t)
|
||||
else:
|
||||
c = keyword_name(t)
|
||||
if c == '{' or c == '}':
|
||||
self.push(t)
|
||||
elif c == '[':
|
||||
# begin array
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'start array'
|
||||
startobj('a')
|
||||
elif c == ']':
|
||||
# end array
|
||||
a = endobj('a')
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'end array: %r' % a
|
||||
self.push(a)
|
||||
elif c == '<<':
|
||||
# begin dictionary
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'start dict'
|
||||
startobj('d')
|
||||
elif c == '>>':
|
||||
# end dictionary
|
||||
objs = endobj('d')
|
||||
if len(objs) % 2 != 0:
|
||||
if STRICT:
|
||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'end dict: %r' % d
|
||||
self.push(d)
|
||||
elif self.do_token(pos, t):
|
||||
break
|
||||
## Simplistic Test cases
|
||||
##
|
||||
import unittest
|
||||
class TestPSBaseParser(unittest.TestCase):
|
||||
|
||||
objs = endobj('o')
|
||||
return objs
|
||||
TESTDATA = r'''%!PS
|
||||
begin end
|
||||
" @ #
|
||||
/a/BCD /Some_Name /foo#5f#xbaa
|
||||
0 +1 -2 .5 1.234
|
||||
(abc) () (abc ( def ) ghi)
|
||||
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
|
||||
(this % is not a comment.)
|
||||
(foo
|
||||
baa)
|
||||
(foo\
|
||||
baa)
|
||||
<20> < 40 4020 >
|
||||
<abcd00
|
||||
12345>
|
||||
func/a/b{(c)do*}def
|
||||
[ 1 (z) ! ]
|
||||
<< /foo (bar) >>
|
||||
'''
|
||||
|
||||
TOKENS = [
|
||||
(5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')),
|
||||
(21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
||||
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
||||
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
||||
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
|
||||
(223, KWD('func')), (227, LIT('a')), (229, LIT('b')),
|
||||
(231, KWD('{')), (232, 'c'), (235, KWD('do*')), (238, KWD('}')),
|
||||
(239, KWD('def')), (243, KWD('[')), (245, 1), (247, 'z'), (251, KWD('!')),
|
||||
(253, KWD(']')), (255, KWD('<<')), (258, LIT('foo')), (263, 'bar'),
|
||||
(269, KWD('>>'))
|
||||
]
|
||||
|
||||
OBJS = [
|
||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
||||
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
||||
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
||||
(191, ' '), (196, '@@ '), (208, '\xab\xcd\x00\x124\x05'),
|
||||
(227, LIT('a')), (229, LIT('b')), (232, 'c'), (243, [1, 'z']),
|
||||
(255, {'foo': 'bar'}),
|
||||
]
|
||||
|
||||
def get_tokens(self, s):
|
||||
import StringIO
|
||||
class MyParser(PSBaseParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
parser = MyParser(StringIO.StringIO(s), debug=1)
|
||||
r = []
|
||||
try:
|
||||
while 1:
|
||||
r.append(parser.nexttoken())
|
||||
except PSEOF:
|
||||
pass
|
||||
return r
|
||||
|
||||
def get_objects(self, s):
|
||||
import StringIO
|
||||
class MyParser(PSStackParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
parser = MyParser(StringIO.StringIO(s), debug=1)
|
||||
r = []
|
||||
try:
|
||||
while 1:
|
||||
r.append(parser.nextobject())
|
||||
except PSEOF:
|
||||
pass
|
||||
return r
|
||||
|
||||
def test_1(self):
|
||||
tokens = self.get_tokens(self.TESTDATA)
|
||||
print tokens
|
||||
self.assertEqual(tokens, self.TOKENS)
|
||||
return
|
||||
|
||||
def test_2(self):
|
||||
objs = self.get_objects(self.TESTDATA)
|
||||
print objs
|
||||
self.assertEqual(objs, self.OBJS)
|
||||
return
|
||||
|
||||
if __name__ == '__main__': unittest.main()
|
||||
|
|
Loading…
Reference in New Issue