git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@9 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-01-07 13:47:52 +00:00
parent a8bae61c25
commit 196ece7913
7 changed files with 141 additions and 105 deletions

View File

@ -6,7 +6,7 @@ TAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python
WORKDIR=.. WORKDIR=/tmp
DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz DISTFILE=$(DISTNAME).tar.gz

View File

@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz"> <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
</a> </a>
(220kbytes)
<P> <P>
<strong>Svn repository:</strong><br> <strong>Svn repository:</strong><br>
@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
<p> <p>
<strong>Dump the contents:</strong> <strong>Dump the contents:</strong>
<blockquote><pre> <blockquote><pre>
$ ./dumppdf.py foo.pdf $ ./dumppdf.py -a foo.pdf
</pre></blockquote> </pre></blockquote>
<p> <p>
@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf
$ ./pdf2txt.py -c euc-jp samples/jo.pdf $ ./pdf2txt.py -c euc-jp samples/jo.pdf
</pre></blockquote> </pre></blockquote>
<hr>
<h2>Similar Projects</h2>
<ul>
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
</ul>
<hr> <hr>
<h2>Terms and conditions</h2> <h2>Terms and conditions</h2>
<p> <p>

View File

@ -83,8 +83,8 @@ def dumptrailers(out, doc):
out.write('\n</trailer>\n\n') out.write('\n</trailer>\n\n')
return return
# dumpall # dumpallobjs
def dumpall(out, doc): def dumpallobjs(out, doc):
out.write('<pdf>') out.write('<pdf>')
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xrange(xref.objid0, xref.objid1+1): for objid in xrange(xref.objid0, xref.objid1+1):
@ -93,7 +93,7 @@ def dumpall(out, doc):
out.write('<object id="%d">\n' % objid) out.write('<object id="%d">\n' % objid)
dumpxml(out, obj) dumpxml(out, obj)
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except PDFValueError: except:
pass pass
dumptrailers(out, doc) dumptrailers(out, doc)
out.write('</pdf>') out.write('</pdf>')
@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids,
if page.pageid in pageids: if page.pageid in pageids:
dumpxml(outfp, page.attrs) dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpall(outfp, doc) dumpallobjs(outfp, doc)
if (not objids) and (not pageids) and (not dumpall): if (not objids) and (not pageids) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc)
fp.close() fp.close()

View File

@ -23,8 +23,9 @@ class TextConverter(PDFDevice):
self.outfp.write('\n') self.outfp.write('\n')
return return
def begin_block(self, name): def begin_block(self, name, (x0,y0,x1,y1)):
self.outfp.write('<block name="%s">\n' % name) self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1))
return return
def end_block(self): def end_block(self):
self.outfp.write('</block>\n') self.outfp.write('</block>\n')
@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0] print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dvp:c:') (opts, args) = getopt.getopt(argv[1:], 'dp:c:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()

View File

@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def apply_matrix((a,b,c,d,e,f), (x,y)): def apply_matrix((a,b,c,d,e,f), (x,y)):
'''Applies a matrix to a coordination.''' '''Applies a matrix to coordinates.'''
return (a*x+c*y+e, b*x+d*y+f) return (a*x+c*y+e, b*x+d*y+f)
def cs_params(cs): def cs_params(cs):
'''Returns a number of components for a given colorspace.'''
t = cs[0] t = cs[0]
if t == LITERAL_ICC_BASED: if t == LITERAL_ICC_BASED:
return stream_value(cs[1]).dic['N'] return stream_value(cs[1]).dic['N']
elif t == LITERAL_DEVICE_N: elif t == LITERAL_DEVICE_N:
return len(list_value(cs[1])) return len(list_value(cs[1]))
else: else:
return CS_COMPONENTS[t] return CS_COMPONENTS.get(t, 0)
## Fonts ## Fonts
@ -438,7 +439,7 @@ class PDFDevice:
self.ctm = ctm self.ctm = ctm
return return
def begin_block(self, name): def begin_block(self, name, bbox):
return return
def end_block(self): def end_block(self):
return return
@ -589,11 +590,11 @@ class PDFPageInterpreter:
# setcolorspace-stroking # setcolorspace-stroking
def do_CS(self, name): def do_CS(self, name):
self.scs = self.csmap.get(literal_name(name), None) self.scs = self.csmap.get(literal_name(name), [name])
return return
# setcolorspace-non-stroking # setcolorspace-non-stroking
def do_cs(self, name): def do_cs(self, name):
self.ncs = self.csmap.get(literal_name(name), None) self.ncs = self.csmap.get(literal_name(name), [name])
return return
# setgray-stroking # setgray-stroking
def do_G(self, gray): def do_G(self, gray):
@ -770,34 +771,46 @@ class PDFPageInterpreter:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj print >>stderr, 'Processing xobj: %r' % xobj
interpreter = PDFPageInterpreter(self.rsrc, self.device) interpreter = PDFPageInterpreter(self.rsrc, self.device)
interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj], (x0,y0,x1,y1) = xobj.dic['BBox']
xobj.dic.get('Matrix', MATRIX_IDENTITY)) ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
(x0,y0) = apply_matrix(ctm, (x0,y0))
(x1,y1) = apply_matrix(ctm, (x1,y1))
interpreter.render_contents(xobjid,
(x0,y0,x1,y1),
xobj.dic.get('Resources'),
[xobj],
ctm=ctm)
return return
def process_page(self, page): def process_page(self, page):
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page print >>stderr, 'Processing page: %r' % page
self.render_contents('page-%d' % page.pageid, page.resources, page.contents) self.render_contents('page-%d' % page.pageid,
page.mediabox,
page.resources,
page.contents)
return return
def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY): def render_contents(self, contid, mediabox, resources, contents,
ctm=MATRIX_IDENTITY):
self.initpage(ctm) self.initpage(ctm)
self.device.begin_block(contid) self.device.begin_block(contid, mediabox)
# Handle resource declarations. # Handle resource declarations.
for (k,v) in dict_value(resources).iteritems(): if resources:
if 1 <= self.debug: for (k,v) in dict_value(resources).iteritems():
print >>stderr, 'Resource: %r: %r' % (k,v) if 1 <= self.debug:
if k == 'Font': print >>stderr, 'Resource: %r: %r' % (k,v)
for (fontid,fontrsrc) in dict_value(v).iteritems(): if k == 'Font':
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) for (fontid,fontrsrc) in dict_value(v).iteritems():
elif k == 'ColorSpace': self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
for (csid,csspec) in dict_value(v).iteritems(): elif k == 'ColorSpace':
self.csmap[csid] = list_value(csspec) for (csid,csspec) in dict_value(v).iteritems():
elif k == 'ProcSet': self.csmap[csid] = list_value(csspec)
self.rsrc.get_procset(list_value(v)) elif k == 'ProcSet':
elif k == 'XObject': self.rsrc.get_procset(list_value(v))
for (xobjid,xobjstrm) in dict_value(v).iteritems(): elif k == 'XObject':
self.xobjmap[xobjid] = xobjstrm for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
for stream in list_value(contents): for stream in list_value(contents):
self.execute(stream_value(stream)) self.execute(stream_value(stream))
self.device.end_block() self.device.end_block()

View File

@ -18,8 +18,8 @@ import sys, re
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \ from psparser import PSException, PSSyntaxError, PSTypeError, \
PSLiteral, PSKeyword, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ literal_name, keyword_name, \
PSStackParser PSStackParser
@ -76,8 +76,7 @@ def resolveall(x):
''' '''
Recursively resolve X and all the internals. Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
This procedure might be slow. Do not used it unless This procedure might be slow.
you really need it.
''' '''
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
@ -209,13 +208,12 @@ class PDFStream:
## ##
class PDFPage: class PDFPage:
def __init__(self, doc, pageidx, attrs, parent_attrs): def __init__(self, doc, pageidx, attrs):
self.doc = doc self.doc = doc
self.pageid = pageidx self.pageid = pageidx
self.attrs = dict_value(attrs) self.attrs = dict_value(attrs)
self.parent_attrs = parent_attrs self.resources = resolve1(self.attrs['Resources'])
self.resources = self.get_attr('Resources') self.mediabox = resolve1(self.attrs['MediaBox'])
self.mediabox = self.get_attr('MediaBox')
contents = resolve1(self.attrs['Contents']) contents = resolve1(self.attrs['Contents'])
if not isinstance(contents, list): if not isinstance(contents, list):
contents = [ contents ] contents = [ contents ]
@ -225,11 +223,6 @@ class PDFPage:
def __repr__(self): def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox) return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
def get_attr(self, k):
if k in self.attrs:
return resolve1(self.attrs[k])
return self.parent_attrs.get(k)
## XRefs ## XRefs
@ -239,7 +232,7 @@ class PDFXRef:
def __init__(self, parser): def __init__(self, parser):
while 1: while 1:
line = parser.nextline() (_, line) = parser.nextline()
if not line: if not line:
raise PDFSyntaxError('premature eof: %r' % parser) raise PDFSyntaxError('premature eof: %r' % parser)
line = line.strip() line = line.strip()
@ -253,7 +246,7 @@ class PDFXRef:
self.objid1 = start+nobjs self.objid1 = start+nobjs
self.offsets = [] self.offsets = []
for objid in xrange(start, start+nobjs): for objid in xrange(start, start+nobjs):
line = parser.nextline() (_, line) = parser.nextline()
f = line.strip().split(' ') f = line.strip().split(' ')
if len(f) != 3: if len(f) != 3:
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line)) raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
@ -361,13 +354,12 @@ class PDFDocument:
self.parsed_objs[stream] = objs self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index] obj = objs[stream.dic['N']*2+index]
else: else:
pos0 = self.parser.linepos prevpos = self.parser.seek(index)
self.parser.seek(index)
seq = list_value(self.parser.parse()) seq = list_value(self.parser.parse())
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ): if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
raise PDFSyntaxError('invalid stream spec: %r' % seq) raise PDFSyntaxError('invalid stream spec: %r' % seq)
obj = seq[3] obj = seq[3]
self.parser.seek(pos0) self.parser.seek(prevpos)
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj) print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj self.objs[objid] = obj
@ -376,7 +368,10 @@ class PDFDocument:
def get_pages(self, debug=0): def get_pages(self, debug=0):
assert self.xrefs assert self.xrefs
def search(obj, parent): def search(obj, parent):
tree = dict_value(obj) tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k not in tree:
tree[k] = v
if tree['Type'] == LITERAL_PAGES: if tree['Type'] == LITERAL_PAGES:
if 1 <= debug: if 1 <= debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids'] print >>stderr, 'Pages: Kids=%r' % tree['Kids']
@ -386,9 +381,9 @@ class PDFDocument:
elif tree['Type'] == LITERAL_PAGE: elif tree['Type'] == LITERAL_PAGE:
if 1 <= debug: if 1 <= debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield (tree, parent) yield tree
for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)): for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree, parent) yield PDFPage(self, i, tree)
return return
def set_root(self, root): def set_root(self, root):
@ -440,19 +435,19 @@ class PDFParser(PSStackParser):
raise PDFValueError('/Length is undefined: %r' % dic) raise PDFValueError('/Length is undefined: %r' % dic)
objlen = int_value(dic['Length']) objlen = int_value(dic['Length'])
self.seek(pos) self.seek(pos)
line = self.nextline() # 'stream' (_, line) = self.nextline() # 'stream'
self.fp.seek(pos+len(line)) self.fp.seek(pos+len(line))
data = self.fp.read(objlen) data = self.fp.read(objlen)
self.seek(pos+len(line)+objlen) self.seek(pos+len(line)+objlen)
while 1: while 1:
line = self.nextline() (linepos, line) = self.nextline()
if not line: if not line:
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' % raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
(self.linepos, line)) (linepos, line))
if line.strip(): if line.strip():
if not line.startswith('endstream'): if not line.startswith('endstream'):
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' % raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
(self.linepos, line)) (linepos, line))
break break
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
@ -510,17 +505,16 @@ class PDFParser(PSStackParser):
self.find_xref() self.find_xref()
while 1: while 1:
# read xref table # read xref table
pos0 = self.linepos (linepos, line) = self.nextline()
line = self.nextline()
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'line: %r' % line print >>stderr, 'line: %r' % line
if line[0].isdigit(): if line[0].isdigit():
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
self.seek(pos0) self.seek(linepos)
xref = PDFXRefStream(self) xref = PDFXRefStream(self)
elif line.strip() != 'xref': elif line.strip() != 'xref':
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
(self.linepos, line)) (linepos, line))
else: else:
xref = PDFXRef(self) xref = PDFXRef(self)
yield xref yield xref
@ -531,10 +525,10 @@ class PDFParser(PSStackParser):
self.seek(int_value(trailer['XRefStm'])) self.seek(int_value(trailer['XRefStm']))
if 'Prev' in trailer: if 'Prev' in trailer:
# find previous xref # find previous xref
pos0 = int_value(trailer['Prev']) pos = int_value(trailer['Prev'])
self.seek(pos0) self.seek(pos)
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'prev trailer: pos=%d' % pos0 print >>stderr, 'prev trailer: pos=%d' % pos
else: else:
break break
return return

View File

@ -12,36 +12,48 @@ class PSTypeError(PSException): pass
class PSValueError(PSException): pass class PSValueError(PSException): pass
## PostScript Types ## Basic PostScript Types
## ##
# PSLiteral
class PSLiteral: class PSLiteral:
''' '''
PS literals (e.g. "/Name"). PS literals (e.g. "/Name").
Caution: Never create these objects directly. Caution: Never create these objects directly.
Use PSLiteralTable.intern() instead. Use PSLiteralTable.intern() instead.
''' '''
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
return return
def __repr__(self): def __repr__(self):
return '/%s' % self.name return '/%s' % self.name
# PSKeyword
class PSKeyword: class PSKeyword:
''' '''
PS keywords (e.g. "showpage"). PS keywords (e.g. "showpage").
Caution: Never create these objects directly. Caution: Never create these objects directly.
Use PSKeywordTable.intern() instead. Use PSKeywordTable.intern() instead.
''' '''
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
return return
def __repr__(self): def __repr__(self):
return self.name return self.name
# PSSymbolTable
class PSSymbolTable: class PSSymbolTable:
''' '''
Symbol table that stores PSLiteral or PSKeyword. Symbol table that stores PSLiteral or PSKeyword.
''' '''
def __init__(self, classe): def __init__(self, classe):
self.dic = {} self.dic = {}
self.classe = classe self.classe = classe
@ -74,7 +86,9 @@ def keyword_name(x):
## ##
class PSBaseParser: class PSBaseParser:
'''PostScript parser that performs only basic tokenization.''' '''
Most basic PostScript parser that performs only basic tokenization.
'''
def __init__(self, fp, debug=0): def __init__(self, fp, debug=0):
self.fp = fp self.fp = fp
@ -88,21 +102,22 @@ class PSBaseParser:
def seek(self, pos): def seek(self, pos):
''' '''
seeks to the given pos. Seeks the parser to the given position.
''' '''
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'seek:', pos print >>stderr, 'seek:', pos
prevpos = self.fp.tell()
self.fp.seek(pos) self.fp.seek(pos)
self.linepos = pos self.linebuf = None # line buffer.
self.linebuf = None self.curpos = 0 # current position in the buffer.
self.curpos = 0 self.linepos = pos # the beginning of the current line.
self.line = '' self.go = False
return return prevpos
EOLCHAR = re.compile(r'[\r\n]') EOLCHAR = re.compile(r'[\r\n]')
def nextline(self): def nextline(self):
''' '''
fetches the next line that ends either with \\r or \\n. Fetches the next line that ends either with \\r or \\n.
''' '''
line = '' line = ''
eol = None eol = None
@ -131,12 +146,14 @@ class PSBaseParser:
# fetch further # fetch further
line += self.linebuf[self.curpos:] line += self.linebuf[self.curpos:]
self.linebuf = None self.linebuf = None
linepos = self.linepos
self.linepos += len(line) self.linepos += len(line)
return line return (linepos, line)
def revreadlines(self): def revreadlines(self):
''' '''
fetches lines backward. used to locate trailers. Fetches a next line backward. This is used to locate
the trailers at the end of a file.
''' '''
self.fp.seek(0, 2) self.fp.seek(0, 2)
pos = self.fp.tell() pos = self.fp.tell()
@ -156,6 +173,7 @@ class PSBaseParser:
buf = '' buf = ''
return return
# regex patterns for basic lexical scanning.
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
@ -167,38 +185,39 @@ class PSBaseParser:
def parse(self): def parse(self):
''' '''
Yields a list of basic tokens: keywords, literals, strings, Yields a list of tuples (pos, token) of the following:
numbers and parentheses. Comments are skipped. keywords, literals, strings, numbers and parentheses.
Nested objects (i.e. arrays and dictionaries) are not handled. Comments are skipped.
Nested objects (i.e. arrays and dictionaries) are not handled here.
''' '''
while 1: while 1:
# do not strip line! we need to distinguish last '\n' or '\r' # do not strip line! we need to distinguish last '\n' or '\r'
linepos0 = self.linepos (linepos, line) = self.nextline()
self.line = self.nextline() if not line: break
if not self.line: break
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'line: (%d) %r' % (self.linepos, self.line) print >>stderr, 'line: (%d) %r' % (linepos, line)
# do this before removing comment # do this before removing comment
if self.line.startswith('%%EOF'): break if line.startswith('%%EOF'): break
charpos = 0 charpos = 0
# tokenize # tokenize
while 1: self.go = True
m = self.TOKEN.search(self.line, charpos) while self.go:
m = self.TOKEN.search(line, charpos)
if not m: break if not m: break
t = m.group(0) t = m.group(0)
pos = linepos0 + m.start(0) pos = linepos + m.start(0)
charpos = m.end(0) charpos = m.end(0)
if t == '%': if t == '%':
# skip comment # skip comment
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'comment: %r' % self.line[charpos:] print >>stderr, 'comment: %r' % line[charpos:]
break break
elif t == '/': elif t == '/':
# literal object # literal object
mn = self.LITERAL.match(self.line, m.start(0)+1) mn = self.LITERAL.match(line, m.start(0)+1)
lit = PSLiteralTable.intern(mn.group(0)) lit = PSLiteralTable.intern(mn.group(0))
yield (pos, lit) yield (pos, lit)
charpos = mn.end(0) charpos = mn.end(0)
@ -209,30 +228,30 @@ class PSBaseParser:
# normal string object # normal string object
s = '' s = ''
while 1: while 1:
ms = self.STRING_NORM.match(self.line, charpos) ms = self.STRING_NORM.match(line, charpos)
if not ms: break if not ms: break
s1 = ms.group(0) s1 = ms.group(0)
charpos = ms.end(0) charpos = ms.end(0)
if len(s1) == 1 and s1[-1] == '\\': if len(s1) == 1 and s1[-1] == '\\':
s += s1[-1:] s += s1[-1:]
self.line = self.nextline() (linepos, line) = self.nextline()
if not self.line: if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' % raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(self.linepos, self.line)) (linepos, line))
charpos = 0 charpos = 0
elif charpos == len(self.line): elif charpos == len(line):
s += s1 s += s1
self.line = self.nextline() (linepos, line) = self.nextline()
if not self.line: if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' % raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(self.linepos, self.line)) (linepos, line))
charpos = 0 charpos = 0
else: else:
s += s1 s += s1
break break
if self.line[charpos] != ')': if line[charpos] != ')':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' % raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(self.linepos, self.line)) (linepos, line))
charpos += 1 charpos += 1
def convesc(m): def convesc(m):
x = m.group(0) x = m.group(0)
@ -247,11 +266,11 @@ class PSBaseParser:
elif t == '<': elif t == '<':
# hex string object # hex string object
ms = self.STRING_HEX.match(self.line, charpos) ms = self.STRING_HEX.match(line, charpos)
charpos = ms.end(0) charpos = ms.end(0)
if self.line[charpos] != '>': if line[charpos] != '>':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' % raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(self.linepos, self.line)) (linepos, line))
charpos += 1 charpos += 1
def convhex(m1): def convhex(m1):
return chr(int(m1.group(0), 16)) return chr(int(m1.group(0), 16))
@ -270,7 +289,7 @@ class PSBaseParser:
print >>stderr, 'number: %r' % n print >>stderr, 'number: %r' % n
yield (pos, n) yield (pos, n)
elif t in ('true','false'): elif t in ('true', 'false'):
# boolean # boolean
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'boolean: %r' % t print >>stderr, 'boolean: %r' % t