yum-yum!
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@9 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
a8bae61c25
commit
196ece7913
2
Makefile
2
Makefile
|
@ -6,7 +6,7 @@ TAR=tar
|
|||
SVN=svn
|
||||
PYTHON=python
|
||||
|
||||
WORKDIR=..
|
||||
WORKDIR=/tmp
|
||||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
||||
DISTFILE=$(DISTNAME).tar.gz
|
||||
|
||||
|
|
11
README.html
11
README.html
|
@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
|
|||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
|
||||
</a>
|
||||
(220kbytes)
|
||||
|
||||
<P>
|
||||
<strong>Svn repository:</strong><br>
|
||||
|
@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
|
|||
<p>
|
||||
<strong>Dump the contents:</strong>
|
||||
<blockquote><pre>
|
||||
$ ./dumppdf.py foo.pdf
|
||||
$ ./dumppdf.py -a foo.pdf
|
||||
</pre></blockquote>
|
||||
|
||||
<p>
|
||||
|
@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf
|
|||
$ ./pdf2txt.py -c euc-jp samples/jo.pdf
|
||||
</pre></blockquote>
|
||||
|
||||
<hr>
|
||||
<h2>Similar Projects</h2>
|
||||
<ul>
|
||||
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
||||
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
|
||||
</ul>
|
||||
|
||||
|
||||
<hr>
|
||||
<h2>Terms and conditions</h2>
|
||||
<p>
|
||||
|
|
|
@ -83,8 +83,8 @@ def dumptrailers(out, doc):
|
|||
out.write('\n</trailer>\n\n')
|
||||
return
|
||||
|
||||
# dumpall
|
||||
def dumpall(out, doc):
|
||||
# dumpallobjs
|
||||
def dumpallobjs(out, doc):
|
||||
out.write('<pdf>')
|
||||
for xref in doc.xrefs:
|
||||
for objid in xrange(xref.objid0, xref.objid1+1):
|
||||
|
@ -93,7 +93,7 @@ def dumpall(out, doc):
|
|||
out.write('<object id="%d">\n' % objid)
|
||||
dumpxml(out, obj)
|
||||
out.write('\n</object>\n\n')
|
||||
except PDFValueError:
|
||||
except:
|
||||
pass
|
||||
dumptrailers(out, doc)
|
||||
out.write('</pdf>')
|
||||
|
@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids,
|
|||
if page.pageid in pageids:
|
||||
dumpxml(outfp, page.attrs)
|
||||
if dumpall:
|
||||
dumpall(outfp, doc)
|
||||
dumpallobjs(outfp, doc)
|
||||
if (not objids) and (not pageids) and (not dumpall):
|
||||
dumptrailers(outfp, doc)
|
||||
fp.close()
|
||||
|
|
|
@ -23,8 +23,9 @@ class TextConverter(PDFDevice):
|
|||
self.outfp.write('\n')
|
||||
return
|
||||
|
||||
def begin_block(self, name):
|
||||
self.outfp.write('<block name="%s">\n' % name)
|
||||
def begin_block(self, name, (x0,y0,x1,y1)):
|
||||
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||
(name,x0,y0,x1,y1))
|
||||
return
|
||||
def end_block(self):
|
||||
self.outfp.write('</block>\n')
|
||||
|
@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
|
||||
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:c:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
|
61
pdfinterp.py
61
pdfinterp.py
|
@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
|||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||
|
||||
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||
'''Applies a matrix to a coordination.'''
|
||||
'''Applies a matrix to coordinates.'''
|
||||
return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def cs_params(cs):
|
||||
'''Returns a number of components for a given colorspace.'''
|
||||
t = cs[0]
|
||||
if t == LITERAL_ICC_BASED:
|
||||
return stream_value(cs[1]).dic['N']
|
||||
elif t == LITERAL_DEVICE_N:
|
||||
return len(list_value(cs[1]))
|
||||
else:
|
||||
return CS_COMPONENTS[t]
|
||||
return CS_COMPONENTS.get(t, 0)
|
||||
|
||||
|
||||
## Fonts
|
||||
|
@ -438,7 +439,7 @@ class PDFDevice:
|
|||
self.ctm = ctm
|
||||
return
|
||||
|
||||
def begin_block(self, name):
|
||||
def begin_block(self, name, bbox):
|
||||
return
|
||||
def end_block(self):
|
||||
return
|
||||
|
@ -589,11 +590,11 @@ class PDFPageInterpreter:
|
|||
|
||||
# setcolorspace-stroking
|
||||
def do_CS(self, name):
|
||||
self.scs = self.csmap.get(literal_name(name), None)
|
||||
self.scs = self.csmap.get(literal_name(name), [name])
|
||||
return
|
||||
# setcolorspace-non-strokine
|
||||
def do_cs(self, name):
|
||||
self.ncs = self.csmap.get(literal_name(name), None)
|
||||
self.ncs = self.csmap.get(literal_name(name), [name])
|
||||
return
|
||||
# setgray-stroking
|
||||
def do_G(self, gray):
|
||||
|
@ -770,34 +771,46 @@ class PDFPageInterpreter:
|
|||
if 1 <= self.debug:
|
||||
print >>stderr, 'Processing xobj: %r' % xobj
|
||||
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
||||
interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
|
||||
xobj.dic.get('Matrix', MATRIX_IDENTITY))
|
||||
(x0,y0,x1,y1) = xobj.dic['BBox']
|
||||
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
||||
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||
interpreter.render_contents(xobjid,
|
||||
(x0,y0,x1,y1),
|
||||
xobj.dic.get('Resources'),
|
||||
[xobj],
|
||||
ctm=ctm)
|
||||
return
|
||||
|
||||
def process_page(self, page):
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Processing page: %r' % page
|
||||
self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
|
||||
self.render_contents('page-%d' % page.pageid,
|
||||
page.mediabox,
|
||||
page.resources,
|
||||
page.contents)
|
||||
return
|
||||
|
||||
def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
|
||||
def render_contents(self, contid, mediabox, resources, contents,
|
||||
ctm=MATRIX_IDENTITY):
|
||||
self.initpage(ctm)
|
||||
self.device.begin_block(contid)
|
||||
self.device.begin_block(contid, mediabox)
|
||||
# Handle resource declarations.
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
if k == 'Font':
|
||||
for (fontid,fontrsrc) in dict_value(v).iteritems():
|
||||
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,csspec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = list_value(csspec)
|
||||
elif k == 'ProcSet':
|
||||
self.rsrc.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
if resources:
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||
if k == 'Font':
|
||||
for (fontid,fontrsrc) in dict_value(v).iteritems():
|
||||
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,csspec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = list_value(csspec)
|
||||
elif k == 'ProcSet':
|
||||
self.rsrc.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
for stream in list_value(contents):
|
||||
self.execute(stream_value(stream))
|
||||
self.device.end_block()
|
||||
|
|
60
pdfparser.py
60
pdfparser.py
|
@ -18,8 +18,8 @@ import sys, re
|
|||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
PSLiteral, PSKeyword, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, \
|
||||
PSStackParser
|
||||
|
||||
|
||||
|
@ -76,8 +76,7 @@ def resolveall(x):
|
|||
'''
|
||||
Recursively resolve X and all the internals.
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
This procedure might be slow. Do not used it unless
|
||||
you really need it.
|
||||
This procedure might be slow.
|
||||
'''
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
|
@ -209,13 +208,12 @@ class PDFStream:
|
|||
##
|
||||
class PDFPage:
|
||||
|
||||
def __init__(self, doc, pageidx, attrs, parent_attrs):
|
||||
def __init__(self, doc, pageidx, attrs):
|
||||
self.doc = doc
|
||||
self.pageid = pageidx
|
||||
self.attrs = dict_value(attrs)
|
||||
self.parent_attrs = parent_attrs
|
||||
self.resources = self.get_attr('Resources')
|
||||
self.mediabox = self.get_attr('MediaBox')
|
||||
self.resources = resolve1(self.attrs['Resources'])
|
||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
||||
contents = resolve1(self.attrs['Contents'])
|
||||
if not isinstance(contents, list):
|
||||
contents = [ contents ]
|
||||
|
@ -225,11 +223,6 @@ class PDFPage:
|
|||
def __repr__(self):
|
||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
def get_attr(self, k):
|
||||
if k in self.attrs:
|
||||
return resolve1(self.attrs[k])
|
||||
return self.parent_attrs.get(k)
|
||||
|
||||
|
||||
## XRefs
|
||||
|
||||
|
@ -239,7 +232,7 @@ class PDFXRef:
|
|||
|
||||
def __init__(self, parser):
|
||||
while 1:
|
||||
line = parser.nextline()
|
||||
(_, line) = parser.nextline()
|
||||
if not line:
|
||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||
line = line.strip()
|
||||
|
@ -253,7 +246,7 @@ class PDFXRef:
|
|||
self.objid1 = start+nobjs
|
||||
self.offsets = []
|
||||
for objid in xrange(start, start+nobjs):
|
||||
line = parser.nextline()
|
||||
(_, line) = parser.nextline()
|
||||
f = line.strip().split(' ')
|
||||
if len(f) != 3:
|
||||
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
||||
|
@ -361,13 +354,12 @@ class PDFDocument:
|
|||
self.parsed_objs[stream] = objs
|
||||
obj = objs[stream.dic['N']*2+index]
|
||||
else:
|
||||
pos0 = self.parser.linepos
|
||||
self.parser.seek(index)
|
||||
prevpos = self.parser.seek(index)
|
||||
seq = list_value(self.parser.parse())
|
||||
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
||||
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
||||
obj = seq[3]
|
||||
self.parser.seek(pos0)
|
||||
self.parser.seek(prevpos)
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||
self.objs[objid] = obj
|
||||
|
@ -376,7 +368,10 @@ class PDFDocument:
|
|||
def get_pages(self, debug=0):
|
||||
assert self.xrefs
|
||||
def search(obj, parent):
|
||||
tree = dict_value(obj)
|
||||
tree = dict_value(obj).copy()
|
||||
for (k,v) in parent.iteritems():
|
||||
if k not in tree:
|
||||
tree[k] = v
|
||||
if tree['Type'] == LITERAL_PAGES:
|
||||
if 1 <= debug:
|
||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||
|
@ -386,9 +381,9 @@ class PDFDocument:
|
|||
elif tree['Type'] == LITERAL_PAGE:
|
||||
if 1 <= debug:
|
||||
print >>stderr, 'Page: %r' % tree
|
||||
yield (tree, parent)
|
||||
for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
||||
yield PDFPage(self, i, tree, parent)
|
||||
yield tree
|
||||
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
||||
yield PDFPage(self, i, tree)
|
||||
return
|
||||
|
||||
def set_root(self, root):
|
||||
|
@ -440,19 +435,19 @@ class PDFParser(PSStackParser):
|
|||
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||
objlen = int_value(dic['Length'])
|
||||
self.seek(pos)
|
||||
line = self.nextline() # 'stream'
|
||||
(_, line) = self.nextline() # 'stream'
|
||||
self.fp.seek(pos+len(line))
|
||||
data = self.fp.read(objlen)
|
||||
self.seek(pos+len(line)+objlen)
|
||||
while 1:
|
||||
line = self.nextline()
|
||||
(linepos, line) = self.nextline()
|
||||
if not line:
|
||||
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
|
||||
(self.linepos, line))
|
||||
(linepos, line))
|
||||
if line.strip():
|
||||
if not line.startswith('endstream'):
|
||||
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
|
||||
(self.linepos, line))
|
||||
(linepos, line))
|
||||
break
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||
|
@ -510,17 +505,16 @@ class PDFParser(PSStackParser):
|
|||
self.find_xref()
|
||||
while 1:
|
||||
# read xref table
|
||||
pos0 = self.linepos
|
||||
line = self.nextline()
|
||||
(linepos, line) = self.nextline()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'line: %r' % line
|
||||
if line[0].isdigit():
|
||||
# XRefStream: PDF-1.5
|
||||
self.seek(pos0)
|
||||
self.seek(linepos)
|
||||
xref = PDFXRefStream(self)
|
||||
elif line.strip() != 'xref':
|
||||
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
||||
(self.linepos, line))
|
||||
(linepos, line))
|
||||
else:
|
||||
xref = PDFXRef(self)
|
||||
yield xref
|
||||
|
@ -531,10 +525,10 @@ class PDFParser(PSStackParser):
|
|||
self.seek(int_value(trailer['XRefStm']))
|
||||
if 'Prev' in trailer:
|
||||
# find previous xref
|
||||
pos0 = int_value(trailer['Prev'])
|
||||
self.seek(pos0)
|
||||
pos = int_value(trailer['Prev'])
|
||||
self.seek(pos)
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'prev trailer: pos=%d' % pos0
|
||||
print >>stderr, 'prev trailer: pos=%d' % pos
|
||||
else:
|
||||
break
|
||||
return
|
||||
|
|
95
psparser.py
95
psparser.py
|
@ -12,36 +12,48 @@ class PSTypeError(PSException): pass
|
|||
class PSValueError(PSException): pass
|
||||
|
||||
|
||||
## PostScript Types
|
||||
## Basic PostScript Types
|
||||
##
|
||||
|
||||
# PSLiteral
|
||||
class PSLiteral:
|
||||
|
||||
'''
|
||||
PS literals (e.g. "/Name").
|
||||
Caution: Never create these objects directly.
|
||||
Use PSLiteralTable.intern() instead.
|
||||
'''
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '/%s' % self.name
|
||||
|
||||
# PSKeyword
|
||||
class PSKeyword:
|
||||
|
||||
'''
|
||||
PS keywords (e.g. "showpage").
|
||||
Caution: Never create these objects directly.
|
||||
Use PSKeywordTable.intern() instead.
|
||||
'''
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return self.name
|
||||
|
||||
# PSSymbolTable
|
||||
class PSSymbolTable:
|
||||
|
||||
'''
|
||||
Symbol table that stores PSLiteral or PSKeyword.
|
||||
'''
|
||||
|
||||
def __init__(self, classe):
|
||||
self.dic = {}
|
||||
self.classe = classe
|
||||
|
@ -74,7 +86,9 @@ def keyword_name(x):
|
|||
##
|
||||
class PSBaseParser:
|
||||
|
||||
'''PostScript parser that performs only basic tokenization.'''
|
||||
'''
|
||||
Most basic PostScript parser that performs only basic tokenization.
|
||||
'''
|
||||
|
||||
def __init__(self, fp, debug=0):
|
||||
self.fp = fp
|
||||
|
@ -88,21 +102,22 @@ class PSBaseParser:
|
|||
|
||||
def seek(self, pos):
|
||||
'''
|
||||
seeks to the given pos.
|
||||
Seeks the parser to the given position.
|
||||
'''
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'seek:', pos
|
||||
prevpos = self.fp.tell()
|
||||
self.fp.seek(pos)
|
||||
self.linepos = pos
|
||||
self.linebuf = None
|
||||
self.curpos = 0
|
||||
self.line = ''
|
||||
return
|
||||
self.linebuf = None # line buffer.
|
||||
self.curpos = 0 # current position in the buffer.
|
||||
self.linepos = pos # the beginning of the current line.
|
||||
self.go = False
|
||||
return prevpos
|
||||
|
||||
EOLCHAR = re.compile(r'[\r\n]')
|
||||
def nextline(self):
|
||||
'''
|
||||
fetches the next line that ends either with \\r or \\n.
|
||||
Fetches a next line that ends either with \\r or \\n.
|
||||
'''
|
||||
line = ''
|
||||
eol = None
|
||||
|
@ -131,12 +146,14 @@ class PSBaseParser:
|
|||
# fetch further
|
||||
line += self.linebuf[self.curpos:]
|
||||
self.linebuf = None
|
||||
linepos = self.linepos
|
||||
self.linepos += len(line)
|
||||
return line
|
||||
return (linepos, line)
|
||||
|
||||
def revreadlines(self):
|
||||
'''
|
||||
fetches lines backword. used to locate trailers.
|
||||
Fetches a next line backword. This is used to locate
|
||||
the trailers at the end of a file.
|
||||
'''
|
||||
self.fp.seek(0, 2)
|
||||
pos = self.fp.tell()
|
||||
|
@ -156,6 +173,7 @@ class PSBaseParser:
|
|||
buf = ''
|
||||
return
|
||||
|
||||
# regex patterns for basic lexical scanning.
|
||||
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
|
||||
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
|
||||
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
|
||||
|
@ -167,38 +185,39 @@ class PSBaseParser:
|
|||
|
||||
def parse(self):
|
||||
'''
|
||||
Yields a list of basic tokens: keywords, literals, strings,
|
||||
numbers and parentheses. Comments are skipped.
|
||||
Nested objects (i.e. arrays and dictionaries) are not handled.
|
||||
Yields a list of tuples (pos, token) of the following:
|
||||
keywords, literals, strings, numbers and parentheses.
|
||||
Comments are skipped.
|
||||
Nested objects (i.e. arrays and dictionaries) are not handled here.
|
||||
'''
|
||||
while 1:
|
||||
# do not strip line! we need to distinguish last '\n' or '\r'
|
||||
linepos0 = self.linepos
|
||||
self.line = self.nextline()
|
||||
if not self.line: break
|
||||
(linepos, line) = self.nextline()
|
||||
if not line: break
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
|
||||
print >>stderr, 'line: (%d) %r' % (linepos, line)
|
||||
# do this before removing comment
|
||||
if self.line.startswith('%%EOF'): break
|
||||
if line.startswith('%%EOF'): break
|
||||
charpos = 0
|
||||
|
||||
# tokenize
|
||||
while 1:
|
||||
m = self.TOKEN.search(self.line, charpos)
|
||||
self.go = True
|
||||
while self.go:
|
||||
m = self.TOKEN.search(line, charpos)
|
||||
if not m: break
|
||||
t = m.group(0)
|
||||
pos = linepos0 + m.start(0)
|
||||
pos = linepos + m.start(0)
|
||||
charpos = m.end(0)
|
||||
|
||||
if t == '%':
|
||||
# skip comment
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'comment: %r' % self.line[charpos:]
|
||||
print >>stderr, 'comment: %r' % line[charpos:]
|
||||
break
|
||||
|
||||
elif t == '/':
|
||||
# literal object
|
||||
mn = self.LITERAL.match(self.line, m.start(0)+1)
|
||||
mn = self.LITERAL.match(line, m.start(0)+1)
|
||||
lit = PSLiteralTable.intern(mn.group(0))
|
||||
yield (pos, lit)
|
||||
charpos = mn.end(0)
|
||||
|
@ -209,30 +228,30 @@ class PSBaseParser:
|
|||
# normal string object
|
||||
s = ''
|
||||
while 1:
|
||||
ms = self.STRING_NORM.match(self.line, charpos)
|
||||
ms = self.STRING_NORM.match(line, charpos)
|
||||
if not ms: break
|
||||
s1 = ms.group(0)
|
||||
charpos = ms.end(0)
|
||||
if len(s1) == 1 and s1[-1] == '\\':
|
||||
s += s1[-1:]
|
||||
self.line = self.nextline()
|
||||
if not self.line:
|
||||
(linepos, line) = self.nextline()
|
||||
if not line:
|
||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||
(self.linepos, self.line))
|
||||
(linepos, line))
|
||||
charpos = 0
|
||||
elif charpos == len(self.line):
|
||||
elif charpos == len(line):
|
||||
s += s1
|
||||
self.line = self.nextline()
|
||||
if not self.line:
|
||||
(linepos, line) = self.nextline()
|
||||
if not line:
|
||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||
(self.linepos, self.line))
|
||||
(linepos, line))
|
||||
charpos = 0
|
||||
else:
|
||||
s += s1
|
||||
break
|
||||
if self.line[charpos] != ')':
|
||||
if line[charpos] != ')':
|
||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||
(self.linepos, self.line))
|
||||
(linepos, line))
|
||||
charpos += 1
|
||||
def convesc(m):
|
||||
x = m.group(0)
|
||||
|
@ -247,11 +266,11 @@ class PSBaseParser:
|
|||
|
||||
elif t == '<':
|
||||
# hex string object
|
||||
ms = self.STRING_HEX.match(self.line, charpos)
|
||||
ms = self.STRING_HEX.match(line, charpos)
|
||||
charpos = ms.end(0)
|
||||
if self.line[charpos] != '>':
|
||||
if line[charpos] != '>':
|
||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||
(self.linepos, self.line))
|
||||
(linepos, line))
|
||||
charpos += 1
|
||||
def convhex(m1):
|
||||
return chr(int(m1.group(0), 16))
|
||||
|
@ -270,7 +289,7 @@ class PSBaseParser:
|
|||
print >>stderr, 'number: %r' % n
|
||||
yield (pos, n)
|
||||
|
||||
elif t in ('true','false'):
|
||||
elif t in ('true', 'false'):
|
||||
# boolean
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'boolean: %r' % t
|
||||
|
|
Loading…
Reference in New Issue