tidy up a bit
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@114 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
290eccbcc7
commit
fc453e2061
8
Makefile
8
Makefile
|
@ -19,12 +19,12 @@ install:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm -rf build
|
-rm -rf build
|
||||||
-cd $(PACKAGE) && make clean
|
-cd $(PACKAGE) && $(MAKE) clean
|
||||||
-cd tools && make clean
|
-cd tools && $(MAKE) clean
|
||||||
-cd samples && make clean
|
-cd samples && $(MAKE) clean
|
||||||
|
|
||||||
test:
|
test:
|
||||||
cd samples && make test
|
cd samples && $(MAKE) test
|
||||||
|
|
||||||
# Maintenance:
|
# Maintenance:
|
||||||
commit: clean
|
commit: clean
|
||||||
|
|
10
README.html
10
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun May 17 23:10:42 JST 2009
|
Last Modified: Sat May 23 10:06:04 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -146,7 +146,7 @@ PDFMiner comes with two handy tools:
|
||||||
<p>
|
<p>
|
||||||
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
||||||
It extracts all the texts that are to be rendered programmatically,
|
It extracts all the texts that are to be rendered programmatically,
|
||||||
i.e. it cannot extract texts drawn as images that require optical character recognition.
|
It cannot recognize texts drawn as images that would require optical character recognition.
|
||||||
It also extracts the corresponding locations, font names, font sizes, writing
|
It also extracts the corresponding locations, font names, font sizes, writing
|
||||||
direction (horizontal or vertical) for each text portion.
|
direction (horizontal or vertical) for each text portion.
|
||||||
You need to provide a password for protected PDF documents when their access is restricted.
|
You need to provide a password for protected PDF documents when their access is restricted.
|
||||||
|
@ -243,9 +243,9 @@ Note that page numbers start from one.
|
||||||
Because the contents of stream objects can be very large,
|
Because the contents of stream objects can be very large,
|
||||||
they are omitted when none of the options above is specified.
|
they are omitted when none of the options above is specified.
|
||||||
<p>
|
<p>
|
||||||
With <code>-r</code> option, all the stream contents are dumped without decoding.
|
With <code>-r</code> option, the "raw" stream contents are dumped without decompression.
|
||||||
With <code>-b</code> option, the contents are dumped as a binary blob.
|
With <code>-b</code> option, the decompressed contents are dumped as a binary blob.
|
||||||
With <code>-t</code> option, the contents are dumped in a text format,
|
With <code>-t</code> option, the decompressed contents are dumped in a text format,
|
||||||
similar to <code>repr()</code> manner. When
|
similar to <code>repr()</code> manner. When
|
||||||
<code>-r</code> or <code>-b</code> option is given,
|
<code>-r</code> or <code>-b</code> option is given,
|
||||||
no stream header is displayed for the ease of saving it to a file.
|
no stream header is displayed for the ease of saving it to a file.
|
||||||
|
|
|
@ -303,6 +303,10 @@ class TextConverter(PDFConverter):
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write(self, text):
|
||||||
|
self.outfp.write(text.encode(self.codec, 'ignore'))
|
||||||
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTText):
|
||||||
|
|
|
@ -18,16 +18,24 @@ class PSValueError(PSException): pass
|
||||||
## Basic PostScript Types
|
## Basic PostScript Types
|
||||||
##
|
##
|
||||||
|
|
||||||
# PSLiteral
|
## PSObject
|
||||||
|
##
|
||||||
|
## Base class for all PS or PDF-related data types.
|
||||||
|
##
|
||||||
class PSObject(object): pass
|
class PSObject(object): pass
|
||||||
|
|
||||||
class PSLiteral(PSObject):
|
|
||||||
|
|
||||||
'''
|
## PSLiteral
|
||||||
PS literals (e.g. "/Name").
|
##
|
||||||
Caution: Never create these objects directly.
|
## Postscript literals are used as identifiers, such as
|
||||||
Use PSLiteralTable.intern() instead.
|
## variable names, property names and dictionary keys.
|
||||||
'''
|
## Literals are case sensitive and denoted by a preceding
|
||||||
|
## slash sign (e.g. "/Name")
|
||||||
|
##
|
||||||
|
## Note: Never create an instance of PSLiteral by hand.
|
||||||
|
## Always use PSLiteralTable.intern().
|
||||||
|
##
|
||||||
|
class PSLiteral(PSObject):
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -36,14 +44,13 @@ class PSLiteral(PSObject):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '/%s' % self.name
|
return '/%s' % self.name
|
||||||
|
|
||||||
# PSKeyword
|
|
||||||
class PSKeyword(PSObject):
|
|
||||||
|
|
||||||
'''
|
## PSKeyword
|
||||||
PS keywords (e.g. "showpage").
|
##
|
||||||
Caution: Never create these objects directly.
|
## Note: Never create an instance of PSKeyword by hand.
|
||||||
Use PSKeywordTable.intern() instead.
|
## Always use PSKeywordTable.intern().
|
||||||
'''
|
##
|
||||||
|
class PSKeyword(PSObject):
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -52,23 +59,27 @@ class PSKeyword(PSObject):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
# PSSymbolTable
|
|
||||||
|
## PSSymbolTable
|
||||||
|
##
|
||||||
|
## A dictionary-like object that is used for
|
||||||
|
## storing PSLiteral/PSKeyword objects so that
|
||||||
|
## an object that has the same name can never be defined
|
||||||
|
## twice and it is always assured that the same name is
|
||||||
|
## referred to as the same PSLiteral/PSKeyword object.
|
||||||
|
##
|
||||||
class PSSymbolTable(object):
|
class PSSymbolTable(object):
|
||||||
|
|
||||||
'''
|
def __init__(self, klass):
|
||||||
Symbol table that stores PSLiteral or PSKeyword.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def __init__(self, classe):
|
|
||||||
self.dic = {}
|
self.dic = {}
|
||||||
self.classe = classe
|
self.klass = klass
|
||||||
return
|
return
|
||||||
|
|
||||||
def intern(self, name):
|
def intern(self, name):
|
||||||
if name in self.dic:
|
if name in self.dic:
|
||||||
lit = self.dic[name]
|
lit = self.dic[name]
|
||||||
else:
|
else:
|
||||||
lit = self.classe(name)
|
lit = self.klass(name)
|
||||||
self.dic[name] = lit
|
self.dic[name] = lit
|
||||||
return lit
|
return lit
|
||||||
|
|
||||||
|
@ -118,7 +129,7 @@ ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
||||||
class PSBaseParser(object):
|
class PSBaseParser(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Most basic PostScript parser that performs only basic tokenization.
|
Most basic PostScript parser that performs only tokenization.
|
||||||
'''
|
'''
|
||||||
BUFSIZ = 4096
|
BUFSIZ = 4096
|
||||||
|
|
||||||
|
@ -163,8 +174,10 @@ class PSBaseParser(object):
|
||||||
self.buf = ''
|
self.buf = ''
|
||||||
self.charpos = 0
|
self.charpos = 0
|
||||||
# reset the status for nexttoken()
|
# reset the status for nexttoken()
|
||||||
self.parse1 = self.parse_main
|
self._parse1 = self._parse_main
|
||||||
self.tokens = []
|
self._curtoken = ''
|
||||||
|
self._curtokenpos = 0
|
||||||
|
self._tokens = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillbuf(self):
|
def fillbuf(self):
|
||||||
|
@ -177,192 +190,6 @@ class PSBaseParser(object):
|
||||||
self.charpos = 0
|
self.charpos = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
def parse_main(self, s, i):
|
|
||||||
m = NONSPC.search(s, i)
|
|
||||||
if not m:
|
|
||||||
return (self.parse_main, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
c = s[j]
|
|
||||||
self.tokenstart = self.bufpos+j
|
|
||||||
if c == '%':
|
|
||||||
self.token = '%'
|
|
||||||
return (self.parse_comment, j+1)
|
|
||||||
if c == '/':
|
|
||||||
self.token = ''
|
|
||||||
return (self.parse_literal, j+1)
|
|
||||||
if c in '-+' or c.isdigit():
|
|
||||||
self.token = c
|
|
||||||
return (self.parse_number, j+1)
|
|
||||||
if c == '.':
|
|
||||||
self.token = c
|
|
||||||
return (self.parse_float, j+1)
|
|
||||||
if c.isalpha():
|
|
||||||
self.token = c
|
|
||||||
return (self.parse_keyword, j+1)
|
|
||||||
if c == '(':
|
|
||||||
self.token = ''
|
|
||||||
self.paren = 1
|
|
||||||
return (self.parse_string, j+1)
|
|
||||||
if c == '<':
|
|
||||||
self.token = ''
|
|
||||||
return (self.parse_wopen, j+1)
|
|
||||||
if c == '>':
|
|
||||||
self.token = ''
|
|
||||||
return (self.parse_wclose, j+1)
|
|
||||||
self.add_token(KWD(c))
|
|
||||||
return (self.parse_main, j+1)
|
|
||||||
|
|
||||||
def add_token(self, obj):
|
|
||||||
self.tokens.append((self.tokenstart, obj))
|
|
||||||
return
|
|
||||||
|
|
||||||
def parse_comment(self, s, i):
|
|
||||||
m = EOL.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_comment, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
# We ignore comments.
|
|
||||||
#self.tokens.append(self.token)
|
|
||||||
return (self.parse_main, j)
|
|
||||||
|
|
||||||
def parse_literal(self, s, i):
|
|
||||||
m = END_LITERAL.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_literal, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
c = s[j]
|
|
||||||
if c == '#':
|
|
||||||
self.hex = ''
|
|
||||||
return (self.parse_literal_hex, j+1)
|
|
||||||
self.add_token(LIT(self.token))
|
|
||||||
return (self.parse_main, j)
|
|
||||||
|
|
||||||
def parse_literal_hex(self, s, i):
|
|
||||||
c = s[i]
|
|
||||||
if HEX.match(c) and len(self.hex) < 2:
|
|
||||||
self.hex += c
|
|
||||||
return (self.parse_literal_hex, i+1)
|
|
||||||
if self.hex:
|
|
||||||
self.token += chr(int(self.hex, 16))
|
|
||||||
return (self.parse_literal, i)
|
|
||||||
|
|
||||||
def parse_number(self, s, i):
|
|
||||||
m = END_NUMBER.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_number, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
c = s[j]
|
|
||||||
if c == '.':
|
|
||||||
self.token += c
|
|
||||||
return (self.parse_float, j+1)
|
|
||||||
try:
|
|
||||||
self.add_token(int(self.token))
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return (self.parse_main, j)
|
|
||||||
def parse_float(self, s, i):
|
|
||||||
m = END_NUMBER.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_float, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
self.add_token(float(self.token))
|
|
||||||
return (self.parse_main, j)
|
|
||||||
|
|
||||||
def parse_keyword(self, s, i):
|
|
||||||
m = END_KEYWORD.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_keyword, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
if self.token == 'true':
|
|
||||||
token = True
|
|
||||||
elif self.token == 'false':
|
|
||||||
token = False
|
|
||||||
else:
|
|
||||||
token = KWD(self.token)
|
|
||||||
self.add_token(token)
|
|
||||||
return (self.parse_main, j)
|
|
||||||
|
|
||||||
def parse_string(self, s, i):
|
|
||||||
m = END_STRING.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_string, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
c = s[j]
|
|
||||||
if c == '\\':
|
|
||||||
self.oct = ''
|
|
||||||
return (self.parse_string_1, j+1)
|
|
||||||
if c == '(':
|
|
||||||
self.paren += 1
|
|
||||||
self.token += c
|
|
||||||
return (self.parse_string, j+1)
|
|
||||||
if c == ')':
|
|
||||||
self.paren -= 1
|
|
||||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
|
||||||
self.token += c
|
|
||||||
return (self.parse_string, j+1)
|
|
||||||
self.add_token(self.token)
|
|
||||||
return (self.parse_main, j+1)
|
|
||||||
def parse_string_1(self, s, i):
|
|
||||||
c = s[i]
|
|
||||||
if OCT_STRING.match(c) and len(self.oct) < 3:
|
|
||||||
self.oct += c
|
|
||||||
return (self.parse_string_1, i+1)
|
|
||||||
if self.oct:
|
|
||||||
self.token += chr(int(self.oct, 8))
|
|
||||||
return (self.parse_string, i)
|
|
||||||
if c in ESC_STRING:
|
|
||||||
self.token += chr(ESC_STRING[c])
|
|
||||||
return (self.parse_string, i+1)
|
|
||||||
|
|
||||||
def parse_wopen(self, s, i):
|
|
||||||
c = s[i]
|
|
||||||
if c.isspace() or HEX.match(c):
|
|
||||||
return (self.parse_hexstring, i)
|
|
||||||
if c == '<':
|
|
||||||
self.add_token(KEYWORD_DICT_BEGIN)
|
|
||||||
i += 1
|
|
||||||
return (self.parse_main, i)
|
|
||||||
|
|
||||||
def parse_wclose(self, s, i):
|
|
||||||
c = s[i]
|
|
||||||
if c == '>':
|
|
||||||
self.add_token(KEYWORD_DICT_END)
|
|
||||||
i += 1
|
|
||||||
return (self.parse_main, i)
|
|
||||||
|
|
||||||
def parse_hexstring(self, s, i):
|
|
||||||
m = END_HEX_STRING.search(s, i)
|
|
||||||
if not m:
|
|
||||||
self.token += s[i:]
|
|
||||||
return (self.parse_hexstring, len(s))
|
|
||||||
j = m.start(0)
|
|
||||||
self.token += s[i:j]
|
|
||||||
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
|
|
||||||
SPC.sub('', self.token))
|
|
||||||
self.add_token(token)
|
|
||||||
return (self.parse_main, j)
|
|
||||||
|
|
||||||
def nexttoken(self):
|
|
||||||
while not self.tokens:
|
|
||||||
self.fillbuf()
|
|
||||||
(self.parse1, self.charpos) = self.parse1(self.buf, self.charpos)
|
|
||||||
token = self.tokens.pop(0)
|
|
||||||
if 2 <= self.debug:
|
|
||||||
print >>stderr, 'nexttoken: %r' % (token,)
|
|
||||||
return token
|
|
||||||
|
|
||||||
def nextline(self):
|
def nextline(self):
|
||||||
'''
|
'''
|
||||||
Fetches a next line that ends either with \\r or \\n.
|
Fetches a next line that ends either with \\r or \\n.
|
||||||
|
@ -417,6 +244,195 @@ class PSBaseParser(object):
|
||||||
buf = ''
|
buf = ''
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _parse_main(self, s, i):
|
||||||
|
m = NONSPC.search(s, i)
|
||||||
|
if not m:
|
||||||
|
return (self._parse_main, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
c = s[j]
|
||||||
|
self._curtokenpos = self.bufpos+j
|
||||||
|
if c == '%':
|
||||||
|
self._curtoken = '%'
|
||||||
|
return (self._parse_comment, j+1)
|
||||||
|
elif c == '/':
|
||||||
|
self._curtoken = ''
|
||||||
|
return (self._parse_literal, j+1)
|
||||||
|
elif c in '-+' or c.isdigit():
|
||||||
|
self._curtoken = c
|
||||||
|
return (self._parse_number, j+1)
|
||||||
|
elif c == '.':
|
||||||
|
self._curtoken = c
|
||||||
|
return (self._parse_float, j+1)
|
||||||
|
elif c.isalpha():
|
||||||
|
self._curtoken = c
|
||||||
|
return (self._parse_keyword, j+1)
|
||||||
|
elif c == '(':
|
||||||
|
self._curtoken = ''
|
||||||
|
self.paren = 1
|
||||||
|
return (self._parse_string, j+1)
|
||||||
|
elif c == '<':
|
||||||
|
self._curtoken = ''
|
||||||
|
return (self._parse_wopen, j+1)
|
||||||
|
elif c == '>':
|
||||||
|
self._curtoken = ''
|
||||||
|
return (self._parse_wclose, j+1)
|
||||||
|
else:
|
||||||
|
self._add_token(KWD(c))
|
||||||
|
return (self._parse_main, j+1)
|
||||||
|
|
||||||
|
def _add_token(self, obj):
|
||||||
|
self._tokens.append((self._curtokenpos, obj))
|
||||||
|
return
|
||||||
|
|
||||||
|
def _parse_comment(self, s, i):
|
||||||
|
m = EOL.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_comment, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
# We ignore comments.
|
||||||
|
#self._tokens.append(self._curtoken)
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def _parse_literal(self, s, i):
|
||||||
|
m = END_LITERAL.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_literal, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '#':
|
||||||
|
self.hex = ''
|
||||||
|
return (self._parse_literal_hex, j+1)
|
||||||
|
self._add_token(LIT(self._curtoken))
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def _parse_literal_hex(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if HEX.match(c) and len(self.hex) < 2:
|
||||||
|
self.hex += c
|
||||||
|
return (self._parse_literal_hex, i+1)
|
||||||
|
if self.hex:
|
||||||
|
self._curtoken += chr(int(self.hex, 16))
|
||||||
|
return (self._parse_literal, i)
|
||||||
|
|
||||||
|
def _parse_number(self, s, i):
|
||||||
|
m = END_NUMBER.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_number, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '.':
|
||||||
|
self._curtoken += c
|
||||||
|
return (self._parse_float, j+1)
|
||||||
|
try:
|
||||||
|
self._add_token(int(self._curtoken))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def _parse_float(self, s, i):
|
||||||
|
m = END_NUMBER.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_float, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
self._add_token(float(self._curtoken))
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def _parse_keyword(self, s, i):
|
||||||
|
m = END_KEYWORD.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_keyword, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
if self._curtoken == 'true':
|
||||||
|
token = True
|
||||||
|
elif self._curtoken == 'false':
|
||||||
|
token = False
|
||||||
|
else:
|
||||||
|
token = KWD(self._curtoken)
|
||||||
|
self._add_token(token)
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def _parse_string(self, s, i):
|
||||||
|
m = END_STRING.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_string, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
c = s[j]
|
||||||
|
if c == '\\':
|
||||||
|
self.oct = ''
|
||||||
|
return (self._parse_string_1, j+1)
|
||||||
|
if c == '(':
|
||||||
|
self.paren += 1
|
||||||
|
self._curtoken += c
|
||||||
|
return (self._parse_string, j+1)
|
||||||
|
if c == ')':
|
||||||
|
self.paren -= 1
|
||||||
|
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||||
|
self._curtoken += c
|
||||||
|
return (self._parse_string, j+1)
|
||||||
|
self._add_token(self._curtoken)
|
||||||
|
return (self._parse_main, j+1)
|
||||||
|
|
||||||
|
def _parse_string_1(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||||
|
self.oct += c
|
||||||
|
return (self._parse_string_1, i+1)
|
||||||
|
if self.oct:
|
||||||
|
self._curtoken += chr(int(self.oct, 8))
|
||||||
|
return (self._parse_string, i)
|
||||||
|
if c in ESC_STRING:
|
||||||
|
self._curtoken += chr(ESC_STRING[c])
|
||||||
|
return (self._parse_string, i+1)
|
||||||
|
|
||||||
|
def _parse_wopen(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if c.isspace() or HEX.match(c):
|
||||||
|
return (self._parse_hexstring, i)
|
||||||
|
if c == '<':
|
||||||
|
self._add_token(KEYWORD_DICT_BEGIN)
|
||||||
|
i += 1
|
||||||
|
return (self._parse_main, i)
|
||||||
|
|
||||||
|
def _parse_wclose(self, s, i):
|
||||||
|
c = s[i]
|
||||||
|
if c == '>':
|
||||||
|
self._add_token(KEYWORD_DICT_END)
|
||||||
|
i += 1
|
||||||
|
return (self._parse_main, i)
|
||||||
|
|
||||||
|
def _parse_hexstring(self, s, i):
|
||||||
|
m = END_HEX_STRING.search(s, i)
|
||||||
|
if not m:
|
||||||
|
self._curtoken += s[i:]
|
||||||
|
return (self._parse_hexstring, len(s))
|
||||||
|
j = m.start(0)
|
||||||
|
self._curtoken += s[i:j]
|
||||||
|
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
|
||||||
|
SPC.sub('', self._curtoken))
|
||||||
|
self._add_token(token)
|
||||||
|
return (self._parse_main, j)
|
||||||
|
|
||||||
|
def nexttoken(self):
|
||||||
|
while not self._tokens:
|
||||||
|
self.fillbuf()
|
||||||
|
(self._parse1, self.charpos) = self._parse1(self.buf, self.charpos)
|
||||||
|
token = self._tokens.pop(0)
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'nexttoken: %r' % (token,)
|
||||||
|
return token
|
||||||
|
|
||||||
|
|
||||||
## PSStackParser
|
## PSStackParser
|
||||||
##
|
##
|
||||||
|
|
Loading…
Reference in New Issue