From 14de393d5e8a05a374a4737974755ec1ffd9a677 Mon Sep 17 00:00:00 2001 From: Venelin Stoykov Date: Fri, 18 Aug 2017 09:10:06 +0300 Subject: [PATCH] Cleanup psparser (#83) - Do not use bytesindex function. Use native slices instead - Fix import ordering --- pdfminer/psparser.py | 67 +++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 6ebd583..9b214af 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -8,20 +8,11 @@ import logging import six # Python 2+3 compatibility from . import settings +from .utils import choplist log = logging.getLogger(__name__) -def bytesindex(s,i,j=None): - """implements s[i], s[i:], s[i:j] for Python2 and Python3""" - if i<0 : i=len(s)+i - if j is None: j=i+1 - if j<0 : j=len(s) - return s[i:j] - -from .utils import choplist - - ## PS Exceptions ## class PSException(Exception): @@ -249,7 +240,7 @@ class PSBaseParser(object): while 1: self.fillbuf() if eol: - c = bytesindex(self.buf,self.charpos) + c = self.buf[self.charpos:self.charpos+1] # handle b'\r\n' if c == b'\n': linebuf += c @@ -257,14 +248,14 @@ class PSBaseParser(object): break m = EOL.search(self.buf, self.charpos) if m: - linebuf += bytesindex(self.buf,self.charpos,m.end(0)) + linebuf += self.buf[self.charpos:m.end(0)] self.charpos = m.end(0) - if bytesindex(linebuf,-1) == b'\r': + if linebuf[-1:] == b'\r': eol = True else: break else: - linebuf += bytesindex(self.buf,self.charpos,-1) + linebuf += self.buf[self.charpos:] self.charpos = len(self.buf) log.debug('nextline: %r, %r', linepos, linebuf) @@ -290,8 +281,8 @@ class PSBaseParser(object): if n == -1: buf = s + buf break - yield bytesindex(s,n,-1)+buf - s = bytesindex(s,0,n) + yield s[n:] + buf + s = s[:n] buf = b'' return @@ -300,7 +291,7 @@ class PSBaseParser(object): if not m: return len(s) j = m.start(0) - c = bytesindex(s,j) + c = s[j:j+1] self._curtokenpos = self.bufpos+j if c == b'%': self._curtoken = b'%' @@ -346,10 +337,10 @@ class PSBaseParser(object): def _parse_comment(self, s, i): m = EOL.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return (self._parse_comment, len(s)) j = m.start(0) - self._curtoken += bytesindex(s,i,j) + self._curtoken += s[i:j] self._parse1 = self._parse_main # We ignore comments. #self._tokens.append(self._curtoken) @@ -358,11 +349,11 @@ class PSBaseParser(object): def _parse_literal(self, s, i): m = END_LITERAL.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) - c = bytesindex(s,j) + self._curtoken += s[i:j] + c = s[j:j+1] if c == b'#': self.hex = b'' self._parse1 = self._parse_literal_hex @@ -376,7 +367,7 @@ class PSBaseParser(object): return j def _parse_literal_hex(self, s, i): - c = bytesindex(s,i) + c = s[i:i+1] if HEX.match(c) and len(self.hex) < 2: self.hex += c return i+1 @@ -388,11 +379,11 @@ class PSBaseParser(object): def _parse_number(self, s, i): m = END_NUMBER.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) - c = bytesindex(s,j) + self._curtoken += s[i:j] + c = s[j:j+1] if c == b'.': self._curtoken += c self._parse1 = self._parse_float @@ -407,10 +398,10 @@ class PSBaseParser(object): def _parse_float(self, s, i): m = END_NUMBER.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) + self._curtoken += s[i:j] try: self._add_token(float(self._curtoken)) except ValueError: @@ -421,10 +412,10 @@ class PSBaseParser(object): def _parse_keyword(self, s, i): m = END_KEYWORD.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) + self._curtoken += s[i:j] if self._curtoken == b'true': token = True elif self._curtoken == b'false': @@ -438,11 +429,11 @@ class PSBaseParser(object): def _parse_string(self, s, i): m = END_STRING.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) - c = bytesindex(s,j) + self._curtoken += s[i:j] + c = s[j:j+1] if c == b'\\': self.oct = b'' self._parse1 = self._parse_string_1 @@ -461,7 +452,7 @@ class PSBaseParser(object): return j+1 def _parse_string_1(self, s, i): - c = bytesindex(s,i) + c = s[i:i+1] if OCT_STRING.match(c) and len(self.oct) < 3: self.oct += c return i+1 @@ -475,7 +466,7 @@ class PSBaseParser(object): return i+1 def _parse_wopen(self, s, i): - c = bytesindex(s,i) + c = s[i:i+1] if c == b'<': self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main @@ -485,7 +476,7 @@ class PSBaseParser(object): return i def _parse_wclose(self, s, i): - c = bytesindex(s,i) + c = s[i:i+1] if c == b'>': self._add_token(KEYWORD_DICT_END) i += 1 @@ -495,10 +486,10 @@ class PSBaseParser(object): def _parse_hexstring(self, s, i): m = END_HEX_STRING.search(s, i) if not m: - self._curtoken += bytesindex(s,i,-1) + self._curtoken += s[i:] return len(s) j = m.start(0) - self._curtoken += bytesindex(s,i,j) + self._curtoken += s[i:j] token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken)) self._add_token(token) self._parse1 = self._parse_main