From 1ccfaff41158b96eefe5c7d9a305f8b033560f10 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Mon, 30 Jun 2014 19:05:56 +0900 Subject: [PATCH] String-Bytes distinction (first attempt). --- pdfminer/arcfour.py | 8 +-- pdfminer/ascii85.py | 24 ++++---- pdfminer/ccitt.py | 2 +- pdfminer/cmapdb.py | 34 +++++------ pdfminer/image.py | 2 +- pdfminer/lzw.py | 8 +-- pdfminer/pdfdocument.py | 34 +++++------ pdfminer/pdffont.py | 28 ++++----- pdfminer/pdfinterp.py | 14 ++--- pdfminer/pdfpage.py | 2 +- pdfminer/pdfparser.py | 20 +++---- pdfminer/pdftypes.py | 2 +- pdfminer/psparser.py | 130 ++++++++++++++++++++-------------------- pdfminer/rijndael.py | 12 ++-- pdfminer/runlength.py | 10 ++-- pdfminer/utils.py | 18 +++--- 16 files changed, 174 insertions(+), 174 deletions(-) diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index d16cf89..523d991 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -12,11 +12,11 @@ This code is in the public domain. class Arcfour(object): """ - >>> Arcfour('Key').process('Plaintext').encode('hex') + >>> Arcfour(b'Key').process(b'Plaintext').encode('hex') 'bbf316e8d940af0ad3' - >>> Arcfour('Wiki').process('pedia').encode('hex') + >>> Arcfour(b'Wiki').process(b'pedia').encode('hex') '1021bf0420' - >>> Arcfour('Secret').process('Attack at dawn').encode('hex') + >>> Arcfour(b'Secret').process(b'Attack at dawn').encode('hex') '45a01f645fc35b383552544b9bf5' """ @@ -34,7 +34,7 @@ class Arcfour(object): def process(self, data): (i, j) = (self.i, self.j) s = self.s - r = '' + r = b'' for c in data: i = (i+1) % 256 j = (j+s[i]) % 256 diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index 80df0ba..067fccd 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -24,24 +24,24 @@ def ascii85decode(data): The sample string is taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 - >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q') + >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') 'Man is distinguished' - >>> ascii85decode('E,9)oF*2M7/c~>') + >>> ascii85decode(b'E,9)oF*2M7/c~>') 'pleasure.' """ n = b = 0 - out = '' + out = b'' for c in data: - if '!' <= c and c <= 'u': + if b'!' <= c and c <= b'u': n += 1 b = b*85+(ord(c)-33) if n == 5: out += struct.pack('>L', b) n = b = 0 - elif c == 'z': + elif c == b'z': assert n == 0 - out += '\0\0\0\0' - elif c == '~': + out += b'\0\0\0\0' + elif c == b'~': if n: for _ in range(5-n): b = b*85+84 @@ -64,19 +64,19 @@ def asciihexdecode(data): the EOD marker after reading an odd number of hexadecimal digits, it will behave as if a 0 followed the last digit. - >>> asciihexdecode('61 62 2e6364 65') + >>> asciihexdecode(b'61 62 2e6364 65') 'ab.cde' - >>> asciihexdecode('61 62 2e6364 657>') + >>> asciihexdecode(b'61 62 2e6364 657>') 'ab.cdep' - >>> asciihexdecode('7>') + >>> asciihexdecode(b'7>') 'p' """ decode = (lambda hx: chr(int(hx, 16))) out = map(decode, hex_re.findall(data)) m = trail_re.search(data) if m: - out.append(decode("%c0" % m.group(1))) - return ''.join(out) + out.append(decode('%c0' % m.group(1))) + return b''.join(out) if __name__ == '__main__': diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index bb8181c..d0cc093 100644 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -691,7 +691,7 @@ class CCITTFaxDecoder(CCITTG4Parser): def __init__(self, width, bytealign=False, reversed=False): CCITTG4Parser.__init__(self, width, bytealign=bytealign) self.reversed = reversed - self._buf = '' + self._buf = b'' return def close(self): diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index c53793b..7e0a4e9 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -237,7 +237,7 @@ class CMapDB(object): if os.path.exists(path): gzfile = gzip.open(path) try: - return type(name, (), pickle.loads(gzfile.read())) + return type(str(name), (), pickle.loads(gzfile.read())) finally: gzfile.close() else: @@ -288,17 +288,17 @@ class CMapParser(PSStackParser): def do_keyword(self, pos, token): name = token.name - if name == 'begincmap': + if name == b'begincmap': self._in_cmap = True self.popall() return - elif name == 'endcmap': + elif name == b'endcmap': self._in_cmap = False return if not self._in_cmap: return # - if name == 'def': + if name == b'def': try: ((_, k), (_, v)) = self.pop(2) self.cmap.set_attr(literal_name(k), v) @@ -306,7 +306,7 @@ class CMapParser(PSStackParser): pass return - if name == 'usecmap': + if name == b'usecmap': try: ((_, cmapname),) = self.pop(1) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) @@ -316,17 +316,17 @@ class CMapParser(PSStackParser): pass return - if name == 'begincodespacerange': + if name == b'begincodespacerange': self.popall() return - if name == 'endcodespacerange': + if name == b'endcodespacerange': self.popall() return - if name == 'begincidrange': + if name == b'begincidrange': self.popall() return - if name == 'endcidrange': + if name == b'endcidrange': objs = [obj for (__, obj) in self.popall()] for (s, e, cid) in choplist(3, objs): if (not isinstance(s, str) or not isinstance(e, str) or @@ -347,20 +347,20 @@ class CMapParser(PSStackParser): self.cmap.add_code2cid(x, cid+i) return - if name == 'begincidchar': + if name == b'begincidchar': self.popall() return - if name == 'endcidchar': + if name == b'endcidchar': objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): if isinstance(code, str) and isinstance(cid, str): self.cmap.add_code2cid(code, nunpack(cid)) return - if name == 'beginbfrange': + if name == b'beginbfrange': self.popall() return - if name == 'endbfrange': + if name == b'endbfrange': objs = [obj for (__, obj) in self.popall()] for (s, e, code) in choplist(3, objs): if (not isinstance(s, str) or not isinstance(e, str) or @@ -382,20 +382,20 @@ class CMapParser(PSStackParser): self.cmap.add_cid2unichr(s1+i, x) return - if name == 'beginbfchar': + if name == b'beginbfchar': self.popall() return - if name == 'endbfchar': + if name == b'endbfchar': objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): if isinstance(cid, str) and isinstance(code, str): self.cmap.add_cid2unichr(nunpack(cid), code) return - if name == 'beginnotdefrange': + if name == b'beginnotdefrange': self.popall() return - if name == 'endnotdefrange': + if name == b'endnotdefrange': self.popall() return diff --git a/pdfminer/image.py b/pdfminer/image.py index e796e9c..d937603 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -35,7 +35,7 @@ class BMPWriter(object): headersize = 14+40+ncols*4 info = struct.pack('>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') + >>> lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' """ fp = BytesIO(data) - return ''.join(LZWDecoder(fp).run()) + return b''.join(LZWDecoder(fp).run()) if __name__ == '__main__': import doctest diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 1c28ac2..0072584 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -100,10 +100,10 @@ class PDFXRef(PDFBaseXRef): raise PDFNoValidXRef('Unexpected EOF - file corrupted?') if not line: raise PDFNoValidXRef('Premature eof: %r' % parser) - if line.startswith('trailer'): + if line.startswith(b'trailer'): parser.seek(pos) break - f = line.strip().split(' ') + f = line.strip().split(b' ') if len(f) != 2: raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) try: @@ -115,11 +115,11 @@ class PDFXRef(PDFBaseXRef): (_, line) = parser.nextline() except PSEOF: raise PDFNoValidXRef('Unexpected EOF - file corrupted?') - f = line.strip().split(' ') + f = line.strip().split(b' ') if len(f) != 3: raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) (pos, genno, use) = f - if use != 'n': + if use != b'n': continue self.offsets[objid] = (None, long(pos), int(genno)) logging.info('xref objects: %r' % self.offsets) @@ -170,7 +170,7 @@ class PDFXRefFallback(PDFXRef): (pos, line) = parser.nextline() except PSEOF: break - if line.startswith('trailer'): + if line.startswith(b'trailer'): parser.seek(pos) self.load_trailer(parser) logging.info('trailer: %r' % self.get_trailer()) @@ -284,10 +284,10 @@ class PDFXRefStream(PDFBaseXRef): ## class PDFStandardSecurityHandler(object): - PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' + PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' supported_revisions = (2, 3) - def __init__(self, docid, param, password=''): + def __init__(self, docid, param, password=b''): self.docid = docid self.param = param self.password = password @@ -331,7 +331,7 @@ class PDFStandardSecurityHandler(object): hash.update(self.docid[0]) # 3 result = ARC4.new(key).encrypt(hash.digest()) # 4 for i in range(1, 20): # 5 - k = ''.join(chr(ord(c) ^ i) for c in key) + k = b''.join(chr(ord(c) ^ i) for c in key) result = ARC4.new(k).encrypt(result) result += result # 6 return result @@ -345,7 +345,7 @@ class PDFStandardSecurityHandler(object): hash.update(self.docid[0]) # 5 if self.r >= 4: if not self.encrypt_metadata: - hash.update('\xff\xff\xff\xff') + hash.update(b'\xff\xff\xff\xff') result = hash.digest() n = 5 if self.r >= 3: @@ -388,7 +388,7 @@ class PDFStandardSecurityHandler(object): else: user_password = self.o for i in range(19, -1, -1): - k = ''.join(chr(ord(c) ^ i) for c in key) + k = b''.join(chr(ord(c) ^ i) for c in key) user_password = ARC4.new(k).decrypt(user_password) return self.authenticate_user_password(user_password) @@ -444,7 +444,7 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler): return data def decrypt_aes128(self, objid, genno, data): - key = self.key + struct.pack(''+'H'*n, self.fp.read(2*n))): @@ -342,7 +342,7 @@ class CFFFont(object): name = self.getstr(sid) self.name2gid[name] = gid self.gid2name[gid] = name - elif format == '\x01': + elif format == b'\x01': # Format 1 (n,) = struct.unpack('B', self.fp.read(1)) sid = 0 @@ -353,7 +353,7 @@ class CFFFont(object): self.name2gid[name] = gid self.gid2name[gid] = name sid += 1 - elif format == '\x02': + elif format == b'\x02': # Format 2 assert 0 else: diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index edf7c4f..3b368e0 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -246,10 +246,10 @@ class PDFContentParser(PSStackParser): self.charpos = 0 return - def get_inline_data(self, pos, target='EI'): + def get_inline_data(self, pos, target=b'EI'): self.seek(pos) i = 0 - data = '' + data = b'' while i <= len(target): self.fillbuf() if i: @@ -273,16 +273,16 @@ class PDFContentParser(PSStackParser): data += self.buf[self.charpos:] self.charpos = len(self.buf) data = data[:-(len(target)+1)] # strip the last part - data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data) + data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data) return (pos, data) def flush(self): self.add_results(*self.popall()) return - KEYWORD_BI = KWD('BI') - KEYWORD_ID = KWD('ID') - KEYWORD_EI = KWD('EI') + KEYWORD_BI = KWD(b'BI') + KEYWORD_ID = KWD(b'ID') + KEYWORD_EI = KWD(b'EI') def do_keyword(self, pos, token): if token is self.KEYWORD_BI: @@ -294,7 +294,7 @@ class PDFContentParser(PSStackParser): if len(objs) % 2 != 0: raise PSTypeError('Invalid dictionary construct: %r' % objs) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs)) - (pos, data) = self.get_inline_data(pos+len('ID ')) + (pos, data) = self.get_inline_data(pos+len(b'ID ')) obj = PDFStream(d, data) self.push((pos, obj)) self.push((pos, self.KEYWORD_EI)) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 5cf081a..fcdf17b 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -112,7 +112,7 @@ class PDFPage(object): @classmethod def get_pages(klass, fp, - pagenos=None, maxpages=0, password='', + pagenos=None, maxpages=0, password=b'', caching=True, check_extractable=True): # Create a PDF parser object associated with the file object. parser = PDFParser(fp) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index f061917..61eb1dc 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -50,12 +50,12 @@ class PDFParser(PSStackParser): self.doc = doc return - KEYWORD_R = KWD('R') - KEYWORD_NULL = KWD('null') - KEYWORD_ENDOBJ = KWD('endobj') - KEYWORD_STREAM = KWD('stream') - KEYWORD_XREF = KWD('xref') - KEYWORD_STARTXREF = KWD('startxref') + KEYWORD_R = KWD(b'R') + KEYWORD_NULL = KWD(b'null') + KEYWORD_ENDOBJ = KWD(b'endobj') + KEYWORD_STREAM = KWD(b'stream') + KEYWORD_XREF = KWD(b'xref') + KEYWORD_STARTXREF = KWD(b'startxref') def do_keyword(self, pos, token): """Handles PDF-related keywords.""" @@ -109,8 +109,8 @@ class PDFParser(PSStackParser): if STRICT: raise PDFSyntaxError('Unexpected EOF') break - if 'endstream' in line: - i = line.index('endstream') + if b'endstream' in line: + i = line.index(b'endstream') objlen += i if self.fallback: data += line[:i] @@ -153,7 +153,7 @@ class PDFStreamParser(PDFParser): self.add_results(*self.popall()) return - KEYWORD_OBJ = KWD('obj') + KEYWORD_OBJ = KWD(b'obj') def do_keyword(self, pos, token): if token is self.KEYWORD_R: # reference to indirect object @@ -169,7 +169,7 @@ class PDFStreamParser(PDFParser): if STRICT: # See PDF Spec 3.4.6: Only the object values are stored in the # stream; the obj and endobj keywords are not used. - raise PDFSyntaxError("Keyword endobj found in stream") + raise PDFSyntaxError('Keyword endobj found in stream') return # others self.push((pos, token)) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 05294e7..8176055 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -244,7 +244,7 @@ class PDFStream(PDFObject): except zlib.error as e: if STRICT: raise PDFException('Invalid zlib bytes: %r, %r' % (e, data)) - data = '' + data = b'' elif f in LITERALS_LZW_DECODE: data = lzwdecode(data) elif f in LITERALS_ASCII85_DECODE: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 7c83391..c1ebe93 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -111,12 +111,12 @@ PSLiteralTable = PSSymbolTable(PSLiteral) PSKeywordTable = PSSymbolTable(PSKeyword) LIT = PSLiteralTable.intern KWD = PSKeywordTable.intern -KEYWORD_PROC_BEGIN = KWD('{') -KEYWORD_PROC_END = KWD('}') -KEYWORD_ARRAY_BEGIN = KWD('[') -KEYWORD_ARRAY_END = KWD(']') -KEYWORD_DICT_BEGIN = KWD('<<') -KEYWORD_DICT_END = KWD('>>') +KEYWORD_PROC_BEGIN = KWD(b'{') +KEYWORD_PROC_END = KWD(b'}') +KEYWORD_ARRAY_BEGIN = KWD(b'[') +KEYWORD_ARRAY_END = KWD(b']') +KEYWORD_DICT_BEGIN = KWD(b'<<') +KEYWORD_DICT_END = KWD(b'>>') def literal_name(x): @@ -139,18 +139,18 @@ def keyword_name(x): ## PSBaseParser ## -EOL = re.compile(r'[\r\n]') -SPC = re.compile(r'\s') -NONSPC = re.compile(r'\S') -HEX = re.compile(r'[0-9a-fA-F]') -END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]') -END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]') -HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.') -END_NUMBER = re.compile(r'[^0-9]') -END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') -END_STRING = re.compile(r'[()\134]') -OCT_STRING = re.compile(r'[0-7]') -ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92} +EOL = re.compile(br'[\r\n]') +SPC = re.compile(br'\s') +NONSPC = re.compile(br'\S') +HEX = re.compile(br'[0-9a-fA-F]') +END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]') +END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]') +HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.') +END_NUMBER = re.compile(br'[^0-9]') +END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]') +END_STRING = re.compile(br'[()\134]') +OCT_STRING = re.compile(br'[0-7]') +ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, b'(': 40, b')': 41, b'\\': 92} class PSBaseParser(object): @@ -196,11 +196,11 @@ class PSBaseParser(object): self.fp.seek(pos) # reset the status for nextline() self.bufpos = pos - self.buf = '' + self.buf = b'' self.charpos = 0 # reset the status for nexttoken() self._parse1 = self._parse_main - self._curtoken = '' + self._curtoken = b'' self._curtokenpos = 0 self._tokens = [] return @@ -219,15 +219,15 @@ class PSBaseParser(object): def nextline(self): """Fetches a next line that ends either with \\r or \\n. """ - linebuf = '' + linebuf = b'' linepos = self.bufpos + self.charpos eol = False while 1: self.fillbuf() if eol: c = self.buf[self.charpos] - # handle '\r\n' - if c == '\n': + # handle b'\r\n' + if c == b'\n': linebuf += c self.charpos += 1 break @@ -235,7 +235,7 @@ class PSBaseParser(object): if m: linebuf += self.buf[self.charpos:m.end(0)] self.charpos = m.end(0) - if linebuf[-1] == '\r': + if linebuf[-1] == b'\r': eol = True else: break @@ -253,7 +253,7 @@ class PSBaseParser(object): """ self.fp.seek(0, 2) pos = self.fp.tell() - buf = '' + buf = b'' while 0 < pos: prevpos = pos pos = max(0, pos-self.BUFSIZ) @@ -262,13 +262,13 @@ class PSBaseParser(object): if not s: break while 1: - n = max(s.rfind('\r'), s.rfind('\n')) + n = max(s.rfind(b'\r'), s.rfind(b'\n')) if n == -1: buf = s + buf break yield s[n:]+buf s = s[:n] - buf = '' + buf = b'' return def _parse_main(self, s, i): @@ -278,19 +278,19 @@ class PSBaseParser(object): j = m.start(0) c = s[j] self._curtokenpos = self.bufpos+j - if c == '%': - self._curtoken = '%' + if c == b'%': + self._curtoken = b'%' self._parse1 = self._parse_comment return j+1 - elif c == '/': - self._curtoken = '' + elif c == b'/': + self._curtoken = b'' self._parse1 = self._parse_literal return j+1 - elif c in '-+' or c.isdigit(): + elif c in b'-+' or c.isdigit(): self._curtoken = c self._parse1 = self._parse_number return j+1 - elif c == '.': + elif c == b'.': self._curtoken = c self._parse1 = self._parse_float return j+1 @@ -298,17 +298,17 @@ class PSBaseParser(object): self._curtoken = c self._parse1 = self._parse_keyword return j+1 - elif c == '(': - self._curtoken = '' + elif c == b'(': + self._curtoken = b'' self.paren = 1 self._parse1 = self._parse_string return j+1 - elif c == '<': - self._curtoken = '' + elif c == b'<': + self._curtoken = b'' self._parse1 = self._parse_wopen return j+1 - elif c == '>': - self._curtoken = '' + elif c == b'>': + self._curtoken = b'' self._parse1 = self._parse_wclose return j+1 else: @@ -339,11 +339,11 @@ class PSBaseParser(object): j = m.start(0) self._curtoken += s[i:j] c = s[j] - if c == '#': - self.hex = '' + if c == b'#': + self.hex = b'' self._parse1 = self._parse_literal_hex return j+1 - self._add_token(LIT(self._curtoken)) + self._add_token(LIT(unicode(self._curtoken))) self._parse1 = self._parse_main return j @@ -365,7 +365,7 @@ class PSBaseParser(object): j = m.start(0) self._curtoken += s[i:j] c = s[j] - if c == '.': + if c == b'.': self._curtoken += c self._parse1 = self._parse_float return j+1 @@ -397,9 +397,9 @@ class PSBaseParser(object): return len(s) j = m.start(0) self._curtoken += s[i:j] - if self._curtoken == 'true': + if self._curtoken == b'true': token = True - elif self._curtoken == 'false': + elif self._curtoken == b'false': token = False else: token = KWD(self._curtoken) @@ -415,20 +415,20 @@ class PSBaseParser(object): j = m.start(0) self._curtoken += s[i:j] c = s[j] - if c == '\\': - self.oct = '' + if c == b'\\': + self.oct = b'' self._parse1 = self._parse_string_1 return j+1 - if c == '(': + if c == b'(': self.paren += 1 self._curtoken += c return j+1 - if c == ')': + if c == b')': self.paren -= 1 if self.paren: # WTF, they said balanced parens need no special treatment. self._curtoken += c return j+1 - self._add_token(self._curtoken) + self._add_token(str(self._curtoken)) self._parse1 = self._parse_main return j+1 @@ -448,7 +448,7 @@ class PSBaseParser(object): def _parse_wopen(self, s, i): c = s[i] - if c == '<': + if c == b'<': self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main i += 1 @@ -458,7 +458,7 @@ class PSBaseParser(object): def _parse_wclose(self, s, i): c = s[i] - if c == '>': + if c == b'>': self._add_token(KEYWORD_DICT_END) i += 1 self._parse1 = self._parse_main @@ -472,7 +472,7 @@ class PSBaseParser(object): j = m.start(0) self._curtoken += s[i:j] token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), - SPC.sub('', self._curtoken)) + SPC.sub(b'', self._curtoken)) self._add_token(token) self._parse1 = self._parse_main return j @@ -616,7 +616,7 @@ import unittest ## class TestPSBaseParser(unittest.TestCase): - TESTDATA = r'''%!PS + TESTDATA = br'''%!PS begin end " @ # /a/BCD /Some_Name /foo#5f#xbaa @@ -637,18 +637,18 @@ func/a/b{(c)do*}def ''' TOKENS = [ - (5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')), - (21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), + (5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')), + (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), - (65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'), - (98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'), - (143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'), - (191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'), - (226, KWD('func')), (230, LIT('a')), (232, LIT('b')), - (234, KWD('{')), (235, 'c'), (238, KWD('do*')), (241, KWD('}')), - (242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')), - (256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'), - (272, KWD('>>')) + (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'), + (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'), + (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'), + (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'), + (226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')), + (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')), + (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')), + (256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'), + (272, KWD(b'>>')) ] OBJS = [ diff --git a/pdfminer/rijndael.py b/pdfminer/rijndael.py index 6216737..756dd46 100644 --- a/pdfminer/rijndael.py +++ b/pdfminer/rijndael.py @@ -898,7 +898,7 @@ def rijndaelEncrypt(rk, nrounds, plaintext): Te3[(t2 ) & 0xff] ^ rk[p+3]) - ciphertext = '' + ciphertext = b'' # apply last round and # map cipher state to byte array block: @@ -1001,7 +1001,7 @@ def rijndaelDecrypt(rk, nrounds, ciphertext): Td3[(t0 ) & 0xff] ^ rk[p+3]) - plaintext = '' + plaintext = b'' # apply last round and # map cipher state to byte array block: @@ -1042,8 +1042,8 @@ def rijndaelDecrypt(rk, nrounds, ciphertext): class RijndaelDecryptor(object): """ - >>> key = '00010203050607080a0b0c0d0f101112'.decode('hex') - >>> ciphertext = 'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex') + >>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex') + >>> ciphertext = b'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex') >>> RijndaelDecryptor(key, 128).decrypt(ciphertext).encode('hex') '506812a45f08c889b97f5980038b8359' """ @@ -1064,8 +1064,8 @@ class RijndaelDecryptor(object): class RijndaelEncryptor(object): """ - >>> key = '00010203050607080a0b0c0d0f101112'.decode('hex') - >>> plaintext = '506812a45f08c889b97f5980038b8359'.decode('hex') + >>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex') + >>> plaintext = b'506812a45f08c889b97f5980038b8359'.decode('hex') >>> RijndaelEncryptor(key, 128).encrypt(plaintext).encode('hex') 'd8f532538289ef7d06b506a4fd5be9c9' """ diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py index 4637198..ba7b742 100644 --- a/pdfminer/runlength.py +++ b/pdfminer/runlength.py @@ -19,28 +19,28 @@ def rldecode(data): 129 to 255, the following single byte is to be copied 257 - length (2 to 128) times during decompression. A length value of 128 denotes EOD. - >>> s = "\x05123456\xfa7\x04abcde\x80junk" + >>> s = b'\x05123456\xfa7\x04abcde\x80junk' >>> rldecode(s) '1234567777777abcde' """ decoded = [] i = 0 while i < len(data): - #print "data[%d]=:%d:" % (i,ord(data[i])) + #print 'data[%d]=:%d:' % (i,ord(data[i])) length = ord(data[i]) if length == 128: break if length >= 0 and length < 128: run = data[i+1:(i+1)+(length+1)] - #print "length=%d, run=%s" % (length+1,run) + #print 'length=%d, run=%s' % (length+1,run) decoded.append(run) i = (i+1) + (length+1) if length > 128: run = data[i+1]*(257-length) - #print "length=%d, run=%s" % (257-length,run) + #print 'length=%d, run=%s' % (257-length,run) decoded.append(run) i = (i+1) + 1 - return ''.join(decoded) + return b''.join(decoded) if __name__ == '__main__': diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 91e19c0..b53c1c1 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -14,28 +14,28 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): raise ValueError(bitspercomponent) nbytes = colors*columns*bitspercomponent//8 i = 0 - buf = '' - line0 = '\x00' * columns + buf = b'' + line0 = b'\x00' * columns for i in xrange(0, len(data), nbytes+1): ft = data[i] i += 1 line1 = data[i:i+nbytes] - line2 = '' - if ft == '\x00': + line2 = b'' + if ft == b'\x00': # PNG none line2 += line1 - elif ft == '\x01': + elif ft == b'\x01': # PNG sub (UNTESTED) c = 0 for b in line1: c = (c+ord(b)) & 255 line2 += chr(c) - elif ft == '\x02': + elif ft == b'\x02': # PNG up for (a, b) in zip(line0, line1): c = (ord(a)+ord(b)) & 255 line2 += chr(c) - elif ft == '\x03': + elif ft == b'\x03': # PNG average (UNTESTED) c = 0 for (a, b) in zip(line0, line1): @@ -176,7 +176,7 @@ def nunpack(s, default=0): elif l == 2: return struct.unpack('>H', s)[0] elif l == 3: - return struct.unpack('>L', '\x00'+s)[0] + return struct.unpack('>L', b'\x00'+s)[0] elif l == 4: return struct.unpack('>L', s)[0] else: @@ -222,7 +222,7 @@ PDFDocEncoding = ''.join(unichr(x) for x in ( def decode_text(s): """Decodes a PDFDocEncoding string to Unicode.""" - if s.startswith('\xfe\xff'): + if s.startswith(b'\xfe\xff'): return unicode(s[2:], 'utf-16be', 'ignore') else: return ''.join(PDFDocEncoding[ord(c)] for c in s)