String-Bytes distinction (first attempt).

pull/1/head
Yusuke Shinyama 2014-06-30 19:05:56 +09:00
parent 8791355e1d
commit 1ccfaff411
16 changed files with 174 additions and 174 deletions

View File

@ -12,11 +12,11 @@ This code is in the public domain.
class Arcfour(object): class Arcfour(object):
""" """
>>> Arcfour('Key').process('Plaintext').encode('hex') >>> Arcfour(b'Key').process(b'Plaintext').encode('hex')
'bbf316e8d940af0ad3' 'bbf316e8d940af0ad3'
>>> Arcfour('Wiki').process('pedia').encode('hex') >>> Arcfour(b'Wiki').process(b'pedia').encode('hex')
'1021bf0420' '1021bf0420'
>>> Arcfour('Secret').process('Attack at dawn').encode('hex') >>> Arcfour(b'Secret').process(b'Attack at dawn').encode('hex')
'45a01f645fc35b383552544b9bf5' '45a01f645fc35b383552544b9bf5'
""" """
@ -34,7 +34,7 @@ class Arcfour(object):
def process(self, data): def process(self, data):
(i, j) = (self.i, self.j) (i, j) = (self.i, self.j)
s = self.s s = self.s
r = '' r = b''
for c in data: for c in data:
i = (i+1) % 256 i = (i+1) % 256
j = (j+s[i]) % 256 j = (j+s[i]) % 256

View File

@ -24,24 +24,24 @@ def ascii85decode(data):
The sample string is taken from: The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85 http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q') >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished' 'Man is distinguished'
>>> ascii85decode('E,9)oF*2M7/c~>') >>> ascii85decode(b'E,9)oF*2M7/c~>')
'pleasure.' 'pleasure.'
""" """
n = b = 0 n = b = 0
out = '' out = b''
for c in data: for c in data:
if '!' <= c and c <= 'u': if b'!' <= c and c <= b'u':
n += 1 n += 1
b = b*85+(ord(c)-33) b = b*85+(ord(c)-33)
if n == 5: if n == 5:
out += struct.pack('>L', b) out += struct.pack('>L', b)
n = b = 0 n = b = 0
elif c == 'z': elif c == b'z':
assert n == 0 assert n == 0
out += '\0\0\0\0' out += b'\0\0\0\0'
elif c == '~': elif c == b'~':
if n: if n:
for _ in range(5-n): for _ in range(5-n):
b = b*85+84 b = b*85+84
@ -64,19 +64,19 @@ def asciihexdecode(data):
the EOD marker after reading an odd number of hexadecimal digits, it the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit. will behave as if a 0 followed the last digit.
>>> asciihexdecode('61 62 2e6364 65') >>> asciihexdecode(b'61 62 2e6364 65')
'ab.cde' 'ab.cde'
>>> asciihexdecode('61 62 2e6364 657>') >>> asciihexdecode(b'61 62 2e6364 657>')
'ab.cdep' 'ab.cdep'
>>> asciihexdecode('7>') >>> asciihexdecode(b'7>')
'p' 'p'
""" """
decode = (lambda hx: chr(int(hx, 16))) decode = (lambda hx: chr(int(hx, 16)))
out = map(decode, hex_re.findall(data)) out = map(decode, hex_re.findall(data))
m = trail_re.search(data) m = trail_re.search(data)
if m: if m:
out.append(decode("%c0" % m.group(1))) out.append(decode('%c0' % m.group(1)))
return ''.join(out) return b''.join(out)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -691,7 +691,7 @@ class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width, bytealign=False, reversed=False): def __init__(self, width, bytealign=False, reversed=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign) CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.reversed = reversed self.reversed = reversed
self._buf = '' self._buf = b''
return return
def close(self): def close(self):

View File

@ -237,7 +237,7 @@ class CMapDB(object):
if os.path.exists(path): if os.path.exists(path):
gzfile = gzip.open(path) gzfile = gzip.open(path)
try: try:
return type(name, (), pickle.loads(gzfile.read())) return type(str(name), (), pickle.loads(gzfile.read()))
finally: finally:
gzfile.close() gzfile.close()
else: else:
@ -288,17 +288,17 @@ class CMapParser(PSStackParser):
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
name = token.name name = token.name
if name == 'begincmap': if name == b'begincmap':
self._in_cmap = True self._in_cmap = True
self.popall() self.popall()
return return
elif name == 'endcmap': elif name == b'endcmap':
self._in_cmap = False self._in_cmap = False
return return
if not self._in_cmap: if not self._in_cmap:
return return
# #
if name == 'def': if name == b'def':
try: try:
((_, k), (_, v)) = self.pop(2) ((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v) self.cmap.set_attr(literal_name(k), v)
@ -306,7 +306,7 @@ class CMapParser(PSStackParser):
pass pass
return return
if name == 'usecmap': if name == b'usecmap':
try: try:
((_, cmapname),) = self.pop(1) ((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
@ -316,17 +316,17 @@ class CMapParser(PSStackParser):
pass pass
return return
if name == 'begincodespacerange': if name == b'begincodespacerange':
self.popall() self.popall()
return return
if name == 'endcodespacerange': if name == b'endcodespacerange':
self.popall() self.popall()
return return
if name == 'begincidrange': if name == b'begincidrange':
self.popall() self.popall()
return return
if name == 'endcidrange': if name == b'endcidrange':
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs): for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
@ -347,20 +347,20 @@ class CMapParser(PSStackParser):
self.cmap.add_code2cid(x, cid+i) self.cmap.add_code2cid(x, cid+i)
return return
if name == 'begincidchar': if name == b'begincidchar':
self.popall() self.popall()
return return
if name == 'endcidchar': if name == b'endcidchar':
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str): if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid)) self.cmap.add_code2cid(code, nunpack(cid))
return return
if name == 'beginbfrange': if name == b'beginbfrange':
self.popall() self.popall()
return return
if name == 'endbfrange': if name == b'endbfrange':
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs): for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
@ -382,20 +382,20 @@ class CMapParser(PSStackParser):
self.cmap.add_cid2unichr(s1+i, x) self.cmap.add_cid2unichr(s1+i, x)
return return
if name == 'beginbfchar': if name == b'beginbfchar':
self.popall() self.popall()
return return
if name == 'endbfchar': if name == b'endbfchar':
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code) self.cmap.add_cid2unichr(nunpack(cid), code)
return return
if name == 'beginnotdefrange': if name == b'beginnotdefrange':
self.popall() self.popall()
return return
if name == 'endnotdefrange': if name == b'endnotdefrange':
self.popall() self.popall()
return return

View File

@ -35,7 +35,7 @@ class BMPWriter(object):
headersize = 14+40+ncols*4 headersize = 14+40+ncols*4
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height, 1, self.bits, 0, self.datasize, 0, 0, ncols, 0) info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height, 1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
assert len(info) == 40, len(info) assert len(info) == 40, len(info)
header = struct.pack('<ccIHHI', 'B', 'M', headersize+self.datasize, 0, 0, headersize) header = struct.pack('<ccIHHI', b'B', b'M', headersize+self.datasize, 0, 0, headersize)
assert len(header) == 14, len(header) assert len(header) == 14, len(header)
self.fp.write(header) self.fp.write(header)
self.fp.write(info) self.fp.write(info)

View File

@ -45,12 +45,12 @@ class LZWDecoder(object):
return v return v
def feed(self, code): def feed(self, code):
x = '' x = b''
if code == 256: if code == 256:
self.table = [chr(c) for c in xrange(256)] # 0-255 self.table = [chr(c) for c in xrange(256)] # 0-255
self.table.append(None) # 256 self.table.append(None) # 256
self.table.append(None) # 257 self.table.append(None) # 257
self.prevbuf = '' self.prevbuf = b''
self.nbits = 9 self.nbits = 9
elif code == 257: elif code == 257:
pass pass
@ -95,11 +95,11 @@ class LZWDecoder(object):
# lzwdecode # lzwdecode
def lzwdecode(data): def lzwdecode(data):
""" """
>>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') >>> lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
""" """
fp = BytesIO(data) fp = BytesIO(data)
return ''.join(LZWDecoder(fp).run()) return b''.join(LZWDecoder(fp).run())
if __name__ == '__main__': if __name__ == '__main__':
import doctest import doctest

View File

@ -100,10 +100,10 @@ class PDFXRef(PDFBaseXRef):
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line: if not line:
raise PDFNoValidXRef('Premature eof: %r' % parser) raise PDFNoValidXRef('Premature eof: %r' % parser)
if line.startswith('trailer'): if line.startswith(b'trailer'):
parser.seek(pos) parser.seek(pos)
break break
f = line.strip().split(' ') f = line.strip().split(b' ')
if len(f) != 2: if len(f) != 2:
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
try: try:
@ -115,11 +115,11 @@ class PDFXRef(PDFBaseXRef):
(_, line) = parser.nextline() (_, line) = parser.nextline()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.strip().split(' ') f = line.strip().split(b' ')
if len(f) != 3: if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f (pos, genno, use) = f
if use != 'n': if use != b'n':
continue continue
self.offsets[objid] = (None, long(pos), int(genno)) self.offsets[objid] = (None, long(pos), int(genno))
logging.info('xref objects: %r' % self.offsets) logging.info('xref objects: %r' % self.offsets)
@ -170,7 +170,7 @@ class PDFXRefFallback(PDFXRef):
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
except PSEOF: except PSEOF:
break break
if line.startswith('trailer'): if line.startswith(b'trailer'):
parser.seek(pos) parser.seek(pos)
self.load_trailer(parser) self.load_trailer(parser)
logging.info('trailer: %r' % self.get_trailer()) logging.info('trailer: %r' % self.get_trailer())
@ -284,10 +284,10 @@ class PDFXRefStream(PDFBaseXRef):
## ##
class PDFStandardSecurityHandler(object): class PDFStandardSecurityHandler(object):
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
supported_revisions = (2, 3) supported_revisions = (2, 3)
def __init__(self, docid, param, password=''): def __init__(self, docid, param, password=b''):
self.docid = docid self.docid = docid
self.param = param self.param = param
self.password = password self.password = password
@ -331,7 +331,7 @@ class PDFStandardSecurityHandler(object):
hash.update(self.docid[0]) # 3 hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4 result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5 for i in range(1, 20): # 5
k = ''.join(chr(ord(c) ^ i) for c in key) k = b''.join(chr(ord(c) ^ i) for c in key)
result = ARC4.new(k).encrypt(result) result = ARC4.new(k).encrypt(result)
result += result # 6 result += result # 6
return result return result
@ -345,7 +345,7 @@ class PDFStandardSecurityHandler(object):
hash.update(self.docid[0]) # 5 hash.update(self.docid[0]) # 5
if self.r >= 4: if self.r >= 4:
if not self.encrypt_metadata: if not self.encrypt_metadata:
hash.update('\xff\xff\xff\xff') hash.update(b'\xff\xff\xff\xff')
result = hash.digest() result = hash.digest()
n = 5 n = 5
if self.r >= 3: if self.r >= 3:
@ -388,7 +388,7 @@ class PDFStandardSecurityHandler(object):
else: else:
user_password = self.o user_password = self.o
for i in range(19, -1, -1): for i in range(19, -1, -1):
k = ''.join(chr(ord(c) ^ i) for c in key) k = b''.join(chr(ord(c) ^ i) for c in key)
user_password = ARC4.new(k).decrypt(user_password) user_password = ARC4.new(k).decrypt(user_password)
return self.authenticate_user_password(user_password) return self.authenticate_user_password(user_password)
@ -444,7 +444,7 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
return data return data
def decrypt_aes128(self, objid, genno, data): def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + "sAlT" key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + b'sAlT'
hash = md5.md5(key) hash = md5.md5(key)
key = hash.digest()[:min(len(key), 16)] key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:]) return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
@ -479,13 +479,13 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
hash = SHA256.new(password) hash = SHA256.new(password)
hash.update(self.o_key_salt) hash.update(self.o_key_salt)
hash.update(self.u) hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.oe) return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.oe)
hash = SHA256.new(password) hash = SHA256.new(password)
hash.update(self.u_validation_salt) hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash: if hash.digest() == self.u_hash:
hash = SHA256.new(password) hash = SHA256.new(password)
hash.update(self.u_key_salt) hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV='\x00' * 16).decrypt(self.ue) return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.ue)
def decrypt_aes256(self, objid, genno, data): def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:]) return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
@ -517,7 +517,7 @@ class PDFDocument(object):
security_handler_registry[5] = PDFStandardSecurityHandlerV5 security_handler_registry[5] = PDFStandardSecurityHandlerV5
debug = 0 debug = 0
def __init__(self, parser, password='', caching=True, fallback=True): def __init__(self, parser, password=b'', caching=True, fallback=True):
"Set the document to use a given PDFParser object." "Set the document to use a given PDFParser object."
self.caching = caching self.caching = caching
self.xrefs = [] self.xrefs = []
@ -566,9 +566,9 @@ class PDFDocument(object):
raise PDFSyntaxError('Catalog not found!') raise PDFSyntaxError('Catalog not found!')
return return
# _initialize_password(password='') # _initialize_password(password=b'')
# Perform the initialization with a given password. # Perform the initialization with a given password.
def _initialize_password(self, password=''): def _initialize_password(self, password=b''):
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard': if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param) raise PDFEncryptionError('Unknown filter: param=%r' % param)
@ -740,7 +740,7 @@ class PDFDocument(object):
line = line.strip() line = line.strip()
if self.debug: if self.debug:
logging.debug('find_xref: %r' % line) logging.debug('find_xref: %r' % line)
if line == 'startxref': if line == b'startxref':
break break
if line: if line:
prev = line prev = line

View File

@ -88,15 +88,15 @@ class FontMetricsDB(object):
## ##
class Type1FontHeaderParser(PSStackParser): class Type1FontHeaderParser(PSStackParser):
KEYWORD_BEGIN = KWD('begin') KEYWORD_BEGIN = KWD(b'begin')
KEYWORD_END = KWD('end') KEYWORD_END = KWD(b'end')
KEYWORD_DEF = KWD('def') KEYWORD_DEF = KWD(b'def')
KEYWORD_PUT = KWD('put') KEYWORD_PUT = KWD(b'put')
KEYWORD_DICT = KWD('dict') KEYWORD_DICT = KWD(b'dict')
KEYWORD_ARRAY = KWD('array') KEYWORD_ARRAY = KWD(b'array')
KEYWORD_READONLY = KWD('readonly') KEYWORD_READONLY = KWD(b'readonly')
KEYWORD_FOR = KWD('for') KEYWORD_FOR = KWD(b'for')
KEYWORD_FOR = KWD('for') KEYWORD_FOR = KWD(b'for')
def __init__(self, data): def __init__(self, data):
PSStackParser.__init__(self, data) PSStackParser.__init__(self, data)
@ -311,13 +311,13 @@ class CFFFont(object):
self.gid2code = {} self.gid2code = {}
self.fp.seek(encoding_pos) self.fp.seek(encoding_pos)
format = self.fp.read(1) format = self.fp.read(1)
if format == '\x00': if format == b'\x00':
# Format 0 # Format 0
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))): for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
self.code2gid[code] = gid self.code2gid[code] = gid
self.gid2code[gid] = code self.gid2code[gid] = code
elif format == '\x01': elif format == b'\x01':
# Format 1 # Format 1
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
code = 0 code = 0
@ -334,7 +334,7 @@ class CFFFont(object):
self.gid2name = {} self.gid2name = {}
self.fp.seek(charset_pos) self.fp.seek(charset_pos)
format = self.fp.read(1) format = self.fp.read(1)
if format == '\x00': if format == b'\x00':
# Format 0 # Format 0
n = self.nglyphs-1 n = self.nglyphs-1
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))): for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
@ -342,7 +342,7 @@ class CFFFont(object):
name = self.getstr(sid) name = self.getstr(sid)
self.name2gid[name] = gid self.name2gid[name] = gid
self.gid2name[gid] = name self.gid2name[gid] = name
elif format == '\x01': elif format == b'\x01':
# Format 1 # Format 1
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
sid = 0 sid = 0
@ -353,7 +353,7 @@ class CFFFont(object):
self.name2gid[name] = gid self.name2gid[name] = gid
self.gid2name[gid] = name self.gid2name[gid] = name
sid += 1 sid += 1
elif format == '\x02': elif format == b'\x02':
# Format 2 # Format 2
assert 0 assert 0
else: else:

View File

@ -246,10 +246,10 @@ class PDFContentParser(PSStackParser):
self.charpos = 0 self.charpos = 0
return return
def get_inline_data(self, pos, target='EI'): def get_inline_data(self, pos, target=b'EI'):
self.seek(pos) self.seek(pos)
i = 0 i = 0
data = '' data = b''
while i <= len(target): while i <= len(target):
self.fillbuf() self.fillbuf()
if i: if i:
@ -273,16 +273,16 @@ class PDFContentParser(PSStackParser):
data += self.buf[self.charpos:] data += self.buf[self.charpos:]
self.charpos = len(self.buf) self.charpos = len(self.buf)
data = data[:-(len(target)+1)] # strip the last part data = data[:-(len(target)+1)] # strip the last part
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data) data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
return (pos, data) return (pos, data)
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
return return
KEYWORD_BI = KWD('BI') KEYWORD_BI = KWD(b'BI')
KEYWORD_ID = KWD('ID') KEYWORD_ID = KWD(b'ID')
KEYWORD_EI = KWD('EI') KEYWORD_EI = KWD(b'EI')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token is self.KEYWORD_BI: if token is self.KEYWORD_BI:
@ -294,7 +294,7 @@ class PDFContentParser(PSStackParser):
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
raise PSTypeError('Invalid dictionary construct: %r' % objs) raise PSTypeError('Invalid dictionary construct: %r' % objs)
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs)) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
(pos, data) = self.get_inline_data(pos+len('ID ')) (pos, data) = self.get_inline_data(pos+len(b'ID '))
obj = PDFStream(d, data) obj = PDFStream(d, data)
self.push((pos, obj)) self.push((pos, obj))
self.push((pos, self.KEYWORD_EI)) self.push((pos, self.KEYWORD_EI))

View File

@ -112,7 +112,7 @@ class PDFPage(object):
@classmethod @classmethod
def get_pages(klass, fp, def get_pages(klass, fp,
pagenos=None, maxpages=0, password='', pagenos=None, maxpages=0, password=b'',
caching=True, check_extractable=True): caching=True, check_extractable=True):
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(fp) parser = PDFParser(fp)

View File

@ -50,12 +50,12 @@ class PDFParser(PSStackParser):
self.doc = doc self.doc = doc
return return
KEYWORD_R = KWD('R') KEYWORD_R = KWD(b'R')
KEYWORD_NULL = KWD('null') KEYWORD_NULL = KWD(b'null')
KEYWORD_ENDOBJ = KWD('endobj') KEYWORD_ENDOBJ = KWD(b'endobj')
KEYWORD_STREAM = KWD('stream') KEYWORD_STREAM = KWD(b'stream')
KEYWORD_XREF = KWD('xref') KEYWORD_XREF = KWD(b'xref')
KEYWORD_STARTXREF = KWD('startxref') KEYWORD_STARTXREF = KWD(b'startxref')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
"""Handles PDF-related keywords.""" """Handles PDF-related keywords."""
@ -109,8 +109,8 @@ class PDFParser(PSStackParser):
if STRICT: if STRICT:
raise PDFSyntaxError('Unexpected EOF') raise PDFSyntaxError('Unexpected EOF')
break break
if 'endstream' in line: if b'endstream' in line:
i = line.index('endstream') i = line.index(b'endstream')
objlen += i objlen += i
if self.fallback: if self.fallback:
data += line[:i] data += line[:i]
@ -153,7 +153,7 @@ class PDFStreamParser(PDFParser):
self.add_results(*self.popall()) self.add_results(*self.popall())
return return
KEYWORD_OBJ = KWD('obj') KEYWORD_OBJ = KWD(b'obj')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token is self.KEYWORD_R: if token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object
@ -169,7 +169,7 @@ class PDFStreamParser(PDFParser):
if STRICT: if STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the # See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used. # stream; the obj and endobj keywords are not used.
raise PDFSyntaxError("Keyword endobj found in stream") raise PDFSyntaxError('Keyword endobj found in stream')
return return
# others # others
self.push((pos, token)) self.push((pos, token))

View File

@ -244,7 +244,7 @@ class PDFStream(PDFObject):
except zlib.error as e: except zlib.error as e:
if STRICT: if STRICT:
raise PDFException('Invalid zlib bytes: %r, %r' % (e, data)) raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
data = '' data = b''
elif f in LITERALS_LZW_DECODE: elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data) data = lzwdecode(data)
elif f in LITERALS_ASCII85_DECODE: elif f in LITERALS_ASCII85_DECODE:

View File

@ -111,12 +111,12 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword) PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern KWD = PSKeywordTable.intern
KEYWORD_PROC_BEGIN = KWD('{') KEYWORD_PROC_BEGIN = KWD(b'{')
KEYWORD_PROC_END = KWD('}') KEYWORD_PROC_END = KWD(b'}')
KEYWORD_ARRAY_BEGIN = KWD('[') KEYWORD_ARRAY_BEGIN = KWD(b'[')
KEYWORD_ARRAY_END = KWD(']') KEYWORD_ARRAY_END = KWD(b']')
KEYWORD_DICT_BEGIN = KWD('<<') KEYWORD_DICT_BEGIN = KWD(b'<<')
KEYWORD_DICT_END = KWD('>>') KEYWORD_DICT_END = KWD(b'>>')
def literal_name(x): def literal_name(x):
@ -139,18 +139,18 @@ def keyword_name(x):
## PSBaseParser ## PSBaseParser
## ##
EOL = re.compile(r'[\r\n]') EOL = re.compile(br'[\r\n]')
SPC = re.compile(r'\s') SPC = re.compile(br'\s')
NONSPC = re.compile(r'\S') NONSPC = re.compile(br'\S')
HEX = re.compile(r'[0-9a-fA-F]') HEX = re.compile(br'[0-9a-fA-F]')
END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]') END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]') END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.') HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(r'[^0-9]') END_NUMBER = re.compile(br'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]') END_STRING = re.compile(br'[()\134]')
OCT_STRING = re.compile(r'[0-7]') OCT_STRING = re.compile(br'[0-7]')
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92} ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, b'(': 40, b')': 41, b'\\': 92}
class PSBaseParser(object): class PSBaseParser(object):
@ -196,11 +196,11 @@ class PSBaseParser(object):
self.fp.seek(pos) self.fp.seek(pos)
# reset the status for nextline() # reset the status for nextline()
self.bufpos = pos self.bufpos = pos
self.buf = '' self.buf = b''
self.charpos = 0 self.charpos = 0
# reset the status for nexttoken() # reset the status for nexttoken()
self._parse1 = self._parse_main self._parse1 = self._parse_main
self._curtoken = '' self._curtoken = b''
self._curtokenpos = 0 self._curtokenpos = 0
self._tokens = [] self._tokens = []
return return
@ -219,15 +219,15 @@ class PSBaseParser(object):
def nextline(self): def nextline(self):
"""Fetches a next line that ends either with \\r or \\n. """Fetches a next line that ends either with \\r or \\n.
""" """
linebuf = '' linebuf = b''
linepos = self.bufpos + self.charpos linepos = self.bufpos + self.charpos
eol = False eol = False
while 1: while 1:
self.fillbuf() self.fillbuf()
if eol: if eol:
c = self.buf[self.charpos] c = self.buf[self.charpos]
# handle '\r\n' # handle b'\r\n'
if c == '\n': if c == b'\n':
linebuf += c linebuf += c
self.charpos += 1 self.charpos += 1
break break
@ -235,7 +235,7 @@ class PSBaseParser(object):
if m: if m:
linebuf += self.buf[self.charpos:m.end(0)] linebuf += self.buf[self.charpos:m.end(0)]
self.charpos = m.end(0) self.charpos = m.end(0)
if linebuf[-1] == '\r': if linebuf[-1] == b'\r':
eol = True eol = True
else: else:
break break
@ -253,7 +253,7 @@ class PSBaseParser(object):
""" """
self.fp.seek(0, 2) self.fp.seek(0, 2)
pos = self.fp.tell() pos = self.fp.tell()
buf = '' buf = b''
while 0 < pos: while 0 < pos:
prevpos = pos prevpos = pos
pos = max(0, pos-self.BUFSIZ) pos = max(0, pos-self.BUFSIZ)
@ -262,13 +262,13 @@ class PSBaseParser(object):
if not s: if not s:
break break
while 1: while 1:
n = max(s.rfind('\r'), s.rfind('\n')) n = max(s.rfind(b'\r'), s.rfind(b'\n'))
if n == -1: if n == -1:
buf = s + buf buf = s + buf
break break
yield s[n:]+buf yield s[n:]+buf
s = s[:n] s = s[:n]
buf = '' buf = b''
return return
def _parse_main(self, s, i): def _parse_main(self, s, i):
@ -278,19 +278,19 @@ class PSBaseParser(object):
j = m.start(0) j = m.start(0)
c = s[j] c = s[j]
self._curtokenpos = self.bufpos+j self._curtokenpos = self.bufpos+j
if c == '%': if c == b'%':
self._curtoken = '%' self._curtoken = b'%'
self._parse1 = self._parse_comment self._parse1 = self._parse_comment
return j+1 return j+1
elif c == '/': elif c == b'/':
self._curtoken = '' self._curtoken = b''
self._parse1 = self._parse_literal self._parse1 = self._parse_literal
return j+1 return j+1
elif c in '-+' or c.isdigit(): elif c in b'-+' or c.isdigit():
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_number self._parse1 = self._parse_number
return j+1 return j+1
elif c == '.': elif c == b'.':
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_float self._parse1 = self._parse_float
return j+1 return j+1
@ -298,17 +298,17 @@ class PSBaseParser(object):
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_keyword self._parse1 = self._parse_keyword
return j+1 return j+1
elif c == '(': elif c == b'(':
self._curtoken = '' self._curtoken = b''
self.paren = 1 self.paren = 1
self._parse1 = self._parse_string self._parse1 = self._parse_string
return j+1 return j+1
elif c == '<': elif c == b'<':
self._curtoken = '' self._curtoken = b''
self._parse1 = self._parse_wopen self._parse1 = self._parse_wopen
return j+1 return j+1
elif c == '>': elif c == b'>':
self._curtoken = '' self._curtoken = b''
self._parse1 = self._parse_wclose self._parse1 = self._parse_wclose
return j+1 return j+1
else: else:
@ -339,11 +339,11 @@ class PSBaseParser(object):
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j] c = s[j]
if c == '#': if c == b'#':
self.hex = '' self.hex = b''
self._parse1 = self._parse_literal_hex self._parse1 = self._parse_literal_hex
return j+1 return j+1
self._add_token(LIT(self._curtoken)) self._add_token(LIT(unicode(self._curtoken)))
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j return j
@ -365,7 +365,7 @@ class PSBaseParser(object):
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j] c = s[j]
if c == '.': if c == b'.':
self._curtoken += c self._curtoken += c
self._parse1 = self._parse_float self._parse1 = self._parse_float
return j+1 return j+1
@ -397,9 +397,9 @@ class PSBaseParser(object):
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
if self._curtoken == 'true': if self._curtoken == b'true':
token = True token = True
elif self._curtoken == 'false': elif self._curtoken == b'false':
token = False token = False
else: else:
token = KWD(self._curtoken) token = KWD(self._curtoken)
@ -415,20 +415,20 @@ class PSBaseParser(object):
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j] c = s[j]
if c == '\\': if c == b'\\':
self.oct = '' self.oct = b''
self._parse1 = self._parse_string_1 self._parse1 = self._parse_string_1
return j+1 return j+1
if c == '(': if c == b'(':
self.paren += 1 self.paren += 1
self._curtoken += c self._curtoken += c
return j+1 return j+1
if c == ')': if c == b')':
self.paren -= 1 self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment. if self.paren: # WTF, they said balanced parens need no special treatment.
self._curtoken += c self._curtoken += c
return j+1 return j+1
self._add_token(self._curtoken) self._add_token(str(self._curtoken))
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j+1 return j+1
@ -448,7 +448,7 @@ class PSBaseParser(object):
def _parse_wopen(self, s, i): def _parse_wopen(self, s, i):
c = s[i] c = s[i]
if c == '<': if c == b'<':
self._add_token(KEYWORD_DICT_BEGIN) self._add_token(KEYWORD_DICT_BEGIN)
self._parse1 = self._parse_main self._parse1 = self._parse_main
i += 1 i += 1
@ -458,7 +458,7 @@ class PSBaseParser(object):
def _parse_wclose(self, s, i): def _parse_wclose(self, s, i):
c = s[i] c = s[i]
if c == '>': if c == b'>':
self._add_token(KEYWORD_DICT_END) self._add_token(KEYWORD_DICT_END)
i += 1 i += 1
self._parse1 = self._parse_main self._parse1 = self._parse_main
@ -472,7 +472,7 @@ class PSBaseParser(object):
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
SPC.sub('', self._curtoken)) SPC.sub(b'', self._curtoken))
self._add_token(token) self._add_token(token)
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j return j
@ -616,7 +616,7 @@ import unittest
## ##
class TestPSBaseParser(unittest.TestCase): class TestPSBaseParser(unittest.TestCase):
TESTDATA = r'''%!PS TESTDATA = br'''%!PS
begin end begin end
" @ # " @ #
/a/BCD /Some_Name /foo#5f#xbaa /a/BCD /Some_Name /foo#5f#xbaa
@ -637,18 +637,18 @@ func/a/b{(c)do*}def
''' '''
TOKENS = [ TOKENS = [
(5, KWD('begin')), (11, KWD('end')), (16, KWD('"')), (19, KWD('@')), (5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
(21, KWD('#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'), (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'), (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'), (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'), (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
(226, KWD('func')), (230, LIT('a')), (232, LIT('b')), (226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
(234, KWD('{')), (235, 'c'), (238, KWD('do*')), (241, KWD('}')), (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')), (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'), (256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
(272, KWD('>>')) (272, KWD(b'>>'))
] ]
OBJS = [ OBJS = [

View File

@ -898,7 +898,7 @@ def rijndaelEncrypt(rk, nrounds, plaintext):
Te3[(t2 ) & 0xff] ^ Te3[(t2 ) & 0xff] ^
rk[p+3]) rk[p+3])
ciphertext = '' ciphertext = b''
# apply last round and # apply last round and
# map cipher state to byte array block: # map cipher state to byte array block:
@ -1001,7 +1001,7 @@ def rijndaelDecrypt(rk, nrounds, ciphertext):
Td3[(t0 ) & 0xff] ^ Td3[(t0 ) & 0xff] ^
rk[p+3]) rk[p+3])
plaintext = '' plaintext = b''
# apply last round and # apply last round and
# map cipher state to byte array block: # map cipher state to byte array block:
@ -1042,8 +1042,8 @@ def rijndaelDecrypt(rk, nrounds, ciphertext):
class RijndaelDecryptor(object): class RijndaelDecryptor(object):
""" """
>>> key = '00010203050607080a0b0c0d0f101112'.decode('hex') >>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex')
>>> ciphertext = 'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex') >>> ciphertext = b'd8f532538289ef7d06b506a4fd5be9c9'.decode('hex')
>>> RijndaelDecryptor(key, 128).decrypt(ciphertext).encode('hex') >>> RijndaelDecryptor(key, 128).decrypt(ciphertext).encode('hex')
'506812a45f08c889b97f5980038b8359' '506812a45f08c889b97f5980038b8359'
""" """
@ -1064,8 +1064,8 @@ class RijndaelDecryptor(object):
class RijndaelEncryptor(object): class RijndaelEncryptor(object):
""" """
>>> key = '00010203050607080a0b0c0d0f101112'.decode('hex') >>> key = b'00010203050607080a0b0c0d0f101112'.decode('hex')
>>> plaintext = '506812a45f08c889b97f5980038b8359'.decode('hex') >>> plaintext = b'506812a45f08c889b97f5980038b8359'.decode('hex')
>>> RijndaelEncryptor(key, 128).encrypt(plaintext).encode('hex') >>> RijndaelEncryptor(key, 128).encrypt(plaintext).encode('hex')
'd8f532538289ef7d06b506a4fd5be9c9' 'd8f532538289ef7d06b506a4fd5be9c9'
""" """

View File

@ -19,28 +19,28 @@ def rldecode(data):
129 to 255, the following single byte is to be copied 257 - length 129 to 255, the following single byte is to be copied 257 - length
(2 to 128) times during decompression. A length value of 128 (2 to 128) times during decompression. A length value of 128
denotes EOD. denotes EOD.
>>> s = "\x05123456\xfa7\x04abcde\x80junk" >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
>>> rldecode(s) >>> rldecode(s)
'1234567777777abcde' '1234567777777abcde'
""" """
decoded = [] decoded = []
i = 0 i = 0
while i < len(data): while i < len(data):
#print "data[%d]=:%d:" % (i,ord(data[i])) #print 'data[%d]=:%d:' % (i,ord(data[i]))
length = ord(data[i]) length = ord(data[i])
if length == 128: if length == 128:
break break
if length >= 0 and length < 128: if length >= 0 and length < 128:
run = data[i+1:(i+1)+(length+1)] run = data[i+1:(i+1)+(length+1)]
#print "length=%d, run=%s" % (length+1,run) #print 'length=%d, run=%s' % (length+1,run)
decoded.append(run) decoded.append(run)
i = (i+1) + (length+1) i = (i+1) + (length+1)
if length > 128: if length > 128:
run = data[i+1]*(257-length) run = data[i+1]*(257-length)
#print "length=%d, run=%s" % (257-length,run) #print 'length=%d, run=%s' % (257-length,run)
decoded.append(run) decoded.append(run)
i = (i+1) + 1 i = (i+1) + 1
return ''.join(decoded) return b''.join(decoded)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -14,28 +14,28 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
raise ValueError(bitspercomponent) raise ValueError(bitspercomponent)
nbytes = colors*columns*bitspercomponent//8 nbytes = colors*columns*bitspercomponent//8
i = 0 i = 0
buf = '' buf = b''
line0 = '\x00' * columns line0 = b'\x00' * columns
for i in xrange(0, len(data), nbytes+1): for i in xrange(0, len(data), nbytes+1):
ft = data[i] ft = data[i]
i += 1 i += 1
line1 = data[i:i+nbytes] line1 = data[i:i+nbytes]
line2 = '' line2 = b''
if ft == '\x00': if ft == b'\x00':
# PNG none # PNG none
line2 += line1 line2 += line1
elif ft == '\x01': elif ft == b'\x01':
# PNG sub (UNTESTED) # PNG sub (UNTESTED)
c = 0 c = 0
for b in line1: for b in line1:
c = (c+ord(b)) & 255 c = (c+ord(b)) & 255
line2 += chr(c) line2 += chr(c)
elif ft == '\x02': elif ft == b'\x02':
# PNG up # PNG up
for (a, b) in zip(line0, line1): for (a, b) in zip(line0, line1):
c = (ord(a)+ord(b)) & 255 c = (ord(a)+ord(b)) & 255
line2 += chr(c) line2 += chr(c)
elif ft == '\x03': elif ft == b'\x03':
# PNG average (UNTESTED) # PNG average (UNTESTED)
c = 0 c = 0
for (a, b) in zip(line0, line1): for (a, b) in zip(line0, line1):
@ -176,7 +176,7 @@ def nunpack(s, default=0):
elif l == 2: elif l == 2:
return struct.unpack('>H', s)[0] return struct.unpack('>H', s)[0]
elif l == 3: elif l == 3:
return struct.unpack('>L', '\x00'+s)[0] return struct.unpack('>L', b'\x00'+s)[0]
elif l == 4: elif l == 4:
return struct.unpack('>L', s)[0] return struct.unpack('>L', s)[0]
else: else:
@ -222,7 +222,7 @@ PDFDocEncoding = ''.join(unichr(x) for x in (
def decode_text(s): def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode.""" """Decodes a PDFDocEncoding string to Unicode."""
if s.startswith('\xfe\xff'): if s.startswith(b'\xfe\xff'):
return unicode(s[2:], 'utf-16be', 'ignore') return unicode(s[2:], 'utf-16be', 'ignore')
else: else:
return ''.join(PDFDocEncoding[ord(c)] for c in s) return ''.join(PDFDocEncoding[ord(c)] for c in s)