#!/usr/bin/env python # # pycdb.py - Python implementation of cdb and tcdb # # by Yusuke Shinyama # * public domain * # import sys import os from struct import pack from struct import unpack from array import array # calc hash value with a given key def cdbhash(s, n=5381L): return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n) if pack('=i',1) == pack('>i',1): # big endian def decode(x): a = array('I', x) a.byteswap() return a def encode(a): a.byteswap() return a.tostring() else: # little endian def decode(x): a = array('I', x) return a def encode(a): return a.tostring() ## CDB ## # cdbiter def cdbiter(fp, eod): kloc = 2048 while kloc < eod: fp.seek(kloc) (klen, vlen) = unpack('' % self.name def __getstate__(self): raise TypeError def __setstate__(self, dict): raise TypeError def __getitem__(self, k): k = str(k) if k in self._cache: return self._cache[k] h = cdbhash(k) h1 = h & 0xff (pos_bucket, ncells) = self._hash0[h1] if ncells == 0: raise KeyError(k) hs = self._hash1[h1] if hs == None: self._fp.seek(pos_bucket) hs = decode(self._fp.read(ncells * 8)) self._hash1[h1] = hs i = ((h >> 8) % ncells) * 2 n = ncells*2 for _ in xrange(ncells): p1 = hs[i+1] if p1 == 0: raise KeyError(k) if hs[i] == h: self._fp.seek(p1) (klen, vlen) = unpack('' % (self.fn, self.fntmp, self.numentries) def __len__(self): return self.numentries def __getstate__(self): raise TypeError def __setstate__(self, dict): raise TypeError def add(self, k, v): (k, v) = (str(k), str(v)) (klen, vlen) = (len(k), len(v)) self._fp.seek(self._pos) self._fp.write(pack('> 8) % blen)*2 while a[i+1]: # is cell[i] already occupied? i = (i+2) % len(a) a[i] = h a[i+1] = p self._fp.write(encode(a)) # write header self._fp.seek(0) a = array('I') for b1 in self._bucket: a.append(pos_hash) a.append(len(b1)) pos_hash += len(b1)*8 self._fp.write(encode(a)) # close self._fp.close() os.rename(self.fntmp, self.fn) return # txt2cdb def txt2cdb(self, lines): import re HEAD = re.compile(r'^\+(\d+),(\d+):') for line in lines: m = HEAD.match(line) if not m: break (klen, vlen) = (int(m.group(1)), int(m.group(2))) i = len(m.group(0)) k = line[i:i+klen] i += klen if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) i += 2 v = line[i:i+vlen] self.add(k, v) return self # cdbdump def cdbdump(cdbname): fp = file(cdbname, 'rb') (eor,) = unpack('> 8) % ncells for i in xrange(ncells): self._fp.seek(pos_bucket + ((start+i) % ncells << 3)) (h1, p1) = unpack('%s' % (len(k), len(v), k, v)) for (k, v) in opts: if k == '-k': f = (lambda k,_: k) elif k == '-v': f = (lambda _,v: v) elif k == '-2': f = (lambda k,v: k+'\t'+v) for (k,v) in cdbdump(dbname): print f(k,v) print elif cmd == 'cmerge': dbs = [ cdbdump(fname) for fname in args ] m = CDBMaker(dbname, dbname+'.tmp') for (k,vs) in tcdbmerge(dbs): m.add(k, ' '.join(vs)) m.finish() # tcdb elif cmd == 'tmake': TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish() elif cmd == 'tget': print repr(TCDBReader(dbname).lookup(args)) elif cmd == 'tdump': f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v)) for (k, v) in opts: if k == '-k': f = (lambda k,_: '/'.join(k)) elif k == '-v': f = (lambda _,v: v) elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v) for (k,v) in tcdbdump(dbname): print f(k,v) print elif cmd == 'tmerge': dbs = [ tcdbdump(fname) for fname in args ] m = TCDBMaker(dbname, dbname+'.tmp') for (k,vs) in tcdbmerge(dbs): m.put(len(k), k[-1], ' '.join(vs)) m.finish() else: return usage() return if __name__ == '__main__': sys.exit(main(sys.argv))