2007-12-31 04:40:27 +00:00
|
|
|
#!/usr/bin/env python
|
2009-11-04 10:35:16 +00:00
|
|
|
|
|
|
|
""" pycdb - Python implementation of cdb and tcdb
|
|
|
|
|
|
|
|
This code is in the public domain.
|
|
|
|
|
|
|
|
cdb is a static data structure that was developed by
|
|
|
|
D. J. Bernstein. It is suitable for storing a large number
|
|
|
|
of items in a disk with efficient seeking.
|
|
|
|
|
|
|
|
tcdb is an extention of cdb that support hierarchical lookup.
|
|
|
|
|
|
|
|
"""
|
2007-12-31 04:40:27 +00:00
|
|
|
|
2009-11-03 01:27:30 +00:00
|
|
|
import sys
|
|
|
|
import os
|
2009-11-03 13:39:34 +00:00
|
|
|
from struct import pack, unpack
|
2007-12-31 04:40:27 +00:00
|
|
|
from array import array
|
|
|
|
|
|
|
|
|
|
|
|
# calc hash value with a given key
|
|
|
|
def cdbhash(s, n=5381L):
|
2009-10-24 04:41:59 +00:00
|
|
|
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
if pack('=i',1) == pack('>i',1):
|
2009-10-24 04:41:59 +00:00
|
|
|
# big endian
|
|
|
|
def decode(x):
|
|
|
|
a = array('I', x)
|
|
|
|
a.byteswap()
|
|
|
|
return a
|
|
|
|
def encode(a):
|
|
|
|
a.byteswap()
|
|
|
|
return a.tostring()
|
2007-12-31 04:40:27 +00:00
|
|
|
else:
|
2009-10-24 04:41:59 +00:00
|
|
|
# little endian
|
|
|
|
def decode(x):
|
|
|
|
a = array('I', x)
|
|
|
|
return a
|
|
|
|
def encode(a):
|
|
|
|
return a.tostring()
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
## CDB
|
|
|
|
##
|
|
|
|
|
|
|
|
# cdbiter
|
|
|
|
def cdbiter(fp, eod):
|
2009-10-24 04:41:59 +00:00
|
|
|
kloc = 2048
|
|
|
|
while kloc < eod:
|
|
|
|
fp.seek(kloc)
|
|
|
|
(klen, vlen) = unpack('<II', fp.read(8))
|
|
|
|
k = fp.read(klen)
|
|
|
|
v = fp.read(vlen)
|
|
|
|
kloc += 8+klen+vlen
|
|
|
|
yield (k,v)
|
|
|
|
fp.close()
|
|
|
|
return
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# CDBReader
|
2008-07-09 15:15:32 +00:00
|
|
|
class CDBReader(object):
|
2007-12-31 04:40:27 +00:00
|
|
|
|
2009-10-24 04:41:59 +00:00
|
|
|
def __init__(self, cdbname, docache=1):
|
|
|
|
self.name = cdbname
|
|
|
|
self._fp = file(cdbname, 'rb')
|
|
|
|
hash0 = decode(self._fp.read(2048))
|
|
|
|
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
|
|
|
|
self._hash1 = [ None ] * 256
|
|
|
|
self._eod = hash0[0]
|
|
|
|
self._docache = docache
|
|
|
|
self._cache = {}
|
|
|
|
self._keyiter = None
|
|
|
|
self._eachiter = None
|
|
|
|
return
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
return '<CDBReader: %r>' % self.name
|
|
|
|
|
|
|
|
def __getstate__(self):
|
|
|
|
raise TypeError
|
|
|
|
|
|
|
|
def __setstate__(self, dict):
|
|
|
|
raise TypeError
|
|
|
|
|
|
|
|
def __getitem__(self, k):
|
|
|
|
k = str(k)
|
|
|
|
if k in self._cache: return self._cache[k]
|
|
|
|
h = cdbhash(k)
|
|
|
|
h1 = h & 0xff
|
|
|
|
(pos_bucket, ncells) = self._hash0[h1]
|
|
|
|
if ncells == 0: raise KeyError(k)
|
|
|
|
hs = self._hash1[h1]
|
2009-11-06 15:10:29 +00:00
|
|
|
if hs is None:
|
2009-10-24 04:41:59 +00:00
|
|
|
self._fp.seek(pos_bucket)
|
|
|
|
hs = decode(self._fp.read(ncells * 8))
|
|
|
|
self._hash1[h1] = hs
|
|
|
|
i = ((h >> 8) % ncells) * 2
|
|
|
|
n = ncells*2
|
|
|
|
for _ in xrange(ncells):
|
|
|
|
p1 = hs[i+1]
|
|
|
|
if p1 == 0: raise KeyError(k)
|
|
|
|
if hs[i] == h:
|
|
|
|
self._fp.seek(p1)
|
|
|
|
(klen, vlen) = unpack('<II', self._fp.read(8))
|
|
|
|
k1 = self._fp.read(klen)
|
|
|
|
if k1 == k:
|
|
|
|
v1 = self._fp.read(vlen)
|
|
|
|
if self._docache:
|
|
|
|
self._cache[k] = v1
|
|
|
|
return v1
|
|
|
|
i = (i+2) % n
|
|
|
|
raise KeyError(k)
|
|
|
|
|
|
|
|
def get(self, k, failed=None):
|
|
|
|
try:
|
|
|
|
return self.__getitem__(k)
|
|
|
|
except KeyError:
|
|
|
|
return failed
|
|
|
|
|
|
|
|
def has_key(self, k):
|
|
|
|
try:
|
|
|
|
self.__getitem__(k)
|
|
|
|
return True
|
|
|
|
except KeyError:
|
|
|
|
return False
|
|
|
|
|
|
|
|
def __contains__(self, k):
|
|
|
|
return self.has_key(k)
|
|
|
|
|
|
|
|
def firstkey(self):
|
|
|
|
self._keyiter = None
|
|
|
|
return self.nextkey()
|
|
|
|
|
|
|
|
def nextkey(self):
|
|
|
|
if not self._keyiter:
|
|
|
|
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
|
|
|
try:
|
|
|
|
return self._keyiter.next()
|
|
|
|
except StopIteration:
|
|
|
|
return None
|
|
|
|
|
|
|
|
def each(self):
|
|
|
|
if not self._eachiter:
|
|
|
|
self._eachiter = cdbiter(self._fp, self._eod)
|
|
|
|
try:
|
|
|
|
return self._eachiter.next()
|
|
|
|
except StopIteration:
|
|
|
|
return None
|
|
|
|
|
|
|
|
def iterkeys(self):
|
|
|
|
return ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
|
|
|
def itervalues(self):
|
|
|
|
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
|
|
|
|
def iteritems(self):
|
|
|
|
return cdbiter(self._fp, self._eod)
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# CDBMaker
|
2008-07-09 15:15:32 +00:00
|
|
|
class CDBMaker(object):
|
2007-12-31 04:40:27 +00:00
|
|
|
|
2009-10-24 04:41:59 +00:00
|
|
|
def __init__(self, cdbname, tmpname):
|
|
|
|
self.fn = cdbname
|
|
|
|
self.fntmp = tmpname
|
|
|
|
self.numentries = 0
|
|
|
|
self._fp = file(tmpname, 'wb')
|
|
|
|
self._pos = 2048 # sizeof((h,p))*256
|
|
|
|
self._bucket = [ array('I') for _ in xrange(256) ]
|
|
|
|
return
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
|
|
|
|
|
|
|
|
def __len__(self):
|
|
|
|
return self.numentries
|
|
|
|
|
|
|
|
def __getstate__(self):
|
|
|
|
raise TypeError
|
|
|
|
|
|
|
|
def __setstate__(self, dict):
|
|
|
|
raise TypeError
|
|
|
|
|
|
|
|
def add(self, k, v):
|
|
|
|
(k, v) = (str(k), str(v))
|
|
|
|
(klen, vlen) = (len(k), len(v))
|
|
|
|
self._fp.seek(self._pos)
|
|
|
|
self._fp.write(pack('<II', klen, vlen))
|
|
|
|
self._fp.write(k)
|
|
|
|
self._fp.write(v)
|
|
|
|
h = cdbhash(k)
|
|
|
|
b = self._bucket[h % 256]
|
|
|
|
b.append(h)
|
|
|
|
b.append(self._pos)
|
|
|
|
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
|
|
|
self._pos += 8+klen+vlen
|
|
|
|
self.numentries += 1
|
|
|
|
return self
|
|
|
|
|
|
|
|
def finish(self):
|
|
|
|
self._fp.seek(self._pos)
|
|
|
|
pos_hash = self._pos
|
|
|
|
# write hashes
|
|
|
|
for b1 in self._bucket:
|
|
|
|
if not b1: continue
|
|
|
|
blen = len(b1)
|
|
|
|
a = array('I', [0]*blen*2)
|
|
|
|
for j in xrange(0, blen, 2):
|
|
|
|
(h,p) = (b1[j],b1[j+1])
|
|
|
|
i = ((h >> 8) % blen)*2
|
|
|
|
while a[i+1]: # is cell[i] already occupied?
|
|
|
|
i = (i+2) % len(a)
|
|
|
|
a[i] = h
|
|
|
|
a[i+1] = p
|
|
|
|
self._fp.write(encode(a))
|
|
|
|
# write header
|
|
|
|
self._fp.seek(0)
|
|
|
|
a = array('I')
|
|
|
|
for b1 in self._bucket:
|
|
|
|
a.append(pos_hash)
|
|
|
|
a.append(len(b1))
|
|
|
|
pos_hash += len(b1)*8
|
|
|
|
self._fp.write(encode(a))
|
|
|
|
# close
|
|
|
|
self._fp.close()
|
|
|
|
os.rename(self.fntmp, self.fn)
|
|
|
|
return
|
|
|
|
|
|
|
|
# txt2cdb
|
|
|
|
def txt2cdb(self, lines):
|
|
|
|
import re
|
|
|
|
HEAD = re.compile(r'^\+(\d+),(\d+):')
|
|
|
|
for line in lines:
|
|
|
|
m = HEAD.match(line)
|
|
|
|
if not m: break
|
|
|
|
(klen, vlen) = (int(m.group(1)), int(m.group(2)))
|
|
|
|
i = len(m.group(0))
|
|
|
|
k = line[i:i+klen]
|
|
|
|
i += klen
|
|
|
|
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
|
|
|
i += 2
|
|
|
|
v = line[i:i+vlen]
|
|
|
|
self.add(k, v)
|
|
|
|
return self
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# cdbdump
|
|
|
|
def cdbdump(cdbname):
|
2009-10-24 04:41:59 +00:00
|
|
|
fp = file(cdbname, 'rb')
|
|
|
|
(eor,) = unpack('<I', fp.read(4))
|
|
|
|
return cdbiter(fp, eor)
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# cdbmerge
|
|
|
|
def cdbmerge(iters):
|
2009-10-24 04:41:59 +00:00
|
|
|
q = []
|
|
|
|
for it in iters:
|
|
|
|
try:
|
|
|
|
q.append((it.next(),it))
|
|
|
|
except StopIteration:
|
|
|
|
pass
|
|
|
|
k0 = None
|
|
|
|
vs = None
|
|
|
|
while q:
|
|
|
|
q.sort()
|
|
|
|
((k,v),it) = q.pop(0)
|
|
|
|
if k0 != k:
|
|
|
|
if vs: yield (k0,vs)
|
|
|
|
vs = []
|
|
|
|
vs.append(v)
|
|
|
|
k0 = k
|
|
|
|
try:
|
|
|
|
q.append((it.next(),it))
|
|
|
|
except StopIteration:
|
|
|
|
continue
|
|
|
|
if vs: yield (k0,vs)
|
|
|
|
return
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# aliases
|
|
|
|
cdbmake = CDBMaker
|
|
|
|
init = CDBReader
|
|
|
|
|
|
|
|
|
|
|
|
## TCDB
|
|
|
|
##
|
|
|
|
|
|
|
|
# tcdbiter
|
|
|
|
def tcdbiter(fp, eor):
|
2009-10-24 04:41:59 +00:00
|
|
|
locs = {}
|
|
|
|
fp.seek(eor)
|
|
|
|
while 1:
|
|
|
|
x = fp.read(8)
|
|
|
|
if not x: break
|
|
|
|
(h, pos) = unpack('<II', x)
|
|
|
|
if pos: locs[pos] = h
|
|
|
|
pos = 2048
|
|
|
|
fp.seek(pos)
|
|
|
|
key = ()
|
|
|
|
parents = [0]
|
|
|
|
while pos < eor:
|
|
|
|
(klen, vlen) = unpack('<II', fp.read(8))
|
|
|
|
k = fp.read(klen)
|
|
|
|
v = fp.read(vlen)
|
|
|
|
h = locs[pos]
|
|
|
|
for (i,p) in enumerate(parents):
|
|
|
|
if cdbhash(k, p+5381L) == h:
|
|
|
|
parents = parents[:i+1]
|
|
|
|
key = key[:i]
|
|
|
|
break
|
|
|
|
key += (k,)
|
|
|
|
yield (key, v)
|
|
|
|
parents.append(pos)
|
|
|
|
pos += 8+klen+vlen
|
|
|
|
fp.close()
|
|
|
|
return
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# TCDBMaker
|
|
|
|
class TCDBMaker(CDBMaker):
|
|
|
|
|
2009-10-24 04:41:59 +00:00
|
|
|
def __init__(self, cdbname, tmpname):
|
|
|
|
CDBMaker.__init__(self, cdbname, tmpname)
|
|
|
|
self._parent = 0
|
|
|
|
self._stack = [self._parent]
|
|
|
|
return
|
|
|
|
|
|
|
|
def put(self, depth, k, v):
|
|
|
|
if depth == len(self._stack)+1:
|
|
|
|
self._stack.append(self._parent)
|
|
|
|
elif depth < len(self._stack):
|
|
|
|
self._stack = self._stack[:depth]
|
|
|
|
elif depth != len(self._stack):
|
|
|
|
raise ValueError('invalid depth: %d' % depth)
|
|
|
|
#
|
|
|
|
(k, v) = (str(k), str(v))
|
|
|
|
(klen, vlen) = (len(k), len(v))
|
|
|
|
self._parent = self._pos
|
|
|
|
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
|
|
|
self._fp.seek(self._pos)
|
|
|
|
self._fp.write(pack('<II', klen, vlen))
|
|
|
|
self._fp.write(k)
|
|
|
|
self._fp.write(v)
|
|
|
|
self._pos += 4+4+klen+vlen
|
|
|
|
h = cdbhash(k, self._stack[-1]+5381L)
|
|
|
|
b = self._bucket[h % 256]
|
|
|
|
b.append(h)
|
|
|
|
b.append(self._parent)
|
|
|
|
self.numentries += 1
|
|
|
|
return self
|
|
|
|
|
|
|
|
def txt2tcdb(self, lines):
|
|
|
|
import re
|
|
|
|
HEAD = re.compile(r'^(\++)(\d+),(\d+):')
|
|
|
|
for line in lines:
|
|
|
|
m = HEAD.match(line)
|
|
|
|
if not m: break
|
|
|
|
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
|
|
|
|
i = len(m.group(0))
|
|
|
|
k = line[i:i+klen]
|
|
|
|
i += klen
|
|
|
|
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
|
|
|
i += 2
|
|
|
|
v = line[i:i+vlen]
|
|
|
|
self.put(depth, k, v)
|
|
|
|
return self
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# TCDBReader
|
|
|
|
class TCDBReader(CDBReader):
|
|
|
|
|
2009-10-24 04:41:59 +00:00
|
|
|
def lookup(self, seq, parent=0L):
|
|
|
|
r = []
|
|
|
|
for k in seq:
|
|
|
|
(v, parent) = self.lookup1(k, parent)
|
|
|
|
r.append(v)
|
|
|
|
return r
|
|
|
|
|
|
|
|
def lookup1(self, k, parent=0L):
|
|
|
|
k = str(k)
|
|
|
|
if self._docache and (parent,k) in self._cache:
|
|
|
|
return self._cache[(parent,k)]
|
|
|
|
h = cdbhash(k, parent+5381L)
|
|
|
|
self._fp.seek((h % 256) << 3)
|
|
|
|
(pos_bucket, ncells) = unpack('<II', self._fp.read(8))
|
|
|
|
if ncells == 0: raise KeyError(k)
|
|
|
|
start = (h >> 8) % ncells
|
|
|
|
for i in xrange(ncells):
|
|
|
|
self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
|
|
|
|
(h1, p1) = unpack('<II', self._fp.read(8))
|
|
|
|
if p1 == 0: raise KeyError(k)
|
|
|
|
if h1 == h:
|
|
|
|
self._fp.seek(p1)
|
|
|
|
(klen, vlen) = unpack('<II', self._fp.read(8))
|
|
|
|
k1 = self._fp.read(klen)
|
|
|
|
if k1 == k:
|
|
|
|
v1 = self._fp.read(vlen)
|
|
|
|
if self._docache:
|
|
|
|
self._cache[(parent,k)] = (v1,p1)
|
|
|
|
return (v1,p1)
|
|
|
|
raise KeyError(k)
|
|
|
|
|
|
|
|
def iterkeys(self):
|
|
|
|
return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
|
|
|
|
def itervalues(self):
|
|
|
|
return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
|
|
|
|
def iteritems(self):
|
|
|
|
return tcdbiter(self._fp, self._eod)
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# tcdbdump
|
|
|
|
def tcdbdump(cdbname):
|
2009-10-24 04:41:59 +00:00
|
|
|
fp = file(cdbname, 'rb')
|
|
|
|
(eor,) = unpack('<I', fp.read(4))
|
|
|
|
return tcdbiter(fp, eor)
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
# aliases
|
|
|
|
tcdbmake = TCDBMaker
|
|
|
|
tcdbinit = TCDBReader
|
|
|
|
tcdbmerge = cdbmerge
|
|
|
|
|
|
|
|
|
|
|
|
# main
|
|
|
|
def main(argv):
|
2009-10-24 04:41:59 +00:00
|
|
|
import getopt, fileinput
|
|
|
|
def usage():
|
|
|
|
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
|
|
|
|
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
|
|
|
|
return 100
|
|
|
|
args = argv[1:]
|
|
|
|
if not args: return usage()
|
|
|
|
cmd = args.pop(0)
|
|
|
|
try:
|
|
|
|
(opts, args) = getopt.getopt(args, 'kv2')
|
|
|
|
except getopt.GetoptError:
|
|
|
|
return usage()
|
|
|
|
if not args: return usage()
|
|
|
|
dbname = args.pop(0)
|
|
|
|
|
|
|
|
# cdb
|
|
|
|
if cmd == 'cmake':
|
|
|
|
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
|
|
|
|
elif cmd == 'cget':
|
|
|
|
print repr(CDBReader(dbname).get(args[0]))
|
|
|
|
elif cmd == 'cdump':
|
|
|
|
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
|
|
|
|
for (k, v) in opts:
|
|
|
|
if k == '-k': f = (lambda k,_: k)
|
|
|
|
elif k == '-v': f = (lambda _,v: v)
|
|
|
|
elif k == '-2': f = (lambda k,v: k+'\t'+v)
|
|
|
|
for (k,v) in cdbdump(dbname):
|
|
|
|
print f(k,v)
|
|
|
|
print
|
|
|
|
elif cmd == 'cmerge':
|
|
|
|
dbs = [ cdbdump(fname) for fname in args ]
|
|
|
|
m = CDBMaker(dbname, dbname+'.tmp')
|
|
|
|
for (k,vs) in tcdbmerge(dbs):
|
|
|
|
m.add(k, ' '.join(vs))
|
|
|
|
m.finish()
|
|
|
|
# tcdb
|
|
|
|
elif cmd == 'tmake':
|
|
|
|
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
|
|
|
|
elif cmd == 'tget':
|
|
|
|
print repr(TCDBReader(dbname).lookup(args))
|
|
|
|
elif cmd == 'tdump':
|
|
|
|
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
|
|
|
|
for (k, v) in opts:
|
|
|
|
if k == '-k': f = (lambda k,_: '/'.join(k))
|
|
|
|
elif k == '-v': f = (lambda _,v: v)
|
|
|
|
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
|
|
|
|
for (k,v) in tcdbdump(dbname):
|
|
|
|
print f(k,v)
|
|
|
|
print
|
|
|
|
elif cmd == 'tmerge':
|
|
|
|
dbs = [ tcdbdump(fname) for fname in args ]
|
|
|
|
m = TCDBMaker(dbname, dbname+'.tmp')
|
|
|
|
for (k,vs) in tcdbmerge(dbs):
|
|
|
|
m.put(len(k), k[-1], ' '.join(vs))
|
|
|
|
m.finish()
|
|
|
|
|
|
|
|
else:
|
|
|
|
return usage()
|
|
|
|
return
|
2007-12-31 04:40:27 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|