diff --git a/README b/README deleted file mode 100644 index 3d807fe..0000000 --- a/README +++ /dev/null @@ -1,42 +0,0 @@ - -Installation: - - 1. Get http://www.unixuser.org/~euske/pub/CMap.tar.bz2 - 2. $ tar jxf CMap.tar.bz2 - 3. $ make cdbcmap - - -Dump the contents: - - $ ./dumppdf.py foo.pdf - -Extract the text: - - $ ./pdf2txt.py foo.pdf > foo.xml - - -Terms and conditions: - - Copyright (c) 2004-2008 Yusuke Shinyama - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, - copy, modify, merge, publish, distribute, sublicense, and/or - sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following - conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY - KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE - WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR - PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/README.html b/README.html new file mode 100644 index 0000000..e53bfa6 --- /dev/null +++ b/README.html @@ -0,0 +1,88 @@ + + +PDFMiner + + + +

PDFMiner

+ +

+PDFMiner is a suite of programs that help +extract or analyze text data from PDF documents. + +

+Homepage:
+ +http://www.unixuser.org/~euske/python/pdfminer/index.html + + +

+Download:
+ +http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz + + +

+Svn repository:
+ +http://pdfminerr.googlecode.com/svn/ + + +


+

Installation

+ +
    +
  1. Get + +http://www.unixuser.org/~euske/pub/CMap.tar.bz2 + +
  2. $ tar jxf CMap.tar.bz2 +
  3. $ make cdbcmap +
+ +
+

Usage

+ +

+Dump the contents: +

+$ ./dumppdf.py foo.pdf
+
+ +

+Extract the text: +

+$ ./pdf2txt.py foo.pdf > foo.xml
+
+ +
+

Terms and conditions

+

+ +Copyright (c) 2004-2008 Yusuke Shinyama <yusuke at cs dot nyu dot edu> +

+Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: +

+The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. +

+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +


+
Yusuke Shinyama
+ diff --git a/pycdb.py b/pycdb.py new file mode 100755 index 0000000..e1a4944 --- /dev/null +++ b/pycdb.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python +# +# pycdb.py - Python implementation of cdb and tcdb +# +# by Yusuke Shinyama +# * public domain * +# + +import sys, os +from struct import pack, unpack +from array import array + + +# calc hash value with a given key +def cdbhash(s, n=5381L): + return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n) + +if pack('=i',1) == pack('>i',1): + # big endian + def decode(x): + a = array('I', x) + a.byteswap() + return a + def encode(a): + a.byteswap() + return a.tostring() +else: + # little endian + def decode(x): + a = array('I', x) + return a + def encode(a): + return a.tostring() + + +## CDB +## + +# cdbiter +def cdbiter(fp, eod): + kloc = 2048 + while kloc < eod: + fp.seek(kloc) + (klen, vlen) = unpack('' % self.name + + def __getstate__(self): + raise TypeError + + def __setstate__(self, dict): + raise TypeError + + def __getitem__(self, k): + k = str(k) + if k in self._cache: return self._cache[k] + h = cdbhash(k) + h1 = h & 0xff + (pos_bucket, ncells) = self._hash0[h1] + if ncells == 0: raise KeyError(k) + hs = self._hash1[h1] + if hs == None: + self._fp.seek(pos_bucket) + hs = decode(self._fp.read(ncells * 8)) + self._hash1[h1] = hs + i = ((h >> 8) % ncells) * 2 + n = ncells*2 + for _ in xrange(ncells): + p1 = hs[i+1] + if p1 == 0: raise KeyError(k) + if hs[i] == h: + self._fp.seek(p1) + (klen, vlen) = unpack('' % (self.fn, self.fntmp, self.numentries) + + def __len__(self): + return self.numentries + + def __getstate__(self): + raise TypeError + + def __setstate__(self, dict): + raise TypeError + + def add(self, k, v): + (k, v) = (str(k), str(v)) + (klen, vlen) = (len(k), len(v)) + self._fp.seek(self._pos) + self._fp.write(pack('> 8) % blen)*2 + while a[i+1]: # is cell[i] already occupied? 
+ i = (i+2) % len(a) + a[i] = h + a[i+1] = p + self._fp.write(encode(a)) + # write header + self._fp.seek(0) + a = array('I') + for b1 in self._bucket: + a.append(pos_hash) + a.append(len(b1)) + pos_hash += len(b1)*8 + self._fp.write(encode(a)) + # close + self._fp.close() + os.rename(self.fntmp, self.fn) + return + + # txt2cdb + def txt2cdb(self, lines): + import re + HEAD = re.compile(r'^\+(\d+),(\d+):') + for line in lines: + m = HEAD.match(line) + if not m: break + (klen, vlen) = (int(m.group(1)), int(m.group(2))) + i = len(m.group(0)) + k = line[i:i+klen] + i += klen + if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) + i += 2 + v = line[i:i+vlen] + self.add(k, v) + return self + + +# cdbdump +def cdbdump(cdbname): + fp = file(cdbname, 'rb') + (eor,) = unpack('> 8) % ncells + for i in xrange(ncells): + self._fp.seek(pos_bucket + ((start+i) % ncells << 3)) + (h1, p1) = unpack('%s' % (len(k), len(v), k, v)) + for (k, v) in opts: + if k == '-k': f = (lambda k,_: k) + elif k == '-v': f = (lambda _,v: v) + elif k == '-2': f = (lambda k,v: k+'\t'+v) + for (k,v) in cdbdump(dbname): + print f(k,v) + print + elif cmd == 'cmerge': + dbs = [ cdbdump(fname) for fname in args ] + m = CDBMaker(dbname, dbname+'.tmp') + for (k,vs) in tcdbmerge(dbs): + m.add(k, ' '.join(vs)) + m.finish() + # tcdb + elif cmd == 'tmake': + TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish() + elif cmd == 'tget': + print repr(TCDBReader(dbname).lookup(args)) + elif cmd == 'tdump': + f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v)) + for (k, v) in opts: + if k == '-k': f = (lambda k,_: '/'.join(k)) + elif k == '-v': f = (lambda _,v: v) + elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v) + for (k,v) in tcdbdump(dbname): + print f(k,v) + print + elif cmd == 'tmerge': + dbs = [ tcdbdump(fname) for fname in args ] + m = TCDBMaker(dbname, dbname+'.tmp') + for (k,vs) in tcdbmerge(dbs): + m.put(len(k), k[-1], ' '.join(vs)) + 
m.finish() + + else: + return usage() + return + +if __name__ == '__main__': sys.exit(main(sys.argv))