documentation.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@7 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2007-12-31 04:40:27 +00:00
parent bf75331011
commit d558d31f26
3 changed files with 565 additions and 42 deletions

42
README
View File

@ -1,42 +0,0 @@
Installation:
1. Get http://www.unixuser.org/~euske/pub/CMap.tar.bz2
2. $ tar jxf CMap.tar.bz2
3. $ make cdbcmap
Dump the contents:
$ ./dumppdf.py foo.pdf
Extract the text:
$ ./pdf2txt.py foo.pdf > foo.xml
Terms and conditions:
Copyright (c) 2004-2008 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

88
README.html Normal file
View File

@ -0,0 +1,88 @@
<html>
<head>
<title>PDFMiner</title>
</head>
<body>
<h1>PDFMiner</h1>
<p>
PDFMiner is a suite of programs that help
extracting or analyzing text data from PDF documents.
<p>
<strong>Homepage:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/index.html">
http://www.unixuser.org/~euske/python/pdfminer/index.html
</a>
<p>
<strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
</a>
<P>
<strong>Svn repository:</strong><br>
<a href="http://pdfminerr.googlecode.com/svn/">
http://pdfminerr.googlecode.com/svn/
</a>
<hr>
<h2>Installation</h2>
<ol>
<li> Get
<a href="http://www.unixuser.org/~euske/pub/CMap.tar.bz2">
http://www.unixuser.org/~euske/pub/CMap.tar.bz2
</a>
<li> <code>$ tar jxf CMap.tar.bz2</code>
<li> <code>$ make cdbcmap</code>
</ol>
<hr>
<h2>Usage</h2>
<p>
<strong>Dump the contents:</strong>
<blockquote><pre>
$ ./dumppdf.py foo.pdf
</pre></blockquote>
<p>
<strong>Extract the text:</strong>
<blockquote><pre>
$ ./pdf2txt.py foo.pdf &gt; foo.xml
</pre></blockquote>
<hr>
<h2>Terms and conditions</h2>
<p>
<small>
Copyright (c) 2004-2008 Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
<p>
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
<p>
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
</small>
<hr>
<address>Yusuke Shinyama</address>
</body>

477
pycdb.py Executable file
View File

@ -0,0 +1,477 @@
#!/usr/bin/env python
#
# pycdb.py - Python implementation of cdb and tcdb
#
# by Yusuke Shinyama
# * public domain *
#
import sys, os
from struct import pack, unpack
from array import array
# calc hash value with a given key
def cdbhash(s, n=5381L):
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
if pack('=i',1) == pack('>i',1):
# big endian
def decode(x):
a = array('I', x)
a.byteswap()
return a
def encode(a):
a.byteswap()
return a.tostring()
else:
# little endian
def decode(x):
a = array('I', x)
return a
def encode(a):
return a.tostring()
## CDB
##
# cdbiter
def cdbiter(fp, eod):
kloc = 2048
while kloc < eod:
fp.seek(kloc)
(klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen)
v = fp.read(vlen)
kloc += 8+klen+vlen
yield (k,v)
fp.close()
return
# CDBReader
class CDBReader:
def __init__(self, cdbname, docache=1):
self.name = cdbname
self._fp = file(cdbname, 'rb')
hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256
self._eod = self._hash0[0]
self._docache = docache
self._cache = {}
self._keyiter = None
self._eachiter = None
return
def __repr__(self):
return '<CDBReader: %r>' % self.name
def __getstate__(self):
raise TypeError
def __setstate__(self, dict):
raise TypeError
def __getitem__(self, k):
k = str(k)
if k in self._cache: return self._cache[k]
h = cdbhash(k)
h1 = h & 0xff
(pos_bucket, ncells) = self._hash0[h1]
if ncells == 0: raise KeyError(k)
hs = self._hash1[h1]
if hs == None:
self._fp.seek(pos_bucket)
hs = decode(self._fp.read(ncells * 8))
self._hash1[h1] = hs
i = ((h >> 8) % ncells) * 2
n = ncells*2
for _ in xrange(ncells):
p1 = hs[i+1]
if p1 == 0: raise KeyError(k)
if hs[i] == h:
self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen)
if k1 == k:
v1 = self._fp.read(vlen)
if self._docache:
self._cache[k] = v1
return v1
i = (i+2) % n
raise KeyError(k)
def get(self, k, failed=None):
try:
return self.__getitem__(k)
except KeyError:
return failed
def has_key(self, k):
try:
self.__getitem__(k)
return True
except KeyError:
return False
def __contains__(self, k):
return self.has_key(k)
def firstkey(self):
self._keyiter = None
return self.nextkey()
def nextkey(self):
if not self._keyiter:
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
try:
return self._keyiter.next()
except StopIteration:
return None
def each(self):
if not self._eachiter:
self._eachiter = cdbiter(self._fp, self._eod)
try:
return self._eachiter.next()
except StopIteration:
return None
def iterkeys(self):
return ( k for (k,v) in cdbiter(self._fp, self._eod) )
def itervalues(self):
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
def iteritems(self):
return cdbiter(self._fp, self._eod)
# CDBMaker
class CDBMaker:
def __init__(self, cdbname, tmpname):
self.fn = cdbname
self.fntmp = tmpname
self.numentries = 0
self._fp = file(tmpname, 'wb')
self._pos = 2048 # sizeof((h,p))*256
self._bucket = [ array('I') for _ in xrange(256) ]
return
def __repr__(self):
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
def __len__(self):
return self.numentries
def __getstate__(self):
raise TypeError
def __setstate__(self, dict):
raise TypeError
def add(self, k, v):
(k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v))
self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen))
self._fp.write(k)
self._fp.write(v)
h = cdbhash(k)
b = self._bucket[h % 256]
b.append(h)
b.append(self._pos)
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._pos += 8+klen+vlen
self.numentries += 1
return self
def finish(self):
self._fp.seek(self._pos)
pos_hash = self._pos
# write hashes
for b1 in self._bucket:
if not b1: continue
blen = len(b1)
a = array('I', [0]*blen*2)
for j in xrange(0, blen, 2):
(h,p) = (b1[j],b1[j+1])
i = ((h >> 8) % blen)*2
while a[i+1]: # is cell[i] already occupied?
i = (i+2) % len(a)
a[i] = h
a[i+1] = p
self._fp.write(encode(a))
# write header
self._fp.seek(0)
a = array('I')
for b1 in self._bucket:
a.append(pos_hash)
a.append(len(b1))
pos_hash += len(b1)*8
self._fp.write(encode(a))
# close
self._fp.close()
os.rename(self.fntmp, self.fn)
return
# txt2cdb
def txt2cdb(self, lines):
import re
HEAD = re.compile(r'^\+(\d+),(\d+):')
for line in lines:
m = HEAD.match(line)
if not m: break
(klen, vlen) = (int(m.group(1)), int(m.group(2)))
i = len(m.group(0))
k = line[i:i+klen]
i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2
v = line[i:i+vlen]
self.add(k, v)
return self
# cdbdump
def cdbdump(cdbname):
fp = file(cdbname, 'rb')
(eor,) = unpack('<I', fp.read(4))
return cdbiter(fp, eor)
# cdbmerge
def cdbmerge(iters):
q = []
for it in iters:
try:
q.append((it.next(),it))
except StopIteration:
pass
k0 = None
vs = None
while q:
q.sort()
((k,v),it) = q.pop(0)
if k0 != k:
if vs: yield (k0,vs)
vs = []
vs.append(v)
k0 = k
try:
q.append((it.next(),it))
except StopIteration:
continue
if vs: yield (k0,vs)
return
# aliases
cdbmake = CDBMaker
init = CDBReader
## TCDB
##
# tcdbiter
def tcdbiter(fp, eor):
locs = {}
fp.seek(eor)
while 1:
x = fp.read(8)
if not x: break
(h, pos) = unpack('<II', x)
if pos: locs[pos] = h
pos = 2048
fp.seek(pos)
key = ()
parents = [0]
while pos < eor:
(klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen)
v = fp.read(vlen)
h = locs[pos]
for (i,p) in enumerate(parents):
if cdbhash(k, p+5381L) == h:
parents = parents[:i+1]
key = key[:i]
break
key += (k,)
yield (key, v)
parents.append(pos)
pos += 8+klen+vlen
fp.close()
return
# TCDBMaker
class TCDBMaker(CDBMaker):
def __init__(self, cdbname, tmpname):
CDBMaker.__init__(self, cdbname, tmpname)
self._parent = 0
self._stack = [self._parent]
return
def put(self, depth, k, v):
if depth == len(self._stack)+1:
self._stack.append(self._parent)
elif depth < len(self._stack):
self._stack = self._stack[:depth]
elif depth != len(self._stack):
raise ValueError('invalid depth: %d' % depth)
#
(k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v))
self._parent = self._pos
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen))
self._fp.write(k)
self._fp.write(v)
self._pos += 4+4+klen+vlen
h = cdbhash(k, self._stack[-1]+5381L)
b = self._bucket[h % 256]
b.append(h)
b.append(self._parent)
self.numentries += 1
return self
def txt2tcdb(self, lines):
import re
HEAD = re.compile(r'^(\++)(\d+),(\d+):')
for line in lines:
m = HEAD.match(line)
if not m: break
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
i = len(m.group(0))
k = line[i:i+klen]
i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2
v = line[i:i+vlen]
self.put(depth, k, v)
return self
# TCDBReader
class TCDBReader(CDBReader):
def lookup(self, seq, parent=0L):
r = []
for k in seq:
(v, parent) = self.lookup1(k, parent)
r.append(v)
return r
def lookup1(self, k, parent=0L):
k = str(k)
if self._docache and (parent,k) in self._cache:
return self._cache[(parent,k)]
h = cdbhash(k, parent+5381L)
self._fp.seek((h % 256) << 3)
(pos_bucket, ncells) = unpack('<II', self._fp.read(8))
if ncells == 0: raise KeyError(k)
start = (h >> 8) % ncells
for i in xrange(ncells):
self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
(h1, p1) = unpack('<II', self._fp.read(8))
if p1 == 0: raise KeyError(k)
if h1 == h:
self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen)
if k1 == k:
v1 = self._fp.read(vlen)
if self._docache:
self._cache[(parent,k)] = (v1,p1)
return (v1,p1)
raise KeyError(k)
def iterkeys(self):
return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
def itervalues(self):
return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
def iteritems(self):
return tcdbiter(self._fp, self._eod)
# tcdbdump
def tcdbdump(cdbname):
fp = file(cdbname, 'rb')
(eor,) = unpack('<I', fp.read(4))
return tcdbiter(fp, eor)
# aliases
tcdbmake = TCDBMaker
tcdbinit = TCDBReader
tcdbmerge = cdbmerge
# main
def main(argv):
import getopt, fileinput
def usage():
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
return 100
args = argv[1:]
if not args: return usage()
cmd = args.pop(0)
try:
(opts, args) = getopt.getopt(args, 'kv2')
except getopt.GetoptError:
return usage()
if not args: return usage()
dbname = args.pop(0)
# cdb
if cmd == 'cmake':
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
elif cmd == 'cget':
print repr(CDBReader(dbname).get(args[0]))
elif cmd == 'cdump':
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: k)
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: k+'\t'+v)
for (k,v) in cdbdump(dbname):
print f(k,v)
print
elif cmd == 'cmerge':
dbs = [ cdbdump(fname) for fname in args ]
m = CDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.add(k, ' '.join(vs))
m.finish()
# tcdb
elif cmd == 'tmake':
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
elif cmd == 'tget':
print repr(TCDBReader(dbname).lookup(args))
elif cmd == 'tdump':
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: '/'.join(k))
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
for (k,v) in tcdbdump(dbname):
print f(k,v)
print
elif cmd == 'tmerge':
dbs = [ tcdbdump(fname) for fname in args ]
m = TCDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.put(len(k), k[-1], ' '.join(vs))
m.finish()
else:
return usage()
return
if __name__ == '__main__': sys.exit(main(sys.argv))