to 4-space indentation

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-24 04:41:59 +00:00
parent a09b71d89d
commit 7790808560
24 changed files with 4953 additions and 4953 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Oct 24 12:42:25 JST 2009
Last Modified: Sat Oct 24 13:40:19 JST 2009
<!-- hhmts end -->
</div>
@ -352,7 +352,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/10/24: Charspace bug fixed.
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
<li> 2009/08/30: Fixed page rotation handling.

View File

@ -9,36 +9,36 @@
##
class Arcfour(object):
    """Arcfour (RC4-compatible) stream cipher.

    The permutation table and the two indices are kept on the instance, so
    consecutive calls to process() continue the same keystream.
    """

    def __init__(self, key):
        """Build the initial permutation from *key*.

        key: non-empty string; the ordinal of each character is one key byte.
             An empty key raises ZeroDivisionError (index modulo key length).
        """
        # list() keeps the table mutable even where range() is lazy.
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + ord(key[i % klen])) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """Encrypt or decrypt *data* and return the result as a string.

        Each character of *data* is XORed with the next keystream byte; the
        cipher state is advanced and stored back on the instance.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect the pieces and join once instead of growing a string
        # character by character.
        out = []
        for c in data:
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            out.append(chr(ord(c) ^ k))
        (self.i, self.j) = (i, j)
        return ''.join(out)
# test
if __name__ == '__main__':
    # Known-answer vectors: (key, plain text, cipher text as upper-case hex).
    vectors = (
        ("Key", "Plaintext", 'BBF316E8D940AF0AD3'),
        ("Wiki", "pedia", '1021BF0420'),
        ("Secret", "Attack at dawn", '45A01F645FC35B383552544B9BF5'),
    )
    for (key, data, expected) in vectors:
        encrypted = Arcfour(key).process(data)
        assert ''.join( '%02X' % ord(c) for c in encrypted ) == expected
    print('test succeeded')

View File

@ -6,72 +6,72 @@
# ascii85decode(data)
def ascii85decode(data):
    """Decode ASCII85-encoded text.

    Characters '!'..'u' are base-85 digits and every five of them produce
    four bytes.  'z' stands for four zero bytes, '~' ends the data, and any
    other character (white space, for instance) is skipped.  A final group
    of fewer than five digits is padded before decoding and yields one byte
    less than its digit count.  Digits left over when the input ends
    without '~' are dropped.

    data: the encoded text.
    Returns the decoded byte string (a plain str on Python 2).
    Raises AssertionError when 'z' appears inside a group.
    """
    import struct
    n = b = 0
    # Pieces are collected and joined once; b'' is the same type as '' on
    # Python 2, so the result type there is unchanged.
    chunks = []
    for c in data:
        if '!' <= c <= 'u':
            n += 1
            b = b*85+(ord(c)-33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 'z':
            assert n == 0
            chunks.append(b'\0\0\0\0')
        elif c == '~':
            if n:
                # Pad the short group with the largest digit (84, i.e. 'u').
                for _ in range(5-n):
                    b = b*85+84
                chunks.append(struct.pack('>L', b)[:n-1])
            break
    return b''.join(chunks)
# asciihexdecode(data)
def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    This implementation does not raise on other characters; digit pairs
    broken up by them are simply not decoded.

    >>> asciihexdecode("61 62 2e6364   65")
    'ab.cde'
    >>> asciihexdecode("61 62 2e6364   657>")
    'ab.cdep'
    >>> asciihexdecode("7>")
    'p'
    >>> asciihexdecode("6 1")
    'a'
    >>> asciihexdecode("61>62")
    'a'
    """
    import re
    # '>' is EOD: nothing after the first one belongs to the encoded data.
    payload = data.split('>', 1)[0]
    # White space is insignificant, also between the two digits of a pair,
    # so drop it before pairing digits up.
    payload = re.sub(r'\s+', '', payload)
    decode = (lambda hx: chr(int(hx, 16)))
    out = [ decode(hx) for hx in re.findall(r'[a-f\d]{2}', payload, re.IGNORECASE) ]
    # A lone trailing digit is treated as if it were followed by '0'.
    m = re.search(r'^(?:[a-f\d]{2})*([a-f\d])$', payload, re.IGNORECASE)
    if m:
        out.append(decode("%c0" % m.group(1)))
    return ''.join(out)
# test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__':
    # Plain text that the encoded sample below must decode to.
    expected = (
        'Man is distinguished, not only by his reason, but by this singular passion from '
        'other animals, which is a lust of the mind, that by a perseverance of delight in the '
        'continued and indefatigable generation of knowledge, exceeds the short vehemence of '
        'any carnal pleasure.'
    )
    encoded = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
'''
    assert ascii85decode(encoded) == expected
    print('ascii85decode test succeeded')
    # Run the doctests embedded in this module's docstrings as well.
    import doctest
    doctest.testmod()

View File

@ -10,9 +10,9 @@ from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:
import cdb
import cdb
except ImportError:
import pdfminer.pycdb as cdb
import pdfminer.pycdb as cdb
class CMapError(Exception):
    """Error raised by the CMap machinery (parent of CMapDB.CMapNotFound)."""
@ -21,449 +21,449 @@ class CMapError(Exception): pass
## find_cmap_path
##
def find_cmap_path():
    """Return the directory that holds the CMap files.

    The CMAP_PATH environment variable is used when it is set (even if
    empty); otherwise the 'CMap' directory beside this module is returned.
    """
    if 'CMAP_PATH' in os.environ:
        return os.environ['CMAP_PATH']
    return os.path.join(os.path.dirname(__file__), 'CMap')
# First run of decimal digits inside a glyph name.
STRIP_NAME = re.compile(r'[0-9]+')

def name2unicode(name):
    """Map a glyph name to an integer code.

    Names present in charname2unicode use that table.  Any other name is
    searched for its first run of decimal digits, which is returned as an
    int.  Raises KeyError(name) when the name has neither.
    """
    if name not in charname2unicode:
        m = STRIP_NAME.search(name)
        if m is None:
            raise KeyError(name)
        return int(m.group(0))
    return charname2unicode[name]
## CMap
##
class CMap(object):
    """In-memory character map.

    Holds a code -> CID table, a CID -> code table and a dictionary of
    attributes (e.g. 'CMapName', 'WMode').
    """

    # Non-zero makes decode() report its arguments on stderr.
    debug = 0

    def __init__(self):
        self.attrs = {}
        self.cid2code = {}
        self.code2cid = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        """Merge the given mappings into the tables; returns self."""
        for (table, extra) in ((self.code2cid, code2cid),
                               (self.cid2code, cid2code)):
            if extra:
                table.update(extra)
        return self

    def copycmap(self, cmap):
        """Copy every code/CID entry of another cmap into this one; returns self."""
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self

    def register_code2cid(self, code, cid):
        """Record code -> cid; silently ignored unless code is str and cid is int."""
        if not (isinstance(code, str) and isinstance(cid, int)):
            return self
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        """Record cid -> code; returns self.

        A PSLiteral code is resolved through name2unicode() and stored as a
        two-byte big-endian string; a str code is stored unchanged.  Any
        other combination of types is ignored.
        """
        if not isinstance(cid, int):
            return self
        if isinstance(code, PSLiteral):
            self.cid2code[cid] = pack('>H', name2unicode(code.name))
            return self
        if isinstance(code, str):
            self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        """Yield the CIDs for a string of one- and two-character codes.

        A character with no single-character entry is held back and tried
        together with the next character; if that pair is unknown too, both
        are dropped.
        """
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        pending = ''
        for ch in bytes:
            if pending:
                pair = pending + ch
                pending = ''
                if pair in self.code2cid:
                    yield self.code2cid[pair]
            elif ch in self.code2cid:
                yield self.code2cid[ch]
            else:
                pending = ch
        return

    def is_vertical(self):
        """Return the 'WMode' attribute, or 0 when it is absent."""
        return self.attrs.get('WMode', 0)

    def tocid(self, code):
        """Return the CID registered for code, or None."""
        return self.code2cid.get(code)

    def tocode(self, cid):
        """Return the code registered for cid, or None."""
        return self.cid2code.get(cid)

    def getall_attrs(self):
        """Iterate over (name, value) attribute pairs."""
        return self.attrs.iteritems()

    def getall_code2cid(self):
        """Iterate over (code, cid) pairs."""
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        """Iterate over (cid, code) pairs."""
        return self.cid2code.iteritems()
## CDBCMap
##
class CDBCMap(CMap):
    """CMap whose tables live in a cdb (constant database) file.

    Record layout, as written by dumpcdb():
      '/' + attribute name  -> repr() of the attribute value
      'c' + code            -> CID packed as '>L'
      'i' + CID as '>L'     -> code
    CIDs found by decode() are cached in self.code2cid.
    """

    def __init__(self, cdbname):
        CMap.__init__(self)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        """Return the CID stored for code as an int, or None."""
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() always returns a tuple; return the integer itself so the
        # result matches CMap.tocid() and what decode() yields.
        (cid,) = unpack('>L', self.db[k])
        return cid

    def tocode(self, cid):
        """Return the code stored for cid, or None."""
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        # Attribute values are stored as repr() text, hence the '1' string.
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def getall(self, c):
        """Yield (key without prefix, value unpacked as '>L') for keys starting with c.

        The value decoding matches 'c' (code -> CID) records only.
        """
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(c):
                yield (k[1:], unpack('>L', v)[0])
        return

    def getall_attrs(self):
        """Yield (attribute name, value) for every '/' record."""
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('/'):
                # NOTE(review): eval() runs text read from the cdb file, so
                # only trusted files should be opened.  dumpcdb() stores a
                # plain repr(v), so the [0] here looks suspicious - confirm
                # against the files actually in use before changing it.
                yield (k[1:], eval(v)[0])
        return

    def getall_cid2code(self):
        """Yield (cid, code) pairs.

        In an 'i' record the packed CID is the key and the code is the
        value, so it cannot go through getall(), which unpacks the value.
        """
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('i'):
                yield (unpack('>L', k[1:])[0], v)
        return

    def getall_code2cid(self):
        """Yield (code, cid) pairs."""
        return self.getall('c')

    def decode(self, bytes):
        """Yield the CIDs for a string of one- and two-character codes.

        Each code is looked up in the in-memory cache first and in the
        database second; database hits are added to the cache.  A character
        with no entry is retried together with the next character.
        """
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
## CMapDB
##
class CMapDB(object):
    """Process-wide loader and cache for CMap objects.

    initialize() must be called before get_cmap() so that the search
    directories are set.
    """

    class CMapNotFound(CMapError): pass

    # Optional alternative names: requested name -> name actually loaded.
    CMAP_ALIAS = {
    }

    debug = 0
    dirname = None      # directory of plain CMap files
    cdbdirname = None   # directory of '<name>.cmap.cdb' files
    cmapdb = {}         # cache: cmap name -> loaded cmap

    @classmethod
    def initialize(klass, dirname=None, cdbdirname=None):
        """Set the search directories.

        dirname defaults to find_cmap_path(); cdbdirname defaults to dirname.
        """
        if not dirname:
            dirname = find_cmap_path()
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        return

    @classmethod
    def get_cmap(klass, cmapname, strict=True):
        """Return the cmap called cmapname, loading and caching it if needed.

        A '<name>.cmap.cdb' file is preferred over a plain CMap file.  When
        neither exists, an empty CMap is returned (and cached) if strict is
        false; otherwise CMapDB.CMapNotFound is raised.
        """
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            cmap = klass.cmapdb[cmapname]
        else:
            fname = os.path.join(klass.dirname, cmapname)
            cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
            if os.path.exists(cdbname):
                if 1 <= klass.debug:
                    print >>stderr, 'Opening: CDBCMap %r...' % cdbname
                cmap = CDBCMap(cdbname)
            elif os.path.exists(fname):
                if 1 <= klass.debug:
                    print >>stderr, 'Reading: CMap %r...' % fname
                cmap = CMap()
                fp = open(fname, 'rb')
                # Close the file even when parsing fails.
                try:
                    CMapParser(cmap, fp).run()
                finally:
                    fp.close()
            elif not strict:
                cmap = CMap() # just create empty cmap
            else:
                raise CMapDB.CMapNotFound(cmapname)
            klass.cmapdb[cmapname] = cmap
        return cmap
## CMapParser
##
class CMapParser(PSStackParser):
    """Stack parser that fills a cmap object from CMap keywords."""

    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # True between 'begincmap' and 'endcmap'.
        self.in_cmap = False
        return

    def run(self):
        """Parse the input; reaching end of input is not an error."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    def _popped_objects(self):
        # Everything on the stack, without the positions.
        return [ obj for (_,obj) in self.popall() ]

    def _end_cidrange(self):
        # Triples (first code, last code, first CID); only the last four
        # characters of the codes vary within a range.
        for (s,e,cid) in choplist(3, self._popped_objects()):
            if not (isinstance(s, str) and isinstance(e, str) and
                    isinstance(cid, int) and len(s) == len(e)):
                continue
            sprefix = s[:-4]
            if sprefix != e[:-4]:
                continue
            svar = s[-4:]
            s1 = nunpack(svar)
            e1 = nunpack(e[-4:])
            vlen = len(svar)
            #assert s1 <= e1
            for i in xrange(e1-s1+1):
                x = sprefix+pack('>L',s1+i)[-vlen:]
                self.cmap.register_code2cid(x, cid+i)
        return

    def _end_cidchar(self):
        for (cid,code) in choplist(2, self._popped_objects()):
            if isinstance(code, str) and isinstance(cid, str):
                self.cmap.register_code2cid(code, nunpack(cid))
        return

    def _end_bfrange(self):
        # Triples (first CID, last CID, target); the target is either a
        # list with one entry per CID or a string that is incremented.
        for (s,e,code) in choplist(3, self._popped_objects()):
            if not (isinstance(s, str) and isinstance(e, str) and
                    len(s) == len(e)):
                continue
            s1 = nunpack(s)
            e1 = nunpack(e)
            #assert s1 <= e1
            if isinstance(code, list):
                for i in xrange(e1-s1+1):
                    self.cmap.register_cid2code(s1+i, code[i])
                continue
            var = code[-4:]
            base = nunpack(var)
            prefix = code[:-4]
            vlen = len(var)
            for i in xrange(e1-s1+1):
                x = prefix+pack('>L',base+i)[-vlen:]
                self.cmap.register_cid2code(s1+i, x)
        return

    def _end_bfchar(self):
        for (cid,code) in choplist(2, self._popped_objects()):
            if isinstance(cid, str) and isinstance(code, str):
                self.cmap.register_cid2code(nunpack(cid), code)
        return

    def do_keyword(self, pos, token):
        """Handle one keyword; keywords not handled here are pushed back."""
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        if name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap:
            return

        if name == 'def':
            try:
                ((_,k),(_,v)) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            try:
                ((_,cmapname),) = self.pop(1)
                self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            return

        # Keywords whose only effect is to throw the stack away.
        if name in ('begincodespacerange', 'endcodespacerange',
                    'begincidrange', 'begincidchar',
                    'beginbfrange', 'beginbfchar',
                    'beginnotdefrange', 'endnotdefrange'):
            self.popall()
            return

        if name == 'endcidrange':
            self._end_cidrange()
            return
        if name == 'endcidchar':
            self._end_cidchar()
            return
        if name == 'endbfrange':
            self._end_bfrange()
            return
        if name == 'endbfchar':
            self._end_bfchar()
            return

        self.push((pos, token))
        return
## FontMetricsDB
##
class FontMetricsDB(object):
    """Lookup front-end for the FONT_METRICS table."""

    @classmethod
    def get_metrics(klass, fontname):
        """Return the FONT_METRICS entry for fontname (KeyError if unknown)."""
        metrics = FONT_METRICS[fontname]
        return metrics
## EncodingDB
##
class EncodingDB(object):
    """Character-code -> unicode tables for the predefined encodings.

    The tables are built from ENCODING when the class body is executed.
    """

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # The loop variables are kept under their original names because names
    # bound in a class body stay behind as class attributes.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(name2unicode(name))
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Return the code -> unicode table for the encoding called name.

        Unknown names fall back to the standard table.  With diff, a copy
        is returned with the differences applied: an int sets the current
        code, and each PSLiteral names the glyph for the current code and
        then advances it.  Glyph names that name2unicode() rejects with
        KeyError are skipped, but still advance the code.
        """
        table = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return table
        table = table.copy()
        cid = 0
        for entry in diff:
            if isinstance(entry, int):
                cid = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                table[cid] = unichr(name2unicode(entry.name))
            except KeyError:
                pass
            cid += 1
        return table
## CMap -> CMapCDB conversion
##
def dumpcdb(cmap, cdbfile, verbose=1):
    """Write cmap to cdbfile as a constant database.

    Attributes are stored under '/'+name as repr() text, code -> CID
    entries under 'c'+code as a '>L' packed CID, and CID -> code entries
    under 'i'+packed CID.  cdbfile+'.tmp' is passed to cdbmake as the
    temporary file name.
    """
    maker = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    for (attr, value) in cmap.getall_attrs():
        maker.add('/'+attr, repr(value))
    for (code, cid) in cmap.getall_code2cid():
        maker.add('c'+code, pack('>L',cid))
    for (cid, code) in cmap.getall_cid2code():
        maker.add('i'+pack('>L',cid), code)
    maker.finish()
    return
def convert_cmap(cmapdir, outputdir, force=False):
    """Convert every CMap file in cmapdir to '<name>.cmap.cdb' in outputdir.

    Directory entries with a '.' in their name are not treated as CMap
    files.  Existing output files are left alone unless force is true.
    """
    CMapDB.initialize(cmapdir)
    for fname in os.listdir(cmapdir):
        if '.' in fname:
            continue
        cmapname = os.path.basename(fname)
        cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
        if force or not os.path.exists(cdbname):
            print >>stderr, 'Reading: %r...' % cmapname
            dumpcdb(CMapDB.get_cmap(cmapname), cdbname)
        else:
            print >>stderr, 'Skipping: %r' % cmapname
    return
def main(argv):
    """Command-line entry point: convert CMap files to cdb files.

    Returns 100 after printing the usage line for bad options, 111 when a
    directory does not exist, and otherwise the result of convert_cmap().
    """
    import getopt

    def usage():
        print('usage: %s [-D outputdir] [-f] cmap_dir' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'C:D:f')
    except getopt.GetoptError:
        return usage()

    if not args:
        cmapdir = find_cmap_path()
    else:
        cmapdir = args.pop(0)
    # The output directory defaults to the positional/default cmap
    # directory; a later -C does not move it.
    outputdir = cmapdir
    force = False
    for (k, v) in opts:
        if k == '-f':
            force = True
        elif k == '-C':
            cmapdir = v
        elif k == '-D':
            outputdir = v

    for path in (cmapdir, outputdir):
        if not os.path.isdir(path):
            print >>stderr, 'directory does not exist: %r' % path
            return 111
    return convert_cmap(cmapdir, outputdir, force=force)

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -10,298 +10,298 @@ from pdfminer.utils import apply_matrix_pt, mult_matrix, enc
##
class TagExtractor(PDFDevice):
    """Device that writes page text and marked-content tags as markup to outfp."""

    def __init__(self, rsrc, outfp, codec='utf-8'):
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Tag opened by begin_tag() and not yet closed by end_tag().
        self.tag = None
        return

    def render_string(self, textstate, seq):
        """Write the text of the str items in seq, encoded with self.codec.

        Items that are not str are skipped, as are character ids for which
        the font raises PDFUnicodeNotDefined.
        """
        font = textstate.font
        text = ''
        for obj in seq:
            if isinstance(obj, str):
                for cid in font.decode(obj):
                    try:
                        text += font.to_unicode(cid)
                    except PDFUnicodeNotDefined:
                        pass
        self.outfp.write(enc(text, self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening <page> tag with id, bounding box and rotation."""
        bbox = '%.3f,%.3f,%.3f,%.3f' % tuple(page.mediabox)
        attrs = (self.pageno, bbox, page.rotate)
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % attrs)
        return

    def end_page(self, page):
        """Close the <page> element and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening tag, with props as attributes sorted by key."""
        s = ''
        if props:
            pairs = sorted(props.iteritems())
            s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v) in pairs )
        self.outfp.write('<%s%s>' % (enc(tag.name), s))
        self.tag = tag
        return

    def end_tag(self):
        """Write the closing tag for the tag opened by begin_tag()."""
        assert self.tag
        self.outfp.write('</%s>' % enc(self.tag.name))
        self.tag = None
        return

    def do_tag(self, tag, props=None):
        """Write an opening tag that is not remembered for end_tag()."""
        self.begin_tag(tag, props)
        self.tag = None
        return
## PDFPageAggregator
##
class PDFPageAggregator(PDFTextDevice):
    """Device that collects the items of each page into an LTPage tree."""

    def __init__(self, rsrc, pageno=1, laparams=None):
        PDFTextDevice.__init__(self, rsrc)
        self.laparams = laparams
        self.pageno = pageno
        # Containers enclosing the one currently being filled.
        self.stack = []
        return

    def begin_page(self, page, ctm):
        """Start a new LTPage sized from the transformed media box."""
        (x0,y0,x1,y1) = page.mediabox
        (x0,y0) = apply_matrix_pt(ctm, (x0,y0))
        (x1,y1) = apply_matrix_pt(ctm, (x1,y1))
        width = abs(x0-x1)
        height = abs(y0-y1)
        self.cur_item = LTPage(self.pageno, (0, 0, width, height))
        return

    def end_page(self, _):
        """Finish the current page and return it.

        Layout analysis runs only when laparams was given.
        """
        assert not self.stack
        page = self.cur_item
        assert isinstance(page, LTPage)
        page.fixate()
        if self.laparams:
            page.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Open a nested LTFigure; the enclosing item is kept on the stack."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        return

    def end_figure(self, _):
        """Close the current figure and add it to the enclosing item."""
        fig = self.cur_item
        fig.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record straight rulers and axis-aligned rectangles; ignore other paths."""
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # Single segment: kept only when horizontal or vertical.
            coords = [ (x,y) for (_,x,y) in path[:2] ]
            ((x0,y0), (x1,y1)) = [ apply_matrix_pt(self.ctm, pt) for pt in coords ]
            if y0 == y1:
                direction = 'H'
            elif x0 == x1:
                direction = 'V'
            else:
                direction = None
            if direction:
                self.cur_item.add(LTLine(gstate.linewidth, direction, (x0,y0,x1,y1)))
        elif shape == 'mlllh':
            # Closed four-point path: kept only when it is an
            # axis-aligned rectangle.
            coords = [ (x,y) for (_,x,y) in path[:4] ]
            ((x0,y0), (x1,y1), (x2,y2), (x3,y3)) = [
                apply_matrix_pt(self.ctm, pt) for pt in coords ]
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
        return

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Add a text item for chars and return its advance; (0, 0) if chars is empty."""
        if chars:
            item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
            self.cur_item.add(item)
            return item.adv
        return (0, 0)
## PDFConverter
##
class PDFConverter(PDFPageAggregator):
    """Page aggregator that also owns an output file and an output codec."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
        self.codec = codec
        self.outfp = outfp
        return

    def write(self, text):
        """Encode text with self.codec and write it to the output file."""
        encoded = enc(text, self.codec)
        self.outfp.write(encoded)
        return
## SGMLConverter
##
class SGMLConverter(PDFConverter):
    """Converter that dumps the layout tree of every page as SGML-like tags."""

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write its tree.

        The value returned by PDFConverter.end_page() is rendered
        recursively: containers become an opening tag, their children
        and a closing tag; lines and rectangles become empty tags; text
        items become <text> elements.
        """
        def render_children(item):
            for child in item:
                render(child)
            return

        def render(item):
            if isinstance(item, LTPage):
                self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                                 (item.id, item.get_bbox(), item.rotate))
                render_children(item)
                self.outfp.write('</page>\n')
            elif isinstance(item, LTLine):
                self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' %
                                 (item.linewidth, item.direction, item.get_bbox()))
            elif isinstance(item, LTRect):
                self.outfp.write('<rect linewidth="%d" bbox="%s" />' %
                                 (item.linewidth, item.get_bbox()))
            elif isinstance(item, LTFigure):
                self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
                render_children(item)
                self.outfp.write('</figure>\n')
            elif isinstance(item, LTTextLine):
                self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
                render_children(item)
                self.outfp.write('</textline>\n')
            elif isinstance(item, LTTextBox):
                self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
                render_children(item)
                self.outfp.write('</textbox>\n')
            elif isinstance(item, LTTextItem):
                # Must be tested before LTText: LTTextItem is also an LTText.
                self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                                 (enc(item.font.fontname), item.is_vertical(),
                                  item.get_bbox(), item.fontsize))
                self.write(item.text)
                self.outfp.write('</text>\n')
            elif isinstance(item, LTText):
                # NOTE(review): unlike the branch above, this text is written
                # without going through self.write() -- confirm that is intended.
                self.outfp.write('<text>%s</text>\n' % item.text)
            else:
                assert 0, item
            return

        page = PDFConverter.end_page(self, page)
        render(page)
        return
## HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Converter that writes pages as absolutely positioned HTML elements.

    The HTML header is written by the constructor and the footer by
    close().  self.yoffset is the running vertical offset at which the
    next page is placed; it grows by each page's y1 plus pagepad.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 scale=1, showpageno=True, pagepad=50):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.pagepad = pagepad
        self.scale = scale
        self.outfp.write('<html><head>\n')
        self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
                         self.codec)
        self.outfp.write('</head><body>\n')
        self.yoffset = self.pagepad
        return

    def write_rect(self, color, width, x, y, w, h):
        """Write an empty bordered <span>; x, y, w and h are multiplied by self.scale."""
        self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
        return

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write it as HTML.

        Vertical positions are computed as self.yoffset - item.y1.
        Items that match none of the branches below are skipped.
        """
        def outline(color, item):
            # Frame an item using its own bounding box.
            self.write_rect(color, 1, item.x0, self.yoffset-item.y1, item.width, item.height)
            return

        def render_children(item):
            for child in item:
                render(child)
            return

        def render(item):
            if isinstance(item, LTPage):
                self.yoffset += item.y1
                outline('gray', item)
                if self.showpageno:
                    self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                                     ((self.yoffset-item.y1)*self.scale))
                    self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
                render_children(item)
            elif isinstance(item, LTTextItem):
                if item.vertical:
                    wmode = 'tb-rl'
                else:
                    wmode = 'lr-tb'
                self.outfp.write('<span style="position:absolute; writing-mode:%s;'
                                 ' left:%dpx; top:%dpx; font-size:%dpx;">' %
                                 (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
                                  item.fontsize*self.scale))
                self.write(item.text)
                self.outfp.write('</span>\n')
                if self.debug:
                    outline('red', item)
            elif isinstance(item, LTLine) or isinstance(item, LTRect):
                outline('black', item)
            elif isinstance(item, LTTextLine):
                render_children(item)
            elif isinstance(item, LTTextBox):
                outline('blue', item)
                render_children(item)
            elif isinstance(item, LTFigure):
                outline('green', item)
                render_children(item)
            return

        page = PDFConverter.end_page(self, page)
        render(page)
        self.yoffset += self.pagepad
        return

    def close(self):
        """Write links to the anchors 1 .. self.pageno-1 and the HTML footer."""
        self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                         ', '.join('<a href="#%s">%s</a>' % (i, i) for i in range(1, self.pageno)))
        self.outfp.write('</body></html>\n')
        return
## TextConverter
##
class TextConverter(PDFConverter):
    """Converter that writes only the text of each page."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        return

    def write(self, text):
        """Write text encoded with self.codec; unencodable characters are dropped."""
        self.outfp.write(text.encode(self.codec, 'ignore'))
        return

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write its text.

        Output is an optional 'Page N' line, the text of every LTText
        found while walking the containers, a newline after each
        LTTextBox, and a form feed at the end of the page.
        """
        def render(item):
            if isinstance(item, LTText):
                self.write(item.text)
            elif isinstance(item, LayoutContainer):
                for child in item:
                    render(child)
            if isinstance(item, LTTextBox):
                self.write('\n')

        page = PDFConverter.end_page(self, page)
        if self.showpageno:
            self.write('Page %d\n' % page.id)
        render(page)
        self.write('\f')
        return

View File

@ -8,22 +8,22 @@ INF = sys.maxint
##
class LAParams(object):
    """Plain holder of the layout-analysis parameters.

    The values are only stored here; LTPage.analyze_layout() reads
    direction, line_overlap, char_margin, line_margin and word_margin.
    """

    def __init__(self,
                 direction=None,
                 line_overlap=0.5,
                 char_margin=1.0,
                 line_margin=0.5,
                 word_margin=0.1):
        self.direction = direction
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        return

    def __repr__(self):
        # line_overlap is not part of the representation.
        return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
                (self.direction, self.char_margin, self.line_margin, self.word_margin))
## Plane
@ -35,354 +35,354 @@ class LAParams(object):
##
class Plane(object):
    """Index of layout items by the edges of their bounding boxes.

    xobjs holds an (x, obj) pair for each item's x0 and x1, yobjs the
    same for y0 and y1.  Both lists are sorted once by the constructor;
    place() itself only appends.
    """

    def __init__(self, objs):
        self.xobjs = []
        self.yobjs = []
        for obj in objs:
            self.place(obj)
        self.xobjs.sort()
        self.yobjs.sort()
        return

    # place(obj): place an object in a certain area.
    def place(self, obj):
        assert isinstance(obj, LayoutItem)
        self.xobjs.append((obj.x0, obj))
        self.xobjs.append((obj.x1, obj))
        self.yobjs.append((obj.y0, obj))
        self.yobjs.append((obj.y1, obj))
        return

    # find(): finds objects that are in a certain area.
    def find(self, bbox):
        """Return the set of objects indexed within bbox = (x0, y0, x1, y1).

        An object is returned when it appears both in the slice of xobjs
        that bsearch() selects for x0..x1 and in the slice of yobjs
        selected for y0..y1.
        """
        # Unpacked here rather than in the signature: tuple parameters
        # are not valid syntax in Python 3.
        (x0, y0, x1, y1) = bbox
        (i0, _) = bsearch(self.xobjs, x0)
        (_, i1) = bsearch(self.xobjs, x1)
        xobjs = set(obj for (_, obj) in self.xobjs[i0:i1])
        (i0, _) = bsearch(self.yobjs, y0)
        (_, i1) = bsearch(self.yobjs, y1)
        yobjs = set(obj for (_, obj) in self.yobjs[i0:i1])
        objs = xobjs.intersection(yobjs)
        return objs
## ClusterSet
##
class ClusterSet(object):
    """Incrementally merges objects into groups.

    klass is called as klass(index, objs) to create a group; groups must
    be iterable and provide merge() and fixate().  self.clusters maps
    every object seen so far to the group that currently owns it.
    """

    def __init__(self, klass):
        self.clusters = {}
        self.klass = klass
        self.i = 0
        return

    # add(objs): groups text objects if necessary.
    def add(self, objs):
        group = self.klass(self.i, objs)
        self.i += 1
        # Absorb every existing group that shares an object with objs.
        for obj in objs:
            if obj in self.clusters:
                group.merge(self.clusters[obj])
        # Re-point all members, including the absorbed ones, to the new group.
        for obj in group:
            self.clusters[obj] = group
        return

    # finish(): returns all the LTTextBoxes in a page.
    def finish(self):
        r = set(self.clusters.values())
        for group in r:
            group.fixate()
        return list(r)

    @classmethod
    def build(klass, objs, hratio, vratio, objtype, func=None):
        """Cluster objs by proximity and return the list of finished groups.

        For each object the Plane is searched in the object's bounding
        box enlarged by hratio*margin horizontally and vratio*margin
        vertically, where margin is obj.get_margin().  When func is
        given, a neighbour x is kept only if func(obj, x) is true (the
        object itself is always kept).  Groups are created by objtype.
        """
        plane = Plane(objs)
        cset = ClusterSet(objtype)
        for obj in objs:
            margin = obj.get_margin()
            hmargin = hratio * margin
            vmargin = vratio * margin
            neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin,
                                    obj.x1+hmargin, obj.y1+vmargin))
            assert obj in neighbors, obj
            if func:
                neighbors = [x for x in neighbors if x is obj or func(obj, x)]
            cset.add(neighbors)
        return cset.finish()
## LayoutItem
##
class LayoutItem(object):
    """Base class of everything that has a bounding box.

    Attributes set by set_bbox(): x0, y0, x1, y1 (normalised so that
    x0 <= x1 and y0 <= y1), width and height.
    """

    def __init__(self, bbox):
        self.set_bbox(bbox)
        return

    def set_bbox(self, bbox):
        """Store bbox = (x0, y0, x1, y1), swapping coordinates given in reverse."""
        # Unpacked here rather than in the signature: tuple parameters
        # are not valid syntax in Python 3.
        (x0, y0, x1, y1) = bbox
        if x1 < x0: (x0, x1) = (x1, x0)
        if y1 < y0: (y0, y1) = (y1, y0)
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<item bbox=%s>' % (self.get_bbox()))

    def hoverlap(self, obj):
        """Return a measure of the horizontal overlap with obj, 0 if none."""
        assert isinstance(obj, LayoutItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        else:
            # NOTE(review): when one item lies strictly inside the other
            # this is larger than the shared extent -- confirm callers
            # only rely on it being non-zero or on partial overlaps.
            return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Return a measure of the vertical overlap with obj, 0 if none."""
        assert isinstance(obj, LayoutItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        else:
            # NOTE(review): same containment caveat as hoverlap().
            return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))

    def get_bbox(self):
        """Return the bounding box as the string 'x0,y0,x1,y1' with 3 decimals."""
        return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)

    # Defaults below are overridden by subclasses that have a margin,
    # a weight or a writing direction.
    def get_margin(self):
        return 0

    def get_weight(self):
        return 0

    def get_direction(self):
        return None
## LayoutContainer
##
class LayoutContainer(LayoutItem):
    """A LayoutItem that holds other objects.

    objs starts as a set (subclasses may replace it with a list in
    fixate()); weight is None until fixate() has run.
    """

    def __init__(self, id, bbox, objs=None):
        LayoutItem.__init__(self, bbox)
        self.id = id
        self.objs = set(objs or ())
        self.weight = None
        return

    def __repr__(self):
        return ('<group %s>' % (self.get_bbox()))

    def __iter__(self):
        return iter(self.objs)

    def __len__(self):
        return len(self.objs)

    def add(self, obj):
        self.objs.add(obj)
        return

    def merge(self, group):
        """Add every member of group to this container."""
        self.objs.update(iter(group))
        return

    # fixate(): determines its boundary and weight.
    def fixate(self):
        # A zero width is taken to mean "no bounding box given yet"; in
        # that case the box is computed as the union of the members' boxes.
        if not self.width and self.objs:
            (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
            for obj in self.objs:
                bx0 = min(bx0, obj.x0)
                by0 = min(by0, obj.y0)
                bx1 = max(bx1, obj.x1)
                by1 = max(by1, obj.y1)
            self.set_bbox((bx0, by0, bx1, by1))
        self.weight = sum(obj.get_weight() for obj in self.objs)
        return

    def get_weight(self):
        return self.weight

    def get_direction(self):
        return None
## LTLine
##
class LTLine(LayoutItem):
    """A straight ruler; direction is stored as given ('H' or 'V' in paint_path)."""

    def __init__(self, linewidth, direction, bbox):
        LayoutItem.__init__(self, bbox)
        self.linewidth = linewidth
        self.direction = direction
        return
## LTRect
##
class LTRect(LayoutItem):
    """A rectangle with a line width."""

    def __init__(self, linewidth, bbox):
        LayoutItem.__init__(self, bbox)
        self.linewidth = linewidth
        return
## LTText
##
class LTText(object):
    """A piece of text without any geometry."""

    def __init__(self, text):
        self.text = text
        return

    def __repr__(self):
        return '<text %r>' % self.text

    def get_weight(self):
        """Return the number of characters in the text."""
        return len(self.text)

    def is_upright(self):
        return True
## LTAnon
##
class LTAnon(LTText):
    """Text that always weighs 0; LTTextLine.fixate() uses it for the
    spaces and the newline it inserts."""

    def get_weight(self):
        return 0
## LTTextItem
##
class LTTextItem(LayoutItem, LTText):
    """A run of characters drawn with one font and one matrix.

    chars is a non-empty sequence of (char, cid) pairs.  Besides the
    bounding box the constructor sets: matrix, font, vertical, text,
    adv (the (dx, dy) advance of the run before the matrix is applied)
    and fontsize (the larger component of (size, size) after
    apply_matrix_norm).
    """

    # When true, __repr__ includes matrix, font, size, bbox and advance.
    debug = 1

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        assert chars
        self.matrix = matrix
        self.font = font
        self.vertical = font.is_vertical()
        self.text = ''.join(char for (char, _) in chars)
        adv = sum(font.char_width(cid) for (_, cid) in chars)
        # charspace is counted between characters only, hence len-1.
        adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
        #size = (font.get_ascent() - font.get_descent()) * fontsize
        size = font.get_size() * fontsize
        (_, _, _, _, tx, ty) = self.matrix
        if not self.vertical:
            # horizontal text
            self.adv = (adv, 0)
            (dx, dy) = apply_matrix_norm(self.matrix, (adv, size))
            (_, descent) = apply_matrix_norm(self.matrix, (0, font.get_descent() * fontsize))
            ty += descent
            bbox = (tx, ty, tx+dx, ty+dy)
        else:
            # vertical text
            self.adv = (0, adv)
            (_, cid) = chars[0]
            (_, disp) = apply_matrix_norm(self.matrix,
                                          (0, (1000-font.char_disp(cid))*fontsize*.001))
            (dx, dy) = apply_matrix_norm(self.matrix, (size, adv))
            tx -= dx/2
            ty += disp
            bbox = (tx, ty+dy, tx+dx, ty)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size, size)))
        # LayoutItem.set_bbox() normalises the corner order of bbox.
        LayoutItem.__init__(self, bbox)
        return

    def __repr__(self):
        if self.debug:
            return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
                    ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
                     self.font, self.fontsize, self.get_bbox(),
                     '(%.1f, %.1f)' % self.adv,
                     self.text))
        else:
            return '<text %r>' % self.text

    def get_margin(self):
        return abs(self.fontsize)

    def is_vertical(self):
        return self.vertical

    def is_upright(self):
        """Return whether the matrix satisfies 0 < a*d and b*c <= 0."""
        (a, b, c, d, e, f) = self.matrix
        return 0 < a*d and b*c <= 0
## LTFigure
##
class LTFigure(LayoutContainer):
    """A container whose box is given as (x, y, w, h) plus a matrix.

    The stored bounding box is the axis-aligned box around the four
    corners of (x, y, w, h) after apply_matrix_pt(matrix, ...).
    """

    def __init__(self, id, bbox, matrix):
        (x, y, w, h) = bbox
        x0 = y0 = INF
        x1 = y1 = -INF
        for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)):
            (p, q) = apply_matrix_pt(matrix, (p, q))
            x0 = min(x0, p)
            x1 = max(x1, p)
            y0 = min(y0, q)
            y1 = max(y1, q)
        bbox = (x0, y0, x1, y1)
        self.matrix = matrix
        LayoutContainer.__init__(self, id, bbox)
        return

    def __repr__(self):
        return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextLine
##
class LTTextLine(LayoutContainer):
    """A group of text items forming one line.

    direction is compared with 'V' in fixate(); any other value is laid
    out left to right.  word_margin, multiplied by an item's
    get_margin(), is the gap beyond which a space is inserted; a false
    value disables space insertion.
    """

    def __init__(self, id, objs, direction, word_margin):
        # The zero bbox makes LayoutContainer.fixate() compute the real one.
        LayoutContainer.__init__(self, id, (0, 0, 0, 0), objs)
        self.direction = direction
        self.word_margin = word_margin
        return

    def __repr__(self):
        return ('<line %s(%s)>' % (self.get_bbox(), self.direction))

    def get_margin(self):
        return min(self.width, self.height)

    def get_direction(self):
        return self.direction

    def get_text(self):
        return ''.join(obj.text for obj in self.objs if isinstance(obj, LTText))

    def fixate(self):
        """Compute bbox/weight, then order the members and insert blanks.

        self.objs becomes a list: members sorted along the writing
        direction, an LTAnon(' ') before any LTTextItem that starts more
        than the word margin after the previous member ended, and a
        final LTAnon('\\n').
        """
        LayoutContainer.fixate(self)
        objs = []
        if self.direction == 'V':
            # Top to bottom.  y0 is the lower edge of the previous member.
            # NOTE(review): y0 starts at -INF and the test below is
            # obj.y1+margin < y0 -- confirm whether the horizontal branch's
            # counterpart (y0 > obj.y1+margin with y0 = +INF) was intended.
            y0 = -INF
            for obj in sorted(self.objs, key=lambda obj: -obj.y1):
                if isinstance(obj, LTTextItem) and self.word_margin:
                    margin = self.word_margin * obj.get_margin()
                    if obj.y1+margin < y0:
                        objs.append(LTAnon(' '))
                objs.append(obj)
                y0 = obj.y0
        else:
            # Left to right.  x1 is the right edge of the previous member;
            # starting at INF means no space precedes the first member.
            x1 = INF
            for obj in sorted(self.objs, key=lambda obj: obj.x0):
                if isinstance(obj, LTTextItem) and self.word_margin:
                    margin = self.word_margin * obj.get_margin()
                    if x1 < obj.x0-margin:
                        objs.append(LTAnon(' '))
                objs.append(obj)
                x1 = obj.x1
        objs.append(LTAnon('\n'))
        self.objs = objs
        return
## LTTextBox
@ -392,109 +392,109 @@ class LTTextLine(LayoutContainer):
##
class LTTextBox(LayoutContainer):
    """A group of text lines; direction is compared with 'V' in fixate()."""

    def __init__(self, id, objs, direction):
        # The zero bbox makes LayoutContainer.fixate() compute the real one.
        LayoutContainer.__init__(self, id, (0, 0, 0, 0), objs)
        self.direction = direction
        return

    def __repr__(self):
        return ('<textbox %s(%s) %r...>' %
                (self.get_bbox(), self.direction, self.get_text()[:20]))

    def get_text(self):
        return ''.join(obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine))

    def fixate(self):
        """Compute bbox/weight, then turn self.objs into a sorted list.

        Members are ordered by descending x1 when direction is 'V' and
        by descending y1 otherwise.
        """
        LayoutContainer.fixate(self)
        if self.direction == 'V':
            self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
        else:
            self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
        return

    def get_direction(self):
        return self.direction
def tsort(objs, f):
    """Order objs so that obj1 precedes obj2 wherever f(obj1, obj2) is true.

    objs is a list of hashable objects and is not modified; f is called
    for every ordered pair of distinct objects.  An object is emitted as
    soon as it has successors but no remaining predecessors.  When no
    such object is left (only objects without successors, or a cycle),
    the remaining objects are emitted one by one from the END of the
    list, so mutually unordered objects come out in reverse input order.
    """
    gi = dict((obj, []) for obj in objs)    # incoming edges
    go = dict((obj, []) for obj in objs)    # outgoing edges
    for obj1 in objs:
        for obj2 in objs:
            if obj1 is obj2: continue
            if f(obj1, obj2): # obj1 -> obj2
                go[obj1].append(obj2)
                gi[obj2].append(obj1)
    r = objs[:]
    s = []
    while r:
        for obj in r:
            if not go[obj] or gi[obj]: continue
            # obj has successors and no pending predecessor: emit it and
            # release its successors.
            for c in go[obj]:
                gi[c].remove(obj)
            del gi[obj]
            del go[obj]
            r.remove(obj)
            s.append(obj)
            break
        else:
            # Nothing was eligible: fall back to the last remaining object.
            obj = r.pop()
            del gi[obj]
            del go[obj]
            s.append(obj)
    return s
## LTPage
##
class LTPage(LayoutContainer):
    """The root container of one page; rotate is stored as given."""

    def __init__(self, id, bbox, rotate=0):
        LayoutContainer.__init__(self, id, bbox)
        self.rotate = rotate
        return

    def __repr__(self):
        return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))

    def analyze_layout(self, laparams):
        """Group the upright text of the page into lines and boxes.

        Members that are LTText and upright are clustered into
        LTTextLine objects and those into LTTextBox objects with
        ClusterSet.build(); the boxes are then ordered with tsort().
        Afterwards self.objs is a list: all other members first,
        followed by the ordered boxes.  laparams.direction == 'V'
        selects the vertical rules, anything else the horizontal ones.
        """
        textobjs = []
        otherobjs = []
        for obj in self.objs:
            if isinstance(obj, LTText) and obj.is_upright():
                textobjs.append(obj)
            else:
                otherobjs.append(obj)
        if laparams.direction == 'V':
            def vline(obj1, obj2):
                # Same line if the horizontal overlap exceeds a share of the width.
                return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
            def vorder(obj1, obj2):
                # obj1 comes first if it is to the right of / above obj2.
                if obj1.voverlap(obj2):
                    return obj2.x1 < obj1.x0
                elif obj1.hoverlap(obj2):
                    return obj2.y1 < obj1.y0
                else:
                    return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
            lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
                                     (lambda id, objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
                                     vline)
            boxes = ClusterSet.build(lines, laparams.line_margin, 0,
                                     (lambda id, objs: LTTextBox(id, objs, 'V')))
            boxes = tsort(boxes, vorder)
        else:
            def hline(obj1, obj2):
                # Same line if the vertical overlap exceeds a share of the height.
                return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
            def horder(obj1, obj2):
                # obj1 comes first if it is above / to the left of obj2.
                if obj1.hoverlap(obj2):
                    return obj2.y1 < obj1.y0
                elif obj1.voverlap(obj2):
                    return obj1.x1 < obj2.x0
                else:
                    return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
            lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
                                     (lambda id, objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
                                     hline)
            boxes = ClusterSet.build(lines, 0, laparams.line_margin,
                                     (lambda id, objs: LTTextBox(id, objs, 'H')))
            boxes = tsort(boxes, horder)
        self.objs = otherobjs + boxes
        return

View File

@ -7,93 +7,93 @@ stderr = sys.stderr
##
class LZWDecoder(object):
debug = 0
debug = 0
def __init__(self, fp):
self.fp = fp
self.buff = 0
self.bpos = 8
self.nbits = 9
self.table = None
self.prevbuf = None
return
def __init__(self, fp):
self.fp = fp
self.buff = 0
self.bpos = 8
self.nbits = 9
self.table = None
self.prevbuf = None
return
def readbits(self, bits):
v = 0
while 1:
# the number of remaining bits we can get from the current buffer.
r = 8-self.bpos
if bits <= r:
# |-----8-bits-----|
# |-bpos-|-bits-| |
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
self.bpos += bits
break
else:
# |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
def readbits(self, bits):
v = 0
while 1:
# the number of remaining bits we can get from the current buffer.
r = 8-self.bpos
if bits <= r:
# |-----8-bits-----|
# |-bpos-|-bits-| |
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
self.bpos += bits
break
else:
# |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
pass
elif not self.prevbuf:
x = self.prevbuf = self.table[code]
else:
if code < len(self.table):
x = self.table[code]
self.table.append(self.prevbuf+x[0])
else:
self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code]
l = len(self.table)
if l == 511:
self.nbits = 10
elif l == 1023:
self.nbits = 11
elif l == 2047:
self.nbits = 12
self.prevbuf = x
return x
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
pass
elif not self.prevbuf:
x = self.prevbuf = self.table[code]
else:
if code < len(self.table):
x = self.table[code]
self.table.append(self.prevbuf+x[0])
else:
self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code]
l = len(self.table)
if l == 511:
self.nbits = 10
elif l == 1023:
self.nbits = 11
elif l == 2047:
self.nbits = 12
self.prevbuf = x
return x
def run(self):
    '''
    Generator: read codes of self.nbits bits until readbits() raises
    EOFError, feed each one to feed() and yield whatever it returns
    (which is '' for the control codes).
    '''
    while 1:
        try:
            code = self.readbits(self.nbits)
        except EOFError:
            break
        x = self.feed(code)
        yield x
        if self.debug:
            print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
                             (self.nbits, code, x, self.table[258:]))
    return
def main(argv):
    '''
    Self-test: decode a fixed LZW sample with debugging enabled, print the
    input, the expected output and the actual output, then print whether
    they match.  Always returns 0.
    '''
    import StringIO
    data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
    fp = StringIO.StringIO(data)
    expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    LZWDecoder.debug = 1
    output = ''.join(LZWDecoder(fp).run())
    print (data, expected, output)
    # Parenthesised so the line prints the same under a print statement
    # and a print function.
    print (output == expected)
    return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -11,13 +11,13 @@ LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class PDFColorSpace(object):
    '''
    A named color space together with its number of color components.
    '''

    def __init__(self, name, ncomponents):
        self.name = name
        self.ncomponents = ncomponents
        return

    def __repr__(self):
        return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = dict(

View File

@ -9,116 +9,116 @@ from pdfminer.pdffont import PDFUnicodeNotDefined
##
class PDFDevice(object):
    '''
    Base class for output devices.

    It only stores the resource object and the current transformation
    matrix; every rendering callback is a no-op meant to be overridden.
    '''

    debug = 0

    def __init__(self, rsrc):
        self.rsrc = rsrc
        self.ctm = None
        return

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        return

    def set_ctm(self, ctm):
        self.ctm = ctm
        return

    # Marked-content callbacks.
    def begin_tag(self, tag, props=None):
        return
    def end_tag(self):
        return
    def do_tag(self, tag, props=None):
        return

    # Page and figure boundaries.
    def begin_page(self, page, ctm):
        return
    def end_page(self, page):
        return
    def begin_figure(self, name, bbox, matrix):
        return
    def end_figure(self, name):
        return

    # Drawing callbacks.
    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        return
    def render_image(self, stream, size):
        return
    def render_string(self, textstate, seq):
        return
## PDFTextDevice
##
class PDFTextDevice(PDFDevice):
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?'
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?'
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0)
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0)
def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
fontsize = textstate.fontsize
scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling
chars = []
needspace = False
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
d = -obj*dxscale
if font.is_vertical():
y += d
else:
x += d
def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
fontsize = textstate.fontsize
scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling
chars = []
needspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
d = -obj*dxscale
if font.is_vertical():
y += d
else:
x += d
chars = []
needspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy
if font.is_vertical():
y += wordspace
else:
x += wordspace
chars = []
if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy
if font.is_vertical():
y += wordspace
else:
x += wordspace
chars = []
if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
textstate.linematrix = (x,y)
return
textstate.linematrix = (x,y)
return

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -26,217 +26,217 @@ class PDFNotImplementedError(PSException): pass
##
class PDFObjRef(PDFObject):
    '''
    Indirect reference to the object `objid` of document `doc`.
    '''

    def __init__(self, doc, objid, _):
        # The third argument is accepted but ignored.
        if objid == 0:
            if STRICT:
                raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid
        #self.genno = genno # Never used.
        return

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self):
        '''Return the referenced object, fetched with doc.getobj().'''
        return self.doc.getobj(self.objid)
# resolve
def resolve1(x):
    '''
    Resolve an object. If this is an array or dictionary,
    it may still contain some indirect objects inside.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    return x
def resolve_all(x):
    '''
    Recursively resolve X and all the internals.
    Make sure there is no indirect reference within the nested object.
    This procedure might be slow.

    Lists are rebuilt; dictionaries are updated in place and returned.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        x = [ resolve_all(v) for v in x ]
    elif isinstance(x, dict):
        # Iterate over a snapshot of the keys; only values are replaced.
        for k in list(x):
            x[k] = resolve_all(x[k])
    return x
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Strings are passed to decipher(objid, genno, string).  Lists are
    rebuilt, dictionaries are updated in place, and any other value is
    returned unchanged.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        x = [ decipher_all(decipher, objid, genno, v) for v in x ]
    elif isinstance(x, dict):
        # Iterate over a snapshot of the keys; only values are replaced.
        for k in list(x):
            x[k] = decipher_all(decipher, objid, genno, x[k])
    return x
# Type checking
def int_value(x):
    '''
    Resolve x and return it if it is an int.  Otherwise raise
    PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, int):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('Integer required: %r' % (x,))
        return 0
    return x
def float_value(x):
    '''
    Resolve x and return it if it is a float.  Otherwise raise
    PDFTypeError when STRICT is set, or return 0.0.
    '''
    x = resolve1(x)
    if not isinstance(x, float):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('Float required: %r' % (x,))
        return 0.0
    return x
def num_value(x):
    '''
    Resolve x and return it if it is an int or a float.  Otherwise raise
    PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, (int, float)):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('Int or Float required: %r' % (x,))
        return 0
    return x
def str_value(x):
    '''
    Resolve x and return it if it is a str.  Otherwise raise
    PDFTypeError when STRICT is set, or return ''.
    '''
    x = resolve1(x)
    if not isinstance(x, str):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('String required: %r' % (x,))
        return ''
    return x
def list_value(x):
    '''
    Resolve x and return it if it is a list or a tuple.  Otherwise raise
    PDFTypeError when STRICT is set, or return a new empty list.
    '''
    x = resolve1(x)
    if not isinstance(x, (list, tuple)):
        if STRICT:
            raise PDFTypeError('List required: %r' % (x,))
        return []
    return x
def dict_value(x):
    '''
    Resolve x and return it if it is a dict.  Otherwise raise
    PDFTypeError when STRICT is set, or return a new empty dict.
    '''
    x = resolve1(x)
    if not isinstance(x, dict):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('Dict required: %r' % (x,))
        return {}
    return x
def stream_value(x):
    '''
    Resolve x and return it if it is a PDFStream.  Otherwise raise
    PDFTypeError when STRICT is set, or return a new empty PDFStream.
    '''
    x = resolve1(x)
    if not isinstance(x, PDFStream):
        if STRICT:
            # (x,) keeps the formatting correct when x itself is a tuple.
            raise PDFTypeError('PDFStream required: %r' % (x,))
        return PDFStream({}, '')
    return x
## PDFStream type
##
class PDFStream(PDFObject):
    '''
    A stream object: a dictionary plus raw data that is decoded lazily.

    self.rawdata holds the undecoded bytes until decode() has run; after
    that it is None and self.data holds the decoded bytes.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        # decode() has dropped the raw data; report the decoded size
        # instead of failing on len(None).
        return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data or ''), self.dic)

    def decomp(self,data):
        '''
        Inflate `data` with zlib, dropping one trailing byte at a time
        while zlib reports an error.  Raises Exception once fewer than
        8 bytes are left.
        '''
        import zlib
        buf = data
        # some FlateDecode streams have garbage (newlines, etc) appended to the
        # end. remove chars from the end to try and decompress the buffer
        while 8 <= len(buf):
            try:
                # will get errors if the document is encrypted.
                dco = zlib.decompressobj()
                return dco.decompress(buf)
            except zlib.error:
                buf = buf[:-1]
        raise Exception("zlib.error while decompressing data")

    def decode(self):
        '''
        Decipher (if a decipher function was given) and run the raw data
        through the filters named in the dictionary, storing the result
        in self.data and clearing self.rawdata.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                data = self.decomp(data)
            elif f in LITERALS_LZW_DECODE:
                try:
                    from cStringIO import StringIO
                except ImportError:
                    from StringIO import StringIO
                data = ''.join(LZWDecoder(StringIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                import ascii85
                data = ascii85.ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                import ascii85
                data = ascii85.asciihexdecode(data)
            elif f == LITERAL_CRYPT:
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            # NOTE(review): this step is placed inside the filter loop (run
            # after every filter); confirm against the upstream file.
            if 'DP' in self.dic:
                params = self.dic['DP']
            else:
                params = self.dic.get('DecodeParms', {})
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred:
                    if pred != 12:
                        raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                    if 'Columns' not in params:
                        raise PDFValueError('Columns undefined for predictor=12')
                    columns = int_value(params['Columns'])
                    # Each row is one tag byte followed by `columns` bytes.
                    # Rows tagged '\x02' are stored as byte-wise differences
                    # from the previous output row.
                    rows = []
                    ent0 = '\x00' * columns
                    for i in xrange(0, len(data), columns+1):
                        tag = data[i]
                        ent1 = data[i+1:i+1+columns]
                        if tag == '\x02':
                            ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
                        rows.append(ent1)
                        ent0 = ent1
                    data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the raw data (None once decode() has run).'''
        return self.rawdata

File diff suppressed because it is too large Load Diff

View File

@ -13,24 +13,24 @@ from array import array
# calc hash value with a given key
def cdbhash(s, n=5381):
    '''
    Hash the string s, starting from n: for every character,
    h = ((h*33) ^ ord(c)) masked to 32 bits.  An empty s returns n as is.
    '''
    h = n
    for c in s:
        h = ((h*33) ^ ord(c)) & 0xffffffff
    return h
# decode(x) turns a string of little-endian words into an array('I');
# encode(a) turns such an array back into a string.  Which pair is defined
# depends on the byte order of the machine.
if pack('=i',1) == pack('>i',1):
    # big endian
    def decode(x):
        a = array('I', x)
        a.byteswap()
        return a
    def encode(a):
        # byteswap() works in place, so the caller's array is modified.
        a.byteswap()
        return a.tostring()
else:
    # little endian
    def decode(x):
        a = array('I', x)
        return a
    def encode(a):
        return a.tostring()
## CDB
@ -38,234 +38,234 @@ else:
# cdbiter
def cdbiter(fp, eod):
    '''
    Generator over the (key, value) records stored in fp between offset
    2048 and eod.  Each record is two little-endian 32-bit lengths
    followed by the key and the value.  fp is closed once the last
    record has been yielded.
    '''
    kloc = 2048
    while kloc < eod:
        fp.seek(kloc)
        (klen, vlen) = unpack('<II', fp.read(8))
        k = fp.read(klen)
        v = fp.read(vlen)
        kloc += 8+klen+vlen
        yield (k,v)
    fp.close()
    return
# CDBReader
class CDBReader(object):
    '''
    Read-only, dictionary-like access to a cdb file.

    The first 2048 bytes are read as 256 (position, cell count) pairs, one
    per hash bucket.  Bucket tables are loaded on first use and, unless
    docache is false, looked-up values are cached.
    '''

    def __init__(self, cdbname, docache=1):
        self.name = cdbname
        self._fp = open(cdbname, 'rb')
        hash0 = decode(self._fp.read(2048))
        self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
        self._hash1 = [ None ] * 256
        # The first bucket table starts where the records end.
        self._eod = hash0[0]
        self._docache = docache
        self._cache = {}
        self._keyiter = None
        self._eachiter = None
        return

    def __repr__(self):
        return '<CDBReader: %r>' % self.name

    # Instances hold an open file and cannot be pickled.
    def __getstate__(self):
        raise TypeError

    def __setstate__(self, dict):
        raise TypeError

    def __getitem__(self, k):
        k = str(k)
        if k in self._cache: return self._cache[k]
        h = cdbhash(k)
        h1 = h & 0xff
        (pos_bucket, ncells) = self._hash0[h1]
        if ncells == 0: raise KeyError(k)
        hs = self._hash1[h1]
        if hs is None:
            # Load this bucket's table on first use.
            self._fp.seek(pos_bucket)
            hs = decode(self._fp.read(ncells * 8))
            self._hash1[h1] = hs
        # Probe the (hash, position) cells linearly, wrapping around, until
        # the key is found or an empty cell (position 0) is reached.
        i = ((h >> 8) % ncells) * 2
        n = ncells*2
        for _ in xrange(ncells):
            p1 = hs[i+1]
            if p1 == 0: raise KeyError(k)
            if hs[i] == h:
                self._fp.seek(p1)
                (klen, vlen) = unpack('<II', self._fp.read(8))
                k1 = self._fp.read(klen)
                if k1 == k:
                    v1 = self._fp.read(vlen)
                    if self._docache:
                        self._cache[k] = v1
                    return v1
            i = (i+2) % n
        raise KeyError(k)

    def get(self, k, failed=None):
        try:
            return self.__getitem__(k)
        except KeyError:
            return failed

    def has_key(self, k):
        try:
            self.__getitem__(k)
            return True
        except KeyError:
            return False

    def __contains__(self, k):
        return self.has_key(k)

    def firstkey(self):
        self._keyiter = None
        return self.nextkey()

    def nextkey(self):
        '''Return the next key of the running key scan, or None at the end.'''
        if not self._keyiter:
            self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
        try:
            return self._keyiter.next()
        except StopIteration:
            return None

    def each(self):
        '''Return the next (key, value) pair of the running scan, or None at the end.'''
        if not self._eachiter:
            self._eachiter = cdbiter(self._fp, self._eod)
        try:
            return self._eachiter.next()
        except StopIteration:
            return None

    # NOTE(review): all scans below share self._fp with cdbiter(); if
    # cdbiter() closes the file it is given when it finishes, the reader
    # cannot be used after a complete scan -- confirm.
    def iterkeys(self):
        return ( k for (k,v) in cdbiter(self._fp, self._eod) )
    def itervalues(self):
        return ( v for (k,v) in cdbiter(self._fp, self._eod) )
    def iteritems(self):
        return cdbiter(self._fp, self._eod)
# CDBMaker
class CDBMaker(object):
    '''
    Writer for cdb files.

    Records are appended to a temporary file starting at offset 2048;
    finish() then writes the hash tables and the header and renames the
    temporary file to its final name.
    '''

    def __init__(self, cdbname, tmpname):
        self.fn = cdbname
        self.fntmp = tmpname
        self.numentries = 0
        self._fp = open(tmpname, 'wb')
        self._pos = 2048 # sizeof((h,p))*256
        # One flat array of (hash, position) pairs per bucket.
        self._bucket = [ array('I') for _ in xrange(256) ]
        return

    def __repr__(self):
        return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)

    def __len__(self):
        return self.numentries

    # Instances hold an open file and cannot be pickled.
    def __getstate__(self):
        raise TypeError

    def __setstate__(self, dict):
        raise TypeError

    def add(self, k, v):
        '''Append the record (k, v), both converted with str(); returns self.'''
        (k, v) = (str(k), str(v))
        (klen, vlen) = (len(k), len(v))
        self._fp.seek(self._pos)
        self._fp.write(pack('<II', klen, vlen))
        self._fp.write(k)
        self._fp.write(v)
        h = cdbhash(k)
        b = self._bucket[h % 256]
        b.append(h)
        b.append(self._pos)
        # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
        self._pos += 8+klen+vlen
        self.numentries += 1
        return self

    def finish(self):
        '''
        Write one hash table per non-empty bucket after the records, then
        the 256-entry header at offset 0, close the temporary file and
        rename it to the final name.
        '''
        self._fp.seek(self._pos)
        pos_hash = self._pos
        # write hashes
        for b1 in self._bucket:
            if not b1: continue
            # blen counts array items (two per record), so the table gets
            # twice as many cells as the bucket has records.
            blen = len(b1)
            a = array('I', [0]*blen*2)
            for j in xrange(0, blen, 2):
                (h,p) = (b1[j],b1[j+1])
                i = ((h >> 8) % blen)*2
                while a[i+1]: # is cell[i] already occupied?
                    i = (i+2) % len(a)
                a[i] = h
                a[i+1] = p
            self._fp.write(encode(a))
        # write header
        self._fp.seek(0)
        a = array('I')
        for b1 in self._bucket:
            a.append(pos_hash)
            a.append(len(b1))
            pos_hash += len(b1)*8
        self._fp.write(encode(a))
        # close
        self._fp.close()
        os.rename(self.fntmp, self.fn)
        return

    # txt2cdb
    def txt2cdb(self, lines):
        '''
        Add one record per line of the form "+klen,vlen:key->value".
        Stops at the first line that does not start with such a header;
        raises ValueError when the "->" separator is missing.  Returns self.
        '''
        import re
        HEAD = re.compile(r'^\+(\d+),(\d+):')
        for line in lines:
            m = HEAD.match(line)
            if not m: break
            (klen, vlen) = (int(m.group(1)), int(m.group(2)))
            i = len(m.group(0))
            k = line[i:i+klen]
            i += klen
            if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
            i += 2
            v = line[i:i+vlen]
            self.add(k, v)
        return self
# cdbdump
def cdbdump(cdbname):
    '''
    Open the cdb file `cdbname` and return an iterator over its
    (key, value) records.  The end of the record area is taken from the
    first 32-bit word of the file.
    '''
    fp = open(cdbname, 'rb')
    (eor,) = unpack('<I', fp.read(4))
    return cdbiter(fp, eor)
# cdbmerge
def cdbmerge(iters):
    '''
    Merge several iterators of (key, value) pairs, each assumed to be
    sorted by key, and yield (key, [values...]) once per run of equal
    keys, in key order.
    '''
    q = []
    for it in iters:
        try:
            q.append((it.next(),it))
        except StopIteration:
            pass
    k0 = None
    vs = None
    while q:
        # Take the smallest pending pair; the iterator it came from is
        # refilled below.
        q.sort()
        ((k,v),it) = q.pop(0)
        if k0 != k:
            # A new key starts: emit what was collected for the previous one.
            if vs: yield (k0,vs)
            vs = []
        vs.append(v)
        k0 = k
        try:
            q.append((it.next(),it))
        except StopIteration:
            continue
    if vs: yield (k0,vs)
    return
# aliases
@ -278,132 +278,132 @@ init = CDBReader
# tcdbiter
def tcdbiter(fp, eor):
    '''
    Generator over the records of a tcdb file, yielding (key path, value)
    where the key path is a tuple of keys from the top level down.

    The hash tables after eor are read first to learn the hash stored
    for each record position.  A record's parent is then the entry of
    the current parent chain whose position, used as hash seed,
    reproduces that hash.  fp is closed after the last record.
    '''
    locs = {}
    fp.seek(eor)
    while 1:
        x = fp.read(8)
        if not x: break
        (h, pos) = unpack('<II', x)
        if pos: locs[pos] = h
    pos = 2048
    fp.seek(pos)
    key = ()
    parents = [0]
    while pos < eor:
        (klen, vlen) = unpack('<II', fp.read(8))
        k = fp.read(klen)
        v = fp.read(vlen)
        h = locs[pos]
        for (i,p) in enumerate(parents):
            if cdbhash(k, p+5381) == h:
                # Found the parent: cut the chain and the key path back to it.
                parents = parents[:i+1]
                key = key[:i]
                break
        key += (k,)
        yield (key, v)
        parents.append(pos)
        pos += 8+klen+vlen
    fp.close()
    return
# TCDBMaker
class TCDBMaker(CDBMaker):
    '''
    CDBMaker variant for hierarchical keys: each record is hashed with a
    seed derived from the file position of its parent record.
    '''

    def __init__(self, cdbname, tmpname):
        CDBMaker.__init__(self, cdbname, tmpname)
        # Position of the most recently written record (0 before the first).
        self._parent = 0
        # Parent positions from the top level down to the current level.
        self._stack = [self._parent]
        return

    def put(self, depth, k, v):
        '''
        Append the record (k, v) at nesting level `depth`; returns self.

        depth == len(stack)+1 descends below the previous record,
        depth < len(stack) goes back up, depth == len(stack) stays on the
        same level; anything else raises ValueError.
        '''
        if depth == len(self._stack)+1:
            self._stack.append(self._parent)
        elif depth < len(self._stack):
            self._stack = self._stack[:depth]
        elif depth != len(self._stack):
            raise ValueError('invalid depth: %d' % depth)
        #
        (k, v) = (str(k), str(v))
        (klen, vlen) = (len(k), len(v))
        self._parent = self._pos
        # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
        self._fp.seek(self._pos)
        self._fp.write(pack('<II', klen, vlen))
        self._fp.write(k)
        self._fp.write(v)
        self._pos += 4+4+klen+vlen
        h = cdbhash(k, self._stack[-1]+5381)
        b = self._bucket[h % 256]
        b.append(h)
        b.append(self._parent)
        self.numentries += 1
        return self

    def txt2tcdb(self, lines):
        '''
        Add one record per line of the form "+...+klen,vlen:key->value",
        the number of leading "+" giving the depth.  Stops at the first
        line without such a header; raises ValueError when the "->"
        separator is missing.  Returns self.
        '''
        import re
        HEAD = re.compile(r'^(\++)(\d+),(\d+):')
        for line in lines:
            m = HEAD.match(line)
            if not m: break
            (depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
            i = len(m.group(0))
            k = line[i:i+klen]
            i += klen
            if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
            i += 2
            v = line[i:i+vlen]
            self.put(depth, k, v)
        return self
# TCDBReader
class TCDBReader(CDBReader):
    '''
    CDBReader variant for hierarchical keys, where every lookup is made
    relative to the file position of a parent record (0 for the top level).
    '''

    def lookup(self, seq, parent=0):
        '''
        Look up the keys of `seq` one level after another and return the
        list of values found.  Raises KeyError on the first missing key.
        '''
        r = []
        for k in seq:
            (v, parent) = self.lookup1(k, parent)
            r.append(v)
        return r

    def lookup1(self, k, parent=0):
        '''
        Look up the key k below `parent` and return (value, position of
        the record).  Raises KeyError if there is no such key.
        '''
        k = str(k)
        if self._docache and (parent,k) in self._cache:
            return self._cache[(parent,k)]
        h = cdbhash(k, parent+5381)
        # Read this bucket's header entry straight from the file.
        self._fp.seek((h % 256) << 3)
        (pos_bucket, ncells) = unpack('<II', self._fp.read(8))
        if ncells == 0: raise KeyError(k)
        start = (h >> 8) % ncells
        for i in xrange(ncells):
            self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
            (h1, p1) = unpack('<II', self._fp.read(8))
            if p1 == 0: raise KeyError(k)
            if h1 == h:
                self._fp.seek(p1)
                (klen, vlen) = unpack('<II', self._fp.read(8))
                k1 = self._fp.read(klen)
                if k1 == k:
                    v1 = self._fp.read(vlen)
                    if self._docache:
                        self._cache[(parent,k)] = (v1,p1)
                    return (v1,p1)
        raise KeyError(k)

    def iterkeys(self):
        return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
    def itervalues(self):
        return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
    def iteritems(self):
        return tcdbiter(self._fp, self._eod)
# tcdbdump
def tcdbdump(cdbname):
    '''
    Open the tcdb file `cdbname` and return an iterator over its
    (key path, value) records.  The end of the record area is taken from
    the first 32-bit word of the file.
    '''
    fp = open(cdbname, 'rb')
    (eor,) = unpack('<I', fp.read(4))
    return tcdbiter(fp, eor)
# aliases
@ -414,64 +414,64 @@ tcdbmerge = cdbmerge
# main
def main(argv):
import getopt, fileinput
def usage():
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
return 100
args = argv[1:]
if not args: return usage()
cmd = args.pop(0)
try:
(opts, args) = getopt.getopt(args, 'kv2')
except getopt.GetoptError:
return usage()
if not args: return usage()
dbname = args.pop(0)
import getopt, fileinput
def usage():
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
return 100
args = argv[1:]
if not args: return usage()
cmd = args.pop(0)
try:
(opts, args) = getopt.getopt(args, 'kv2')
except getopt.GetoptError:
return usage()
if not args: return usage()
dbname = args.pop(0)
# cdb
if cmd == 'cmake':
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
elif cmd == 'cget':
print repr(CDBReader(dbname).get(args[0]))
elif cmd == 'cdump':
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: k)
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: k+'\t'+v)
for (k,v) in cdbdump(dbname):
print f(k,v)
print
elif cmd == 'cmerge':
dbs = [ cdbdump(fname) for fname in args ]
m = CDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.add(k, ' '.join(vs))
m.finish()
# tcdb
elif cmd == 'tmake':
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
elif cmd == 'tget':
print repr(TCDBReader(dbname).lookup(args))
elif cmd == 'tdump':
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: '/'.join(k))
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
for (k,v) in tcdbdump(dbname):
print f(k,v)
print
elif cmd == 'tmerge':
dbs = [ tcdbdump(fname) for fname in args ]
m = TCDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.put(len(k), k[-1], ' '.join(vs))
m.finish()
# cdb
if cmd == 'cmake':
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
elif cmd == 'cget':
print repr(CDBReader(dbname).get(args[0]))
elif cmd == 'cdump':
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: k)
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: k+'\t'+v)
for (k,v) in cdbdump(dbname):
print f(k,v)
print
elif cmd == 'cmerge':
dbs = [ cdbdump(fname) for fname in args ]
m = CDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.add(k, ' '.join(vs))
m.finish()
# tcdb
elif cmd == 'tmake':
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
elif cmd == 'tget':
print repr(TCDBReader(dbname).lookup(args))
elif cmd == 'tdump':
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
for (k, v) in opts:
if k == '-k': f = (lambda k,_: '/'.join(k))
elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
for (k,v) in tcdbdump(dbname):
print f(k,v)
print
elif cmd == 'tmerge':
dbs = [ tcdbdump(fname) for fname in args ]
m = TCDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs):
m.put(len(k), k[-1], ' '.join(vs))
m.finish()
else:
return usage()
return
else:
return usage()
return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -691,88 +691,88 @@ rcon = [
]
# Pick the struct format for a big-endian unsigned 32-bit word,
# depending on the native size of 'L' on this platform.
if len(pack('L',0)) == 4:
    # 32bit
    _U32_FORMAT = '>L'
else:
    # 64bit
    _U32_FORMAT = '>I'

def GETU32(x):
    '''Decodes a 4-byte big-endian string into an unsigned integer.'''
    return unpack(_U32_FORMAT, x)[0]

def PUTU32(x):
    '''Encodes an unsigned integer as a 4-byte big-endian string.'''
    return pack(_U32_FORMAT, x)
# Expand the cipher key into the encryption key schedule.
#
# @return the number of rounds for the given cipher key size.
def rijndaelSetupEncrypt(key, keybits):
    '''Expands the cipher key into the encryption key schedule.

    key is the raw key string and keybits is 128, 192 or 256.
    Returns (rk, nrounds): the list of round key words and the number
    of rounds (10, 12 or 14). Raises ValueError for any other keybits.
    '''
    def rot_sub(w):
        # Te4 substitution of each byte of w, with the bytes rotated
        # left by one position.
        return ((Te4[(w >> 16) & 0xff] & 0xff000000) ^
                (Te4[(w >>  8) & 0xff] & 0x00ff0000) ^
                (Te4[(w      ) & 0xff] & 0x0000ff00) ^
                (Te4[(w >> 24)       ] & 0x000000ff))

    def sub(w):
        # Te4 substitution of each byte of w, bytes kept in place.
        return ((Te4[(w >> 24)       ] & 0xff000000) ^
                (Te4[(w >> 16) & 0xff] & 0x00ff0000) ^
                (Te4[(w >>  8) & 0xff] & 0x0000ff00) ^
                (Te4[(w      ) & 0xff] & 0x000000ff))

    i = p = 0
    rk = [0]*RKLENGTH(keybits)
    for n in (0, 1, 2, 3):
        rk[n] = GETU32(key[4*n:4*n+4])
    if keybits == 128:
        while 1:
            rk[p+4] = rk[p+0] ^ rot_sub(rk[p+3]) ^ rcon[i]
            for n in (5, 6, 7):
                rk[p+n] = rk[p+n-4] ^ rk[p+n-1]
            i += 1
            if i == 10:
                return (rk, 10)
            p += 4

    for n in (4, 5):
        rk[n] = GETU32(key[4*n:4*n+4])
    if keybits == 192:
        while 1:
            rk[p+6] = rk[p+0] ^ rot_sub(rk[p+5]) ^ rcon[i]
            for n in (7, 8, 9):
                rk[p+n] = rk[p+n-6] ^ rk[p+n-1]
            i += 1
            if i == 8:
                return (rk, 12)
            for n in (10, 11):
                rk[p+n] = rk[p+n-6] ^ rk[p+n-1]
            p += 6

    for n in (6, 7):
        rk[n] = GETU32(key[4*n:4*n+4])
    if keybits == 256:
        while 1:
            rk[p+8] = rk[p+0] ^ rot_sub(rk[p+7]) ^ rcon[i]
            for n in (9, 10, 11):
                rk[p+n] = rk[p+n-8] ^ rk[p+n-1]
            i += 1
            if i == 7:
                return (rk, 14)
            # The second half of each 256-bit step substitutes
            # without rotation and without a round constant.
            rk[p+12] = rk[p+4] ^ sub(rk[p+11])
            for n in (13, 14, 15):
                rk[p+n] = rk[p+n-8] ^ rk[p+n-1]
            p += 8

    raise ValueError(keybits)
# Expand the cipher key into the decryption key schedule.
@ -780,291 +780,291 @@ def rijndaelSetupEncrypt(key, keybits):
# @return the number of rounds for the given cipher key size.
def rijndaelSetupDecrypt(key, keybits):
    '''Expands the cipher key into the decryption key schedule.

    Returns (rk, nrounds), like rijndaelSetupEncrypt, but with the round
    keys in reverse order and the inverse MixColumn transform applied to
    every round key except the first and the last.
    '''
    # expand the cipher key:
    (rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
    # invert the order of the round keys (four words per round key):
    lo = 0
    hi = 4*nrounds
    while lo < hi:
        for n in (0, 1, 2, 3):
            (rk[lo+n], rk[hi+n]) = (rk[hi+n], rk[lo+n])
        lo += 4
        hi -= 4
    # apply the inverse MixColumn transform to all round keys
    # but the first and the last:
    for r in xrange(1, nrounds):
        base = 4*r
        for n in (0, 1, 2, 3):
            w = rk[base+n]
            rk[base+n] = (
                Td0[Te4[(w >> 24)       ] & 0xff] ^
                Td1[Te4[(w >> 16) & 0xff] & 0xff] ^
                Td2[Te4[(w >>  8) & 0xff] & 0xff] ^
                Td3[Te4[(w      ) & 0xff] & 0xff])
    return (rk, nrounds)
def rijndaelEncrypt(rk, nrounds, plaintext):
    '''Encrypts one 16-byte block.

    rk and nrounds are the key schedule and round count as returned by
    rijndaelSetupEncrypt. Returns the 16-byte ciphertext string.
    '''
    assert len(plaintext) == 16
    cols = (0, 1, 2, 3)

    def full_round(src, base):
        # One table-driven round: column c combines bytes taken from
        # columns c, c+1, c+2, c+3 (mod 4) and the round key at rk[base+c].
        return [ Te0[(src[c] >> 24)] ^
                 Te1[(src[(c+1) % 4] >> 16) & 0xff] ^
                 Te2[(src[(c+2) % 4] >>  8) & 0xff] ^
                 Te3[(src[(c+3) % 4]      ) & 0xff] ^
                 rk[base+c]
                 for c in cols ]

    # map byte array block to cipher state
    # and add initial round key:
    s = [ GETU32(plaintext[4*c:4*c+4]) ^ rk[c] for c in cols ]

    # nrounds - 1 full rounds, processed two at a time:
    r = nrounds >> 1
    p = 0
    while 1:
        t = full_round(s, p+4)
        p += 8
        r -= 1
        if r == 0: break
        s = full_round(t, p)

    # apply last round and
    # map cipher state to byte array block:
    ciphertext = ''
    for c in cols:
        word = ((Te4[(t[c] >> 24)] & 0xff000000) ^
                (Te4[(t[(c+1) % 4] >> 16) & 0xff] & 0x00ff0000) ^
                (Te4[(t[(c+2) % 4] >>  8) & 0xff] & 0x0000ff00) ^
                (Te4[(t[(c+3) % 4]      ) & 0xff] & 0x000000ff) ^
                rk[p+c])
        ciphertext += PUTU32(word)

    assert len(ciphertext) == 16
    return ciphertext
def rijndaelDecrypt(rk, nrounds, ciphertext):
    '''Decrypts one 16-byte block.

    rk and nrounds are the key schedule and round count as returned by
    rijndaelSetupDecrypt. Returns the 16-byte plaintext string.
    '''
    assert len(ciphertext) == 16
    cols = (0, 1, 2, 3)

    def full_round(src, base):
        # One table-driven round: column c combines bytes taken from
        # columns c, c-1, c-2, c-3 (mod 4) and the round key at rk[base+c].
        return [ Td0[(src[c] >> 24)] ^
                 Td1[(src[(c+3) % 4] >> 16) & 0xff] ^
                 Td2[(src[(c+2) % 4] >>  8) & 0xff] ^
                 Td3[(src[(c+1) % 4]      ) & 0xff] ^
                 rk[base+c]
                 for c in cols ]

    # map byte array block to cipher state
    # and add initial round key:
    s = [ GETU32(ciphertext[4*c:4*c+4]) ^ rk[c] for c in cols ]

    # nrounds - 1 full rounds, processed two at a time:
    r = nrounds >> 1
    p = 0
    while 1:
        t = full_round(s, p+4)
        p += 8
        r -= 1
        if r == 0: break
        s = full_round(t, p)

    # apply last round and
    # map cipher state to byte array block:
    plaintext = ''
    for c in cols:
        word = ((Td4[(t[c] >> 24)] & 0xff000000) ^
                (Td4[(t[(c+3) % 4] >> 16) & 0xff] & 0x00ff0000) ^
                (Td4[(t[(c+2) % 4] >>  8) & 0xff] & 0x0000ff00) ^
                (Td4[(t[(c+1) % 4]      ) & 0xff] & 0x000000ff) ^
                rk[p+c])
        plaintext += PUTU32(word)

    assert len(plaintext) == 16
    return plaintext
# decrypt(key, fin, fout, keybits=256)
class RijndaelDecryptor(object):
    '''Decrypts 16-byte blocks with a fixed key.

    The decryption key schedule is computed once in the constructor.
    '''

    def __init__(self, key, keybits=256):
        expected_keylen = KEYLENGTH(keybits)
        assert len(key) == expected_keylen
        schedule = rijndaelSetupDecrypt(key, keybits)
        (self.rk, self.nrounds) = schedule
        # Sanity checks on the schedule against the size helpers.
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)
        return

    def decrypt(self, ciphertext):
        '''Returns the plaintext for one 16-byte ciphertext block.'''
        assert len(ciphertext) == 16
        return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
# encrypt(key, fin, fout, keybits=256)
class RijndaelEncryptor(object):
    '''Encrypts 16-byte blocks with a fixed key.

    The encryption key schedule is computed once in the constructor.
    '''

    def __init__(self, key, keybits=256):
        expected_keylen = KEYLENGTH(keybits)
        assert len(key) == expected_keylen
        schedule = rijndaelSetupEncrypt(key, keybits)
        (self.rk, self.nrounds) = schedule
        # Sanity checks on the schedule against the size helpers.
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)
        return

    def encrypt(self, plaintext):
        '''Returns the ciphertext for one 16-byte plaintext block.'''
        assert len(plaintext) == 16
        return rijndaelEncrypt(self.rk, self.nrounds, plaintext)
def main(argv):
    '''Self-test: encrypts and decrypts one known 128-bit test vector.

    Returns 0 when both directions match; an AssertionError is raised
    otherwise. argv is not used.
    '''
    # test
    key = '00010203050607080A0B0C0D0F101112'.decode('hex')
    plaintext = '506812A45F08C889B97F5980038B8359'.decode('hex')
    ciphertext = 'D8F532538289EF7D06B506A4FD5BE9C9'.decode('hex')
    encrypted = RijndaelEncryptor(key, 128).encrypt(plaintext)
    assert encrypted == ciphertext
    decrypted = RijndaelDecryptor(key, 128).decrypt(ciphertext)
    assert decrypted == plaintext
    return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -7,21 +7,21 @@ from struct import unpack
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-tuple (a,b,c,d,e,f). Under the convention used by
    apply_matrix_pt, the result is equivalent to applying m1 first and
    then m0.
    '''
    # Unpacked in the body rather than in the signature: tuple parameters
    # are not valid syntax in Python 3.
    (a1,b1,c1,d1,e1,f1) = m1
    (a0,b0,c0,d0,e0,f0) = m0
    return (a0*a1+c0*b1,    b0*a1+d0*b1,
            a0*c1+c0*d1,    b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Returns matrix m with its translation part shifted by v.

    m is a 6-tuple (a,b,c,d,e,f) and v is a pair (x,y). The offset is
    transformed by the linear part of m before being added to (e,f).
    '''
    # Unpacked in the body rather than in the signature: tuple parameters
    # are not valid syntax in Python 3.
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt(m, v):
    '''Applies a matrix to a point.

    m is a 6-tuple (a,b,c,d,e,f) and v is a point (x,y).
    Returns the transformed point (a*x+c*y+e, b*x+d*y+f).
    '''
    # Unpacked in the body rather than in the signature: tuple parameters
    # are not valid syntax in Python 3.
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    Only the linear part (a,b,c,d) of the matrix is applied; the
    translation part (e,f) is ignored.
    '''
    # Unpacked in the body rather than in the signature: tuple parameters
    # are not valid syntax in Python 3.
    (a,b,c,d,e,f) = m
    (p,q) = v
    return (a*p+c*q, b*p+d*q)
## Utility functions
@ -29,62 +29,62 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
# pick
def pick(seq, func, maxobj=None):
    '''Picks the object that has the highest value of func(obj).

    Returns maxobj unchanged when seq is empty. When several objects
    share the highest score, the first one encountered is returned.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # Identity test: a score that overrides == must not be mistaken
        # for "no score seen yet".
        if maxscore is None or maxscore < score:
            (maxscore,maxobj) = (score,obj)
    return maxobj
# bsearch
def bsearch(objs, v0):
    '''Tries to find the closest value to v0.

    objs is a list of (value, object) pairs sorted by value.
    Returns a pair of indices (i0,i1) such that objs[i0:i1] are exactly
    the entries whose value equals v0. When no entry matches, i0 == i1
    and both give the position where v0 would be inserted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # Floor division keeps the index an integer on every Python version.
        i = (i0+i1)//2
        (v, obj) = objs[i]
        if v0 == v:
            # Found one match: widen the range over all equal neighbours.
            (i0,i1) = (i,i+1)
            while 0 < i0 and objs[i0-1][0] == v0:
                i0 -= 1
            # i1 is exclusive, so it may grow up to len(objs); stopping at
            # len(objs)-1 would drop a matching last element.
            while i1 < len(objs) and objs[i1][0] == v0:
                i1 += 1
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i+1
    return (i0,i1)
# choplist
def choplist(n, seq):
    '''Groups every n elements of the list.

    Yields tuples of n consecutive elements of seq. Trailing elements
    that do not fill a complete group are dropped.
    '''
    chunk = []
    for item in seq:
        chunk.append(item)
        if len(chunk) != n:
            continue
        yield tuple(chunk)
        chunk = []
# nunpack
def nunpack(s, default=0):
    '''Unpacks up to 4 bytes big endian.

    Returns default for an empty string, otherwise the unsigned integer
    encoded by s. Raises TypeError when s is longer than 4 bytes.
    '''
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return unpack('>H', s)[0]
    elif l == 3:
        # Pad to four bytes so the value can be read as one 32-bit word.
        return unpack('>L', '\x00'+s)[0]
    elif l == 4:
        return unpack('>L', s)[0]
    else:
        # The exception must be raised; returning the exception object
        # would hand callers a TypeError instance in place of a number.
        raise TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
@ -122,14 +122,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
    '''Decodes a PDFDocEncoding string to Unicode.

    A string starting with the byte order mark FE FF is decoded as
    UTF-16BE (undecodable input is ignored); any other string is mapped
    byte by byte through the PDFDocEncoding table.
    '''
    if not s.startswith('\xfe\xff'):
        return ''.join( PDFDocEncoding[ord(c)] for c in s )
    return unicode(s[2:], 'utf-16be', 'ignore')
# enc
def enc(x, codec='ascii'):
    '''Encodes a string for SGML/XML/HTML

    The characters & > < " are replaced by entity references, then the
    result is encoded with codec; characters the codec cannot represent
    become numeric character references.
    '''
    # '&' must be replaced first so the entities added later stay intact.
    entities = (('&','&amp;'), ('>','&gt;'), ('<','&lt;'), ('"','&quot;'))
    for (char, entity) in entities:
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')

View File

@ -3,10 +3,10 @@ from distutils.core import setup
from pdfminer import __version__
setup(
name='pdfminer',
version=__version__,
description='PDF parser and analyzer',
long_description='''PDFMiner is a suite of programs that help
name='pdfminer',
version=__version__,
description='PDF parser and analyzer',
long_description='''PDFMiner is a suite of programs that help
extracting and analyzing text data of PDF documents.
Unlike other PDF-related tools, it allows to obtain
the exact location of texts in a page, as well as
@ -14,23 +14,23 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''',
license='MIT/X',
author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[
license='MIT/X',
author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[
'pdfminer'
],
scripts=[
],
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py'
],
keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[
keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
],
)
],
)

View File

@ -5,38 +5,38 @@ stdout = sys.stdout
stderr = sys.stderr
def main(argv):
fonts = {}
for line in fileinput.input():
f = line.strip().split(' ')
if not f: continue
k = f[0]
if k == 'FontName':
fontname = f[1]
props = {'FontName': fontname, 'Flags': 0}
chars = {}
fonts[fontname] = (props, chars)
elif k == 'C':
cid = int(f[1])
if 0 <= cid and cid <= 255:
width = int(f[4])
chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
'Ascender', 'Descender'):
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'):
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
props[k] = f[1]
elif k == 'IsFixedPitch':
if f[1].lower() == 'true':
props['Flags'] = 64
elif k == 'FontBBox':
props[k] = tuple(map(float, f[1:5]))
print '# -*- python -*-'
print 'FONT_METRICS = {'
for (fontname,(props,chars)) in fonts.iteritems():
print ' %r: %r,' % (fontname, (props,chars))
print '}'
return 0
fonts = {}
for line in fileinput.input():
f = line.strip().split(' ')
if not f: continue
k = f[0]
if k == 'FontName':
fontname = f[1]
props = {'FontName': fontname, 'Flags': 0}
chars = {}
fonts[fontname] = (props, chars)
elif k == 'C':
cid = int(f[1])
if 0 <= cid and cid <= 255:
width = int(f[4])
chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
'Ascender', 'Descender'):
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'):
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
props[k] = f[1]
elif k == 'IsFixedPitch':
if f[1].lower() == 'true':
props['Flags'] = 64
elif k == 'FontBBox':
props[k] = tuple(map(float, f[1:5]))
print '# -*- python -*-'
print 'FONT_METRICS = {'
for (fontname,(props,chars)) in fonts.iteritems():
print ' %r: %r,' % (fontname, (props,chars))
print '}'
return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -13,173 +13,173 @@ from pdfminer.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolv
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
def esc(s):
    '''Replaces every character matched by ESC_PAT with a decimal
    character reference of the form &#NNN;.'''
    def charref(m):
        return '&#%d;' % ord(m.group(0))
    return ESC_PAT.sub(charref, s)
# dumpxml
def dumpxml(out, obj, codec=None):
    '''Writes an XML representation of obj to out.

    Handles dicts, lists, strings, PDFStream, PDFObjRef, PSKeyword,
    PSLiteral and int/float numbers; dicts and lists are dumped
    recursively. Stream data is included only when codec is 'text'.
    Raises TypeError for any other object.
    '''
    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (key,value) in obj.iteritems():
            out.write('<key>%s</key>\n' % key)
            out.write('<value>')
            dumpxml(out, value)
            out.write('</value>\n')
        out.write('</dict>')
    elif isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for item in obj:
            dumpxml(out, item)
            out.write('\n')
        out.write('</list>')
    elif isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
    elif isinstance(obj, PDFStream):
        out.write('<stream>\n<props>\n')
        dumpxml(out, obj.dic)
        out.write('\n</props>\n')
        if codec == 'text':
            data = obj.get_data()
            out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
        out.write('</stream>')
    elif isinstance(obj, PDFObjRef):
        out.write('<ref id="%d"/>' % obj.objid)
    elif isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
    elif isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
    elif isinstance(obj, (int, float)):
        out.write('<number>%s</number>' % obj)
    else:
        raise TypeError(obj)
    return
# dumptrailers
def dumptrailers(out, doc):
    '''Writes the trailer of every xref in doc.xrefs to out, each
    wrapped in a <trailer> element.'''
    for trailer in ( xref.trailer for xref in doc.xrefs ):
        out.write('<trailer>\n')
        dumpxml(out, trailer)
        out.write('\n</trailer>\n\n')
    return
# dumpallobjs
def dumpallobjs(out, doc, codec=None):
    '''Writes every object of the document, followed by its trailers,
    to out inside a single <pdf> element.

    Object ids are taken from each xref in doc.xrefs; ids for which
    doc.getobj returns None are skipped. codec is passed on to dumpxml.
    Any exception raised while fetching or dumping an object propagates
    to the caller.
    '''
    out.write('<pdf>')
    for xref in doc.xrefs:
        for objid in xref.objids():
            obj = doc.getobj(objid)
            if obj is None: continue
            out.write('<object id="%d">\n' % objid)
            dumpxml(out, obj, codec=codec)
            out.write('\n</object>\n\n')
    dumptrailers(out, doc)
    out.write('</pdf>')
    return
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    '''Writes the outline entries of a PDF file to outfp.

    One line is written per entry: the repr of
    (level, title, dest, pageno), where pageno is the index of the
    destination page in doc.get_pages(), or None when the entry has no
    destination. objids, pagenos, dumpall and codec are not used here.
    # NOTE(review): presumably kept so the signature matches dumppdf - confirm.
    '''
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
            for (level,title,dest,a,se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1( doc.lookup_name('Dests', dest) )
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level,title,dest,pageno))+'\n')
        finally:
            # Release the parser and the file even when dumping fails.
            parser.close()
    finally:
        fp.close()
    return
# dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp*.

    objids:   object ids to dump.  Streams are written as raw data when
              codec is 'raw', as decoded data when codec is 'binary', and
              everything else goes through dumpxml().
    pagenos:  zero-based page indexes whose attributes are dumped.
    dumpall:  when true, dump every object via dumpallobjs().
    password: passed to doc.initialize().
    codec:    stream output mode, forwarded to dumpxml().

    When neither objids, pagenos nor dumpall is given, only the trailers
    are dumped.  A final newline is written unless codec is 'raw' or
    'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        # The parser object is not used again in this function; it is
        # constructed for whatever it sets up on doc.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Release the input file even when dumping fails part-way.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
# main
def main(argv):
    """Command-line entry point of the PDF dumper.

    argv is the full argument vector (program name first).  Returns 100
    after printing the usage message when the options are invalid or no
    input file is given; returns None after processing every file.

    Options: -d raises the debug level, -a dumps all objects, -i and -p
    take comma-separated object ids / one-based page numbers, -P sets the
    password, -r/-b/-t select the stream codec, -T dumps the outline
    instead, and -o writes to a file instead of standard output.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] [-o output] file ...' % argv[0])
        return 100
    try:
        # 'o:' must be listed here, otherwise -o is rejected before the
        # option loop below ever sees it.
        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = 0
    objids = []
    pagenos = set()
    codec = None
    password = ''
    dumpall = False
    proc = dumppdf
    outfile = None
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-i': objids.extend( int(x) for x in v.split(',') )
        # page numbers are one-based on the command line, zero-based inside
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-P': password = v
        elif k == '-a': dumpall = True
        elif k == '-r': codec = 'raw'
        elif k == '-b': codec = 'binary'
        elif k == '-t': codec = 'text'
        elif k == '-T': proc = dumpoutline
        elif k == '-o': outfile = v
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    #
    # Open the output only once all options are parsed, so a repeated -o
    # does not leave an earlier file open.
    if outfile:
        outfp = open(outfile, 'wb')
    else:
        outfp = sys.stdout
    try:
        for fname in args:
            proc(outfp, fname, objids, pagenos, password=password,
                 dumpall=dumpall, codec=codec)
    finally:
        if outfile:
            outfp.close()
    return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -27,16 +27,16 @@ from pdfminer.cmap import CMapDB
# quote HTML metacharacters
def q(x):
    """Return *x* with the HTML metacharacters &, >, < and " replaced by entities."""
    # '&' has to be replaced first so that the entities produced by the
    # later replacements are not escaped a second time.
    for (char, entity) in (('&', '&amp;'), ('>', '&gt;'),
                           ('<', '&lt;'), ('"', '&quot;')):
        x = x.replace(char, entity)
    return x
# encode parameters as a URL
# Matches every character that url() percent-encodes: anything other than
# ASCII letters, digits, '_', '.', '=' and '-'.  The '-' is placed last in
# the class so that it is a literal and not a range operator (".-=" would
# silently allow '/', ':', ';' and '<' as well).
Q = re.compile(r'[^a-zA-Z0-9_.=-]')
def url(base, **kw):
    """Return *base* followed by the keyword arguments as 'k=v' pairs joined by '&'.

    Each value is HTML-quoted with q(), encoded with the module's
    `encoder` (using 'replace' error handling), and then every character
    matched by Q is replaced by its %XX escape.  Keys are emitted as-is.
    """
    r = []
    for (k, v) in kw.items():
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
        r.append('%s=%s' % (k, v))
    return base+'&'.join(r)
## convert
@ -44,156 +44,156 @@ def url(base, **kw):
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than maxfilesize bytes."""


def convert(outfp, infp, path, codec='utf-8', maxpages=10,
            maxfilesize=5000000, pagenos=None, html=True):
    """Save the data read from *infp* to *path*, then convert that file to *outfp*.

    outfp:       output stream handed to the converter device.
    infp:        readable binary stream with the PDF data; it is closed
                 once it has been copied completely.
    path:        location where the input is stored before conversion.
    codec:       output encoding passed to the converter.
    maxpages:    passed to process_pdf().
    maxfilesize: upper bound on the input size in bytes; a false value
                 disables the check.
    pagenos:     passed to process_pdf().
    html:        use HTMLConverter when true, TextConverter otherwise.

    Raises FileSizeExceeded as soon as more than maxfilesize bytes have
    been read; the chunk that crosses the limit is not written.
    """
    # save the input file.
    src = open(path, 'wb')
    try:
        nbytes = 0
        while 1:
            data = infp.read(4096)
            if not data:
                break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    finally:
        # Do not leak the handle when the size limit or a read error
        # aborts the copy.
        src.close()
    infp.close()
    # perform conversion and
    # send the results over the network.
    CMapDB.initialize()
    rsrc = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
    else:
        device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    try:
        process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
    finally:
        fp.close()
    return
## PDF2HTMLApp
##
class PDF2HTMLApp(object):
    """CGI application that converts an uploaded PDF file to HTML or text.

    The request is described by the CGI environment variables read in
    __init__; the response is written to the stream given as *outfp*.
    """

    APPURL = '/convert'       # path that accepts the upload form
    TMPDIR = './var/'         # directory for temporary copies of uploads
    LOGPATH = './var/log'     # default log file
    MAXFILESIZE = 5000000     # upload size limit in bytes
    MAXPAGES = 10             # page limit passed to convert()

    def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
        """Set up logging and capture the request from the CGI environment.

        outfp:    stream the HTTP response is written to.
        logpath:  file that log records are appended to.
        loglevel: logging threshold.
        codec:    encoding used for the response.
        """
        self.outfp = outfp
        self.codec = codec
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                            level=loglevel, filename=logpath, filemode='a')
        self.remote_addr = os.environ.get('REMOTE_ADDR')
        self.path_info = os.environ.get('PATH_INFO')
        self.method = os.environ.get('REQUEST_METHOD', 'GET')
        self.server = os.environ.get('SERVER_SOFTWARE', '')
        self.content_type = 'text/html; charset=%s' % codec
        self.cur_time = time.time()
        self.form = cgi.FieldStorage()
        return

    def put(self, *args):
        """Write each argument to the output stream.

        Byte strings are written unchanged; unicode strings are encoded
        with self.codec, using character references for anything the codec
        cannot represent.  Arguments of any other type are ignored.
        """
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def http_200(self):
        """Send the headers of a successful response with self.content_type."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def http_404(self):
        """Send a complete "page does not exist" response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def http_301(self, url):
        """Send a redirect to *url*."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def bummer(self, msg):
        """Log *msg* and send a complete internal-error response.

        Called by run() when the temporary directory is missing.
        """
        logging.error('bummer: %s' % msg)
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 500 Internal Server Error\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>%s</body></html>\n' % q(msg))
        return

    def coverpage(self):
        """Write the HTML page containing the upload form."""
        self.put(
            '<html><head><title>pdf2html demo</title></head><body>\n',
            '<h1>pdf2html demo</h1><hr>\n',
            '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" name="c" value="Convert to HTML">\n',
            '<input type="submit" name="c" value="Convert to TEXT">\n',
            '<input type="reset" value="Reset">\n',
            '</form><hr>\n',
            '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
            '</body></html>\n',
            )
        return

    def run(self, argv):
        """Handle the current request.

        '/' shows the upload form, APPURL performs a conversion, and any
        other path yields a 404 response.  A conversion request lacking
        the file field 'f' or the command field 'c' is redirected to '/'.
        argv is accepted but not used.
        """
        if self.path_info == '/':
            self.http_200()
            self.coverpage()
            return
        if self.path_info != self.APPURL:
            self.http_404()
            return
        if not os.path.isdir(self.TMPDIR):
            self.bummer('error')
            return
        if 'f' not in self.form or 'c' not in self.form:
            self.http_301('/')
            return
        item = self.form['f']
        if not (item.file and item.filename):
            self.http_301('/')
            return
        cmd = self.form.getvalue('c')
        html = (cmd == 'Convert to HTML')
        pagenos = []
        if 'p' in self.form:
            for m in re.finditer(r'\d+', self.form.getvalue('p')):
                try:
                    pagenos.append(int(m.group(0)))
                except ValueError:
                    pass
        logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
        # Temporary file name built from the request time and a random hash.
        # The timestamp is a float and %x needs an integer.
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (int(self.cur_time), h))
        try:
            try:
                if not html:
                    self.content_type = 'text/plain; charset=%s' % self.codec
                self.http_200()
                # Send the result to the same stream as the headers.
                convert(self.outfp, item.file, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception:
                # sys.exc_info() keeps this valid on every Python version.
                e = sys.exc_info()[1]
                self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
                logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
        finally:
            # Best-effort cleanup: the file may never have been created.
            try:
                os.remove(tmppath)
            except OSError:
                pass
        return
# main

View File

@ -9,85 +9,85 @@ from pdfminer.layout import LAParams
# main
def main(argv):
    """Command-line entry point of the PDF-to-text converter.

    argv is the full argument vector (program name first).  Returns 100
    after printing the usage message when the options are invalid, no
    input file is given, or the output type is unknown; returns None
    after converting every file.

    When -t is not given, the output type is 'text' unless the -o file
    name ends in .htm/.html, .sgml or .tag.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-c codec] '
               '[-C cmapdir] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-s scale] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
        return 100
    try:
        # Every option handled in the loop below must be listed here,
        # including 's:' for the scale.
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:m:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # path option
    cmapdir = find_cmap_path()
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-C': cmapdir = v
        elif k == '-P': password = v
        # page numbers are one-based on the command line, zero-based inside
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-o': outfile = v
        elif k == '-s': scale = float(v)
        elif k == '-D': laparams.direction = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    CMapDB.initialize(cmapdir)
    rsrc = PDFResourceManager()
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.sgml'):
                outtype = 'sgml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'sgml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
            finally:
                fp.close()
        device.close()
    finally:
        # Only close what this function opened; never close sys.stdout.
        if outfile:
            outfp.close()
    return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -2,29 +2,29 @@
import sys
def prof_main(argv):
    """Profile a function with hotshot, or print a previously saved profile.

    argv[1] names the target as 'module.function'.  When further
    arguments follow, the function is called with [argv[0]] + those
    arguments under the profiler and the result is saved to
    '<module.function>.prof'.  Without further arguments, that file is
    loaded and its statistics are printed, sorted by time and call count.

    Returns 100 after printing the usage message when the target is
    missing or is not of the form 'module.function'; otherwise None.
    """
    def usage():
        print('usage: %s module.function [args ...]' % argv[0])
        return 100
    args = argv[1:]
    if len(args) < 1:
        return usage()
    name = args.pop(0)
    # Both a module part and a function part are needed.
    i = name.rfind('.')
    if i <= 0 or i == len(name)-1:
        return usage()
    (modname, funcname) = (name[:i], name[i+1:])
    profpath = name+'.prof'
    import hotshot
    import hotshot.stats
    # A non-empty fromlist makes __import__ return the named (sub)module
    # itself rather than the top-level package.
    module = __import__(modname, fromlist=[funcname])
    func = getattr(module, funcname)
    if args:
        args.insert(0, argv[0])
        prof = hotshot.Profile(profpath)
        try:
            prof.runcall(lambda : func(args))
        finally:
            prof.close()
    else:
        stats = hotshot.stats.load(profpath)
        stats.strip_dirs()
        stats.sort_stats('time', 'calls')
        stats.print_stats(1000)
    return
if __name__ == '__main__': sys.exit(prof_main(sys.argv))