to 4-space indentation

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-24 04:41:59 +00:00
parent a09b71d89d
commit 7790808560
24 changed files with 4953 additions and 4953 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Oct 24 12:42:25 JST 2009 Last Modified: Sat Oct 24 13:40:19 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -352,7 +352,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/10/24: Charspace bug fixed. <li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik. <li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries. <li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
<li> 2009/08/30: Fixed page rotation handling. <li> 2009/08/30: Fixed page rotation handling.

View File

@ -9,36 +9,36 @@
## ##
class Arcfour(object): class Arcfour(object):
def __init__(self, key): def __init__(self, key):
s = range(256) s = range(256)
j = 0 j = 0
klen = len(key) klen = len(key)
for i in xrange(256): for i in xrange(256):
j = (j + s[i] + ord(key[i % klen])) % 256 j = (j + s[i] + ord(key[i % klen])) % 256
(s[i], s[j]) = (s[j], s[i]) (s[i], s[j]) = (s[j], s[i])
self.s = s self.s = s
(self.i, self.j) = (0, 0) (self.i, self.j) = (0, 0)
return return
def process(self, data): def process(self, data):
(i, j) = (self.i, self.j) (i, j) = (self.i, self.j)
s = self.s s = self.s
r = '' r = ''
for c in data: for c in data:
i = (i+1) % 256 i = (i+1) % 256
j = (j+s[i]) % 256 j = (j+s[i]) % 256
(s[i], s[j]) = (s[j], s[i]) (s[i], s[j]) = (s[j], s[i])
k = s[(s[i]+s[j]) % 256] k = s[(s[i]+s[j]) % 256]
r += chr(ord(c) ^ k) r += chr(ord(c) ^ k)
(self.i, self.j) = (i, j) (self.i, self.j) = (i, j)
return r return r
# test # test
if __name__ == '__main__': if __name__ == '__main__':
def doit(key, data): def doit(key, data):
cipher = Arcfour(key) cipher = Arcfour(key)
return ''.join( '%02X' % ord(c) for c in cipher.process(data) ) return ''.join( '%02X' % ord(c) for c in cipher.process(data) )
assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3' assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3'
assert doit("Wiki", "pedia") == '1021BF0420' assert doit("Wiki", "pedia") == '1021BF0420'
assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5' assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5'
print 'test succeeded' print 'test succeeded'

View File

@ -6,72 +6,72 @@
# ascii85decode(data) # ascii85decode(data)
def ascii85decode(data): def ascii85decode(data):
import struct import struct
n = b = 0 n = b = 0
out = '' out = ''
for c in data: for c in data:
if '!' <= c and c <= 'u': if '!' <= c and c <= 'u':
n += 1 n += 1
b = b*85+(ord(c)-33) b = b*85+(ord(c)-33)
if n == 5: if n == 5:
out += struct.pack('>L',b) out += struct.pack('>L',b)
n = b = 0 n = b = 0
elif c == 'z': elif c == 'z':
assert n == 0 assert n == 0
out += '\0\0\0\0' out += '\0\0\0\0'
elif c == '~': elif c == '~':
if n: if n:
for _ in range(5-n): for _ in range(5-n):
b = b*85+84 b = b*85+84
out += struct.pack('>L',b)[:n-1] out += struct.pack('>L',b)[:n-1]
break break
return out return out
# asciihexdecode(data) # asciihexdecode(data)
def asciihexdecode(data): def asciihexdecode(data):
""" """
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
ASCIIHexDecode filter produces one byte of binary data. All white-space ASCIIHexDecode filter produces one byte of binary data. All white-space
characters are ignored. A right angle bracket character (>) indicates characters are ignored. A right angle bracket character (>) indicates
EOD. Any other characters will cause an error. If the filter encounters EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit. will behave as if a 0 followed the last digit.
>>> asciihexdecode("61 62 2e6364 65") >>> asciihexdecode("61 62 2e6364 65")
'ab.cde' 'ab.cde'
>>> asciihexdecode("61 62 2e6364 657>") >>> asciihexdecode("61 62 2e6364 657>")
'ab.cdep' 'ab.cdep'
>>> asciihexdecode("7>") >>> asciihexdecode("7>")
'p' 'p'
""" """
import re import re
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE) hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
decode = (lambda hx: chr(int(hx, 16))) decode = (lambda hx: chr(int(hx, 16)))
out = map(decode, hex_re.findall(data)) out = map(decode, hex_re.findall(data))
m = trail_re.search(data) m = trail_re.search(data)
if m: if m:
out.append(decode("%c0" % m.group(1))) out.append(decode("%c0" % m.group(1)))
return ''.join(out) return ''.join(out)
# test # test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 # sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__': if __name__ == '__main__':
orig = r''' orig = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!, 9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~> >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
''' '''
data = \ data = \
'Man is distinguished, not only by his reason, but by this singular passion from '\ 'Man is distinguished, not only by his reason, but by this singular passion from '\
'other animals, which is a lust of the mind, that by a perseverance of delight in the '\ 'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\ 'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
'any carnal pleasure.' 'any carnal pleasure.'
assert ascii85decode(orig) == data assert ascii85decode(orig) == data
print 'ascii85decode test succeeded' print 'ascii85decode test succeeded'
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -10,9 +10,9 @@ from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \ PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser PSStackParser
try: try:
import cdb import cdb
except ImportError: except ImportError:
import pdfminer.pycdb as cdb import pdfminer.pycdb as cdb
class CMapError(Exception): pass class CMapError(Exception): pass
@ -21,449 +21,449 @@ class CMapError(Exception): pass
## find_cmap_path ## find_cmap_path
## ##
def find_cmap_path(): def find_cmap_path():
try: try:
return os.environ['CMAP_PATH'] return os.environ['CMAP_PATH']
except KeyError: except KeyError:
pass pass
basedir = os.path.dirname(__file__) basedir = os.path.dirname(__file__)
return os.path.join(basedir, 'CMap') return os.path.join(basedir, 'CMap')
STRIP_NAME = re.compile(r'[0-9]+') STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name): def name2unicode(name):
if name in charname2unicode: if name in charname2unicode:
return charname2unicode[name] return charname2unicode[name]
m = STRIP_NAME.search(name) m = STRIP_NAME.search(name)
if not m: raise KeyError(name) if not m: raise KeyError(name)
return int(m.group(0)) return int(m.group(0))
## CMap ## CMap
## ##
class CMap(object): class CMap(object):
debug = 0 debug = 0
def __init__(self): def __init__(self):
self.code2cid = {} self.code2cid = {}
self.cid2code = {} self.cid2code = {}
self.attrs = {} self.attrs = {}
return return
def __repr__(self): def __repr__(self):
return '<CMap: %s>' % self.attrs.get('CMapName') return '<CMap: %s>' % self.attrs.get('CMapName')
def update(self, code2cid=None, cid2code=None): def update(self, code2cid=None, cid2code=None):
if code2cid: if code2cid:
self.code2cid.update(code2cid) self.code2cid.update(code2cid)
if cid2code: if cid2code:
self.cid2code.update(cid2code) self.cid2code.update(cid2code)
return self return self
def copycmap(self, cmap): def copycmap(self, cmap):
self.code2cid.update(cmap.getall_code2cid()) self.code2cid.update(cmap.getall_code2cid())
self.cid2code.update(cmap.getall_cid2code()) self.cid2code.update(cmap.getall_cid2code())
return self return self
def register_code2cid(self, code, cid): def register_code2cid(self, code, cid):
if isinstance(code, str) and isinstance(cid, int): if isinstance(code, str) and isinstance(cid, int):
self.code2cid[code] = cid self.code2cid[code] = cid
return self return self
def register_cid2code(self, cid, code): def register_cid2code(self, cid, code):
if isinstance(cid, int): if isinstance(cid, int):
if isinstance(code, PSLiteral): if isinstance(code, PSLiteral):
self.cid2code[cid] = pack('>H', name2unicode(code.name)) self.cid2code[cid] = pack('>H', name2unicode(code.name))
elif isinstance(code, str): elif isinstance(code, str):
self.cid2code[cid] = code self.cid2code[cid] = code
return self return self
def decode(self, bytes): def decode(self, bytes):
if self.debug: if self.debug:
print >>stderr, 'decode: %r, %r' % (self, bytes) print >>stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
x = '' x = ''
elif c in self.code2cid: for c in bytes:
yield self.code2cid[c] if x:
else: if x+c in self.code2cid:
x = c yield self.code2cid[x+c]
return x = ''
elif c in self.code2cid:
yield self.code2cid[c]
else:
x = c
return
def is_vertical(self): def is_vertical(self):
return self.attrs.get('WMode', 0) return self.attrs.get('WMode', 0)
def tocid(self, code): def tocid(self, code):
return self.code2cid.get(code) return self.code2cid.get(code)
def tocode(self, cid): def tocode(self, cid):
return self.cid2code.get(cid) return self.cid2code.get(cid)
def getall_attrs(self): def getall_attrs(self):
return self.attrs.iteritems() return self.attrs.iteritems()
def getall_code2cid(self): def getall_code2cid(self):
return self.code2cid.iteritems() return self.code2cid.iteritems()
def getall_cid2code(self): def getall_cid2code(self):
return self.cid2code.iteritems() return self.cid2code.iteritems()
## CDBCMap ## CDBCMap
## ##
class CDBCMap(CMap): class CDBCMap(CMap):
def __init__(self, cdbname): def __init__(self, cdbname):
CMap.__init__(self) CMap.__init__(self)
self.cdbname = cdbname self.cdbname = cdbname
self.db = cdb.init(cdbname) self.db = cdb.init(cdbname)
return return
def __repr__(self): def __repr__(self):
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname) return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
def tocid(self, code): def tocid(self, code):
k = 'c'+code k = 'c'+code
if not self.db.has_key(k): if not self.db.has_key(k):
return None return None
return unpack('>L', self.db[k]) return unpack('>L', self.db[k])
def tocode(self, cid): def tocode(self, cid):
k = 'i'+pack('>L', cid) k = 'i'+pack('>L', cid)
if not self.db.has_key(k): if not self.db.has_key(k):
return None return None
return self.db[k] return self.db[k]
def is_vertical(self): def is_vertical(self):
return (self.db.has_key('/WMode') and return (self.db.has_key('/WMode') and
self.db['/WMode'] == '1') self.db['/WMode'] == '1')
def getall(self, c): def getall(self, c):
while 1: while 1:
x = self.db.each() x = self.db.each()
if not x: break if not x: break
(k,v) = x (k,v) = x
if k.startswith(c): if k.startswith(c):
yield (k[1:], unpack('>L', v)[0]) yield (k[1:], unpack('>L', v)[0])
return return
def getall_attrs(self): def getall_attrs(self):
while 1: while 1:
x = self.db.each() x = self.db.each()
if not x: break if not x: break
(k,v) = x (k,v) = x
if k.startswith('/'): if k.startswith('/'):
yield (k[1:], eval(v)[0]) yield (k[1:], eval(v)[0])
return return
def getall_cid2code(self): def getall_cid2code(self):
return self.getall('i') return self.getall('i')
def getall_code2cid(self): def getall_code2cid(self):
return self.getall('c') return self.getall('c')
def decode(self, bytes): def decode(self, bytes):
if self.debug: if self.debug:
print >>stderr, 'decode: %r, %r' % (self, bytes) print >>stderr, 'decode: %r, %r' % (self, bytes)
x = ''
for c in bytes:
if x:
if x+c in self.code2cid:
yield self.code2cid[x+c]
elif self.db.has_key('c'+x+c):
(dest,) = unpack('>L', self.db['c'+x+c])
self.code2cid[x+c] = dest
yield dest
x = '' x = ''
elif c in self.code2cid: for c in bytes:
yield self.code2cid[c] if x:
elif self.db.has_key('c'+c): if x+c in self.code2cid:
(dest,) = unpack('>L', self.db['c'+c]) yield self.code2cid[x+c]
self.code2cid[c] = dest elif self.db.has_key('c'+x+c):
yield dest (dest,) = unpack('>L', self.db['c'+x+c])
else: self.code2cid[x+c] = dest
x = c yield dest
return x = ''
elif c in self.code2cid:
yield self.code2cid[c]
elif self.db.has_key('c'+c):
(dest,) = unpack('>L', self.db['c'+c])
self.code2cid[c] = dest
yield dest
else:
x = c
return
## CMapDB ## CMapDB
## ##
class CMapDB(object): class CMapDB(object):
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError): pass
CMAP_ALIAS = { CMAP_ALIAS = {
} }
debug = 0 debug = 0
dirname = None dirname = None
cdbdirname = None cdbdirname = None
cmapdb = {} cmapdb = {}
@classmethod @classmethod
def initialize(klass, dirname=None, cdbdirname=None): def initialize(klass, dirname=None, cdbdirname=None):
if not dirname: if not dirname:
dirname = find_cmap_path() dirname = find_cmap_path()
klass.dirname = dirname klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname klass.cdbdirname = cdbdirname or dirname
return return
@classmethod @classmethod
def get_cmap(klass, cmapname, strict=True): def get_cmap(klass, cmapname, strict=True):
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
if cmapname in klass.cmapdb: if cmapname in klass.cmapdb:
cmap = klass.cmapdb[cmapname] cmap = klass.cmapdb[cmapname]
else: else:
fname = os.path.join(klass.dirname, cmapname) fname = os.path.join(klass.dirname, cmapname)
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
if os.path.exists(cdbname): if os.path.exists(cdbname):
if 1 <= klass.debug: if 1 <= klass.debug:
print >>stderr, 'Opening: CDBCMap %r...' % cdbname print >>stderr, 'Opening: CDBCMap %r...' % cdbname
cmap = CDBCMap(cdbname) cmap = CDBCMap(cdbname)
elif os.path.exists(fname): elif os.path.exists(fname):
if 1 <= klass.debug: if 1 <= klass.debug:
print >>stderr, 'Reading: CMap %r...' % fname print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap() cmap = CMap()
fp = file(fname, 'rb') fp = file(fname, 'rb')
CMapParser(cmap, fp).run() CMapParser(cmap, fp).run()
fp.close() fp.close()
elif not strict: elif not strict:
cmap = CMap() # just create empty cmap cmap = CMap() # just create empty cmap
else: else:
raise CMapDB.CMapNotFound(cmapname) raise CMapDB.CMapNotFound(cmapname)
klass.cmapdb[cmapname] = cmap klass.cmapdb[cmapname] = cmap
return cmap return cmap
## CMapParser ## CMapParser
## ##
class CMapParser(PSStackParser): class CMapParser(PSStackParser):
def __init__(self, cmap, fp): def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp) PSStackParser.__init__(self, fp)
self.cmap = cmap self.cmap = cmap
self.in_cmap = False self.in_cmap = False
return return
def run(self): def run(self):
try: try:
self.nextobject() self.nextobject()
except PSEOF: except PSEOF:
pass pass
return return
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
name = token.name name = token.name
if name == 'begincmap': if name == 'begincmap':
self.in_cmap = True self.in_cmap = True
self.popall() self.popall()
return return
elif name == 'endcmap': elif name == 'endcmap':
self.in_cmap = False self.in_cmap = False
return return
if not self.in_cmap: return if not self.in_cmap: return
# #
if name == 'def': if name == 'def':
try: try:
((_,k),(_,v)) = self.pop(2) ((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v self.cmap.attrs[literal_name(k)] = v
except PSSyntaxError: except PSSyntaxError:
pass pass
return return
if name == 'usecmap': if name == 'usecmap':
try: try:
((_,cmapname),) = self.pop(1) ((_,cmapname),) = self.pop(1)
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname))) self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError: except PSSyntaxError:
pass pass
return return
if name == 'begincodespacerange': if name == 'begincodespacerange':
self.popall() self.popall()
return return
if name == 'endcodespacerange': if name == 'endcodespacerange':
self.popall() self.popall()
return return
if name == 'begincidrange': if name == 'begincidrange':
self.popall() self.popall()
return return
if name == 'endcidrange': if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs): for (s,e,cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue not isinstance(cid, int) or len(s) != len(e)): continue
sprefix = s[:-4] sprefix = s[:-4]
eprefix = e[:-4] eprefix = e[:-4]
if sprefix != eprefix: continue if sprefix != eprefix: continue
svar = s[-4:] svar = s[-4:]
evar = e[-4:] evar = e[-4:]
s1 = nunpack(svar) s1 = nunpack(svar)
e1 = nunpack(evar) e1 = nunpack(evar)
vlen = len(svar) vlen = len(svar)
#assert s1 <= e1 #assert s1 <= e1
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:] x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i) self.cmap.register_code2cid(x, cid+i)
return return
if name == 'begincidchar': if name == 'begincidchar':
self.popall() self.popall()
return return
if name == 'endcidchar': if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs): for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str): if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid)) self.cmap.register_code2cid(code, nunpack(cid))
return return
if name == 'beginbfrange': if name == 'beginbfrange':
self.popall() self.popall()
return return
if name == 'endbfrange': if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs): for (s,e,code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue len(s) != len(e)): continue
s1 = nunpack(s) s1 = nunpack(s)
e1 = nunpack(e) e1 = nunpack(e)
#assert s1 <= e1 #assert s1 <= e1
if isinstance(code, list): if isinstance(code, list):
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i]) self.cmap.register_cid2code(s1+i, code[i])
else: else:
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
prefix = code[:-4] prefix = code[:-4]
vlen = len(var) vlen = len(var)
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:] x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x) self.cmap.register_cid2code(s1+i, x)
return return
if name == 'beginbfchar': if name == 'beginbfchar':
self.popall() self.popall()
return return
if name == 'endbfchar': if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ] objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs): for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code) self.cmap.register_cid2code(nunpack(cid), code)
return return
if name == 'beginnotdefrange': if name == 'beginnotdefrange':
self.popall() self.popall()
return return
if name == 'endnotdefrange': if name == 'endnotdefrange':
self.popall() self.popall()
return return
self.push((pos, token)) self.push((pos, token))
return return
## FontMetricsDB ## FontMetricsDB
## ##
class FontMetricsDB(object): class FontMetricsDB(object):
@classmethod @classmethod
def get_metrics(klass, fontname): def get_metrics(klass, fontname):
return FONT_METRICS[fontname] return FONT_METRICS[fontname]
## EncodingDB ## EncodingDB
## ##
class EncodingDB(object): class EncodingDB(object):
std2unicode = {} std2unicode = {}
mac2unicode = {} mac2unicode = {}
win2unicode = {} win2unicode = {}
pdf2unicode = {} pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING: for (name,std,mac,win,pdf) in ENCODING:
c = unichr(name2unicode(name)) c = unichr(name2unicode(name))
if std: std2unicode[std] = c if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c if pdf: pdf2unicode[pdf] = c
encodings = { encodings = {
'StandardEncoding': std2unicode, 'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode, 'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode, 'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode, 'PDFDocEncoding': pdf2unicode,
} }
@classmethod @classmethod
def get_encoding(klass, name, diff=None): def get_encoding(klass, name, diff=None):
cid2unicode = klass.encodings.get(name, klass.std2unicode) cid2unicode = klass.encodings.get(name, klass.std2unicode)
if diff: if diff:
cid2unicode = cid2unicode.copy() cid2unicode = cid2unicode.copy()
cid = 0 cid = 0
for x in diff: for x in diff:
if isinstance(x, int): if isinstance(x, int):
cid = x cid = x
elif isinstance(x, PSLiteral): elif isinstance(x, PSLiteral):
try: try:
cid2unicode[cid] = unichr(name2unicode(x.name)) cid2unicode[cid] = unichr(name2unicode(x.name))
except KeyError: except KeyError:
pass pass
cid += 1 cid += 1
return cid2unicode return cid2unicode
## CMap -> CMapCDB conversion ## CMap -> CMapCDB conversion
## ##
def dumpcdb(cmap, cdbfile, verbose=1): def dumpcdb(cmap, cdbfile, verbose=1):
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose: if verbose:
print >>stderr, 'Writing: %r...' % cdbfile print >>stderr, 'Writing: %r...' % cdbfile
for (k,v) in cmap.getall_attrs(): for (k,v) in cmap.getall_attrs():
m.add('/'+k, repr(v)) m.add('/'+k, repr(v))
for (code,cid) in cmap.getall_code2cid(): for (code,cid) in cmap.getall_code2cid():
m.add('c'+code, pack('>L',cid)) m.add('c'+code, pack('>L',cid))
for (cid,code) in cmap.getall_cid2code(): for (cid,code) in cmap.getall_cid2code():
m.add('i'+pack('>L',cid), code) m.add('i'+pack('>L',cid), code)
m.finish() m.finish()
return return
def convert_cmap(cmapdir, outputdir, force=False): def convert_cmap(cmapdir, outputdir, force=False):
CMapDB.initialize(cmapdir) CMapDB.initialize(cmapdir)
for fname in os.listdir(cmapdir): for fname in os.listdir(cmapdir):
if '.' in fname: continue if '.' in fname: continue
cmapname = os.path.basename(fname) cmapname = os.path.basename(fname)
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb') cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
if not force and os.path.exists(cdbname): if not force and os.path.exists(cdbname):
print >>stderr, 'Skipping: %r' % cmapname print >>stderr, 'Skipping: %r' % cmapname
continue continue
print >>stderr, 'Reading: %r...' % cmapname print >>stderr, 'Reading: %r...' % cmapname
cmap = CMapDB.get_cmap(cmapname) cmap = CMapDB.get_cmap(cmapname)
dumpcdb(cmap, cdbname) dumpcdb(cmap, cdbname)
return return
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0] print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'C:D:f') (opts, args) = getopt.getopt(argv[1:], 'C:D:f')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if args: if args:
cmapdir = args.pop(0) cmapdir = args.pop(0)
else: else:
cmapdir = find_cmap_path() cmapdir = find_cmap_path()
outputdir = cmapdir outputdir = cmapdir
force = False force = False
for (k, v) in opts: for (k, v) in opts:
if k == '-f': force = True if k == '-f': force = True
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': outputdir = v elif k == '-D': outputdir = v
if not os.path.isdir(cmapdir): if not os.path.isdir(cmapdir):
print >>stderr, 'directory does not exist: %r' % cmapdir print >>stderr, 'directory does not exist: %r' % cmapdir
return 111 return 111
if not os.path.isdir(outputdir): if not os.path.isdir(outputdir):
print >>stderr, 'directory does not exist: %r' % outputdir print >>stderr, 'directory does not exist: %r' % outputdir
return 111 return 111
return convert_cmap(cmapdir, outputdir, force=force) return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -10,298 +10,298 @@ from pdfminer.utils import apply_matrix_pt, mult_matrix, enc
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8'): def __init__(self, rsrc, outfp, codec='utf-8'):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.pageno = 0 self.pageno = 0
self.tag = None self.tag = None
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
font = textstate.font font = textstate.font
text = '' text = ''
for obj in seq: for obj in seq:
if not isinstance(obj, str): continue if not isinstance(obj, str): continue
chars = font.decode(obj) chars = font.decode(obj)
for cid in chars: for cid in chars:
try: try:
char = font.to_unicode(cid) char = font.to_unicode(cid)
text += char text += char
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
pass pass
self.outfp.write(enc(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, bbox, page.rotate)) (self.pageno, bbox, page.rotate))
return return
def end_page(self, page): def end_page(self, page):
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
self.pageno += 1 self.pageno += 1
return return
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
s = '' s = ''
if props: if props:
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v) s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) ) in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (enc(tag.name), s)) self.outfp.write('<%s%s>' % (enc(tag.name), s))
self.tag = tag self.tag = tag
return return
def end_tag(self): def end_tag(self):
assert self.tag assert self.tag
self.outfp.write('</%s>' % enc(self.tag.name)) self.outfp.write('</%s>' % enc(self.tag.name))
self.tag = None self.tag = None
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):
self.begin_tag(tag, props) self.begin_tag(tag, props)
self.tag = None self.tag = None
return return
## PDFPageAggregator ## PDFPageAggregator
## ##
class PDFPageAggregator(PDFTextDevice): class PDFPageAggregator(PDFTextDevice):
def __init__(self, rsrc, pageno=1, laparams=None): def __init__(self, rsrc, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrc) PDFTextDevice.__init__(self, rsrc)
self.laparams = laparams self.laparams = laparams
self.pageno = pageno self.pageno = pageno
self.stack = [] self.stack = []
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
(x0,y0,x1,y1) = page.mediabox (x0,y0,x1,y1) = page.mediabox
(x0,y0) = apply_matrix_pt(ctm, (x0,y0)) (x0,y0) = apply_matrix_pt(ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(ctm, (x1,y1)) (x1,y1) = apply_matrix_pt(ctm, (x1,y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1)) mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
self.cur_item = LTPage(self.pageno, mediabox) self.cur_item = LTPage(self.pageno, mediabox)
return return
def end_page(self, _): def end_page(self, _):
assert not self.stack assert not self.stack
assert isinstance(self.cur_item, LTPage) assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate() self.cur_item.fixate()
if self.laparams: if self.laparams:
self.cur_item.analyze_layout(self.laparams) self.cur_item.analyze_layout(self.laparams)
self.pageno += 1 self.pageno += 1
return self.cur_item return self.cur_item
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
self.stack.append(self.cur_item) self.stack.append(self.cur_item)
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
return return
def end_figure(self, _): def end_figure(self, _):
fig = self.cur_item fig = self.cur_item
self.cur_item.fixate() self.cur_item.fixate()
self.cur_item = self.stack.pop() self.cur_item = self.stack.pop()
self.cur_item.add(fig) self.cur_item.add(fig)
return return
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape == 'ml': # horizontal/vertical line if shape == 'ml': # horizontal/vertical line
(_,x0,y0) = path[0] (_,x0,y0) = path[0]
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
if y0 == y1: if y0 == y1:
# horizontal ruler # horizontal ruler
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
elif x0 == x1: elif x0 == x1:
# vertical ruler # vertical ruler
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
elif shape == 'mlllh': elif shape == 'mlllh':
# rectangle # rectangle
(_,x0,y0) = path[0] (_,x0,y0) = path[0]
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(_,x2,y2) = path[2] (_,x2,y2) = path[2]
(_,x3,y3) = path[3] (_,x3,y3) = path[3]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3)) (x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
return return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
    """Add a run of characters to the current item and return its advance.

    An LTTextItem is built from the arguments, added to self.cur_item,
    and its `adv` attribute is returned.  When `chars` is empty nothing
    is created and (0, 0) is returned.
    """
    if chars:
        item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
        self.cur_item.add(item)
        return item.adv
    return (0, 0)
## PDFConverter ## PDFConverter
## ##
class PDFConverter(PDFPageAggregator):
    """Page aggregator that also owns an output stream and a codec."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        """Remember the output file object and codec.

        rsrc, pageno and laparams are handed to PDFPageAggregator.
        """
        PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
        (self.outfp, self.codec) = (outfp, codec)
        return

    def write(self, text):
        """Write `text` to the output stream after passing it through enc()."""
        encoded = enc(text, self.codec)
        self.outfp.write(encoded)
        return
## SGMLConverter ## SGMLConverter
## ##
class SGMLConverter(PDFConverter):
    """Converter that dumps the layout tree of each page as SGML-like tags."""

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write its tree.

        Containers (page, figure, textline, textbox) become an opening
        tag, their rendered children, and a closing tag.  Lines and
        rects become empty tags.  An LTTextItem is written with its font
        attributes and its text goes through self.write(); any other
        LTText is written as a bare <text> element.  An item of an
        unknown type trips an assertion.
        """
        def emit(s):
            self.outfp.write(s)
            return
        def container(item, open_tag, close_tag):
            # the opening tag is formatted by the caller before any child is rendered.
            emit(open_tag)
            for child in item:
                render(child)
            emit(close_tag)
            return
        def render(item):
            # LTTextItem must be tested before LTText, which it derives from.
            if isinstance(item, LTPage):
                container(item,
                          '<page id="%s" bbox="%s" rotate="%d">\n' %
                          (item.id, item.get_bbox(), item.rotate),
                          '</page>\n')
            elif isinstance(item, LTLine):
                emit('<line linewidth="%d" direction="%s" bbox="%s" />' %
                     (item.linewidth, item.direction, item.get_bbox()))
            elif isinstance(item, LTRect):
                emit('<rect linewidth="%d" bbox="%s" />' %
                     (item.linewidth, item.get_bbox()))
            elif isinstance(item, LTFigure):
                container(item,
                          '<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()),
                          '</figure>\n')
            elif isinstance(item, LTTextLine):
                container(item,
                          '<textline bbox="%s">\n' % (item.get_bbox()),
                          '</textline>\n')
            elif isinstance(item, LTTextBox):
                container(item,
                          '<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()),
                          '</textbox>\n')
            elif isinstance(item, LTTextItem):
                emit('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                     (enc(item.font.fontname), item.is_vertical(),
                      item.get_bbox(), item.fontsize))
                self.write(item.text)
                emit('</text>\n')
            elif isinstance(item, LTText):
                emit('<text>%s</text>\n' % item.text)
            else:
                assert 0, item
            return
        page = PDFConverter.end_page(self, page)
        render(page)
        return
## HTMLConverter ## HTMLConverter
## ##
class HTMLConverter(PDFConverter):
    """Converter that lays pages out as absolutely positioned HTML.

    Pages are stacked vertically; self.yoffset tracks the running
    vertical offset and grows by the page's y1 plus `pagepad` per page.
    All coordinates and font sizes are multiplied by `scale`.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 scale=1, showpageno=True, pagepad=50):
        """Store the display options and write the HTML preamble to outfp."""
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.scale = scale
        self.showpageno = showpageno
        self.pagepad = pagepad
        self.yoffset = self.pagepad
        preamble = ('<html><head>\n',
                    '<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
                    self.codec,
                    '</head><body>\n')
        for chunk in preamble:
            self.outfp.write(chunk)
        return

    def write_rect(self, color, width, x, y, w, h):
        """Write an empty bordered <span> at (x, y) with size (w, h), scaled."""
        geometry = tuple( v*self.scale for v in (x, y, w, h) )
        self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         ((color, width) + geometry))
        return

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write it as HTML.

        Boxes are outlined by kind: page gray, line/rect black, text box
        blue, figure green, and (only when self.debug is set) each text
        item red.  Items of any other type are silently skipped.
        """
        def outline(color, item):
            # vertical positions are measured from the running page offset.
            self.write_rect(color, 1, item.x0, self.yoffset-item.y1, item.width, item.height)
            return
        def descend(item):
            for child in item:
                render(child)
            return
        def render(item):
            if isinstance(item, LTPage):
                self.yoffset += item.y1
                outline('gray', item)
                if self.showpageno:
                    self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                                     ((self.yoffset-item.y1)*self.scale))
                    # `page` is the object returned by PDFConverter.end_page() below.
                    self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
                descend(item)
            elif isinstance(item, LTTextItem):
                wmode = 'lr-tb'
                if item.vertical:
                    wmode = 'tb-rl'
                self.outfp.write('<span style="position:absolute; writing-mode:%s;'
                                 ' left:%dpx; top:%dpx; font-size:%dpx;">' %
                                 (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
                                  item.fontsize*self.scale))
                self.write(item.text)
                self.outfp.write('</span>\n')
                if self.debug:
                    outline('red', item)
            elif isinstance(item, LTLine) or isinstance(item, LTRect):
                outline('black', item)
            elif isinstance(item, LTTextLine):
                descend(item)
            elif isinstance(item, LTTextBox):
                outline('blue', item)
                descend(item)
            elif isinstance(item, LTFigure):
                outline('green', item)
                descend(item)
            return
        page = PDFConverter.end_page(self, page)
        render(page)
        self.yoffset += self.pagepad
        return

    def close(self):
        """Write the page index (links #1 .. #pageno-1) and close the document."""
        links = [ '<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno) ]
        self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                         ', '.join(links))
        self.outfp.write('</body></html>\n')
        return
## TextConverter ## TextConverter
## ##
class TextConverter(PDFConverter):
    """Converter that writes only the text of each page."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False):
        """Same as PDFConverter, plus an option to print a page header."""
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        return

    def write(self, text):
        """Encode `text` with self.codec, dropping unencodable characters, and write it."""
        data = text.encode(self.codec, 'ignore')
        self.outfp.write(data)
        return

    def end_page(self, page):
        """Finish the page via PDFConverter.end_page() and write its text.

        Text objects are written as they are met while walking the
        containers; a newline follows each LTTextBox and a form feed
        ends the page.  With showpageno set, a 'Page N' line comes first.
        """
        def render(item):
            if isinstance(item, LTText):
                self.write(item.text)
            elif isinstance(item, LayoutContainer):
                for child in item:
                    render(child)
            # separate text boxes from each other.
            if isinstance(item, LTTextBox):
                self.write('\n')
        page = PDFConverter.end_page(self, page)
        if self.showpageno:
            self.write('Page %d\n' % page.id)
        render(page)
        self.write('\f')
        return

View File

@ -8,22 +8,22 @@ INF = sys.maxint
## ##
class LAParams(object):
    """Plain holder for the layout analysis parameters.

    The values are only stored here.  Note that __repr__ shows every
    parameter except line_overlap.
    """

    def __init__(self,
                 direction=None,
                 line_overlap=0.5,
                 char_margin=1.0,
                 line_margin=0.5,
                 word_margin=0.1):
        (self.direction, self.line_overlap) = (direction, line_overlap)
        (self.char_margin, self.line_margin, self.word_margin) = \
            (char_margin, line_margin, word_margin)
        return

    def __repr__(self):
        shown = (self.direction, self.char_margin, self.line_margin, self.word_margin)
        return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
                shown)
## Plane ## Plane
@ -35,354 +35,354 @@ class LAParams(object):
## ##
class Plane(object):
    """Index of layout objects by the coordinates of their bbox edges.

    xobjs holds an (x, obj) pair for both vertical edges of every
    object and yobjs an (y, obj) pair for both horizontal edges.  The
    lists are sorted once, at the end of __init__.
    """

    def __init__(self, objs):
        self.xobjs = []
        self.yobjs = []
        for obj in objs:
            self.place(obj)
        self.xobjs.sort()
        self.yobjs.sort()
        return

    # place(obj): place an object in a certain area.
    def place(self, obj):
        """Register the four bbox edges of `obj`.

        This only appends; it does not keep the lists sorted, so an
        object placed after construction is not in sorted position.
        """
        assert isinstance(obj, LayoutItem)
        self.xobjs.append((obj.x0, obj))
        self.xobjs.append((obj.x1, obj))
        self.yobjs.append((obj.y0, obj))
        self.yobjs.append((obj.y1, obj))
        return

    # find(): finds objects that are in a certain area.
    def find(self, bbox):
        """Return the set of objects found for the area bbox = (x0,y0,x1,y1).

        An object is returned when it has an edge entry inside the
        selected slice of both the x list and the y list.  The slice
        bounds come from bsearch() (defined elsewhere; presumably it
        returns a pair of index bounds for a key in a sorted list).

        The area is taken as one sequence argument and unpacked here,
        because tuple parameters in a signature are not valid Python 3
        syntax; callers pass the same single tuple as before.
        """
        (x0, y0, x1, y1) = bbox
        (i0,_) = bsearch(self.xobjs, x0)
        (_,i1) = bsearch(self.xobjs, x1)
        xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
        (i0,_) = bsearch(self.yobjs, y0)
        (_,i1) = bsearch(self.yobjs, y1)
        yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
        objs = xobjs.intersection(yobjs)
        return objs
## ClusterSet ## ClusterSet
## ##
class ClusterSet(object):
    """Incrementally merges objects into groups built by a factory.

    `klass` is a callable taking (id, objs) that returns a group; a
    group must support merge(other), iteration over its members and
    fixate().  self.clusters maps every object seen so far to the group
    that currently contains it.
    """

    def __init__(self, klass):
        self.clusters = {}
        self.klass = klass
        self.i = 0
        return

    # add(objs): groups text objects if necessary.
    def add(self, objs):
        """Create a new group from `objs`, absorbing the groups they were in."""
        group = self.klass(self.i, objs)
        self.i += 1
        for obj in objs:
            if obj in self.clusters:
                group.merge(self.clusters[obj])
        # repoint every member (including absorbed ones) to the new group.
        for obj in group:
            self.clusters[obj] = group
        return

    # finish(): returns all the LTTextBoxes in a page.
    def finish(self):
        """Fixate every distinct group and return them as a list."""
        # values() instead of itervalues(): works on Python 2 and 3 alike.
        r = set(self.clusters.values())
        for group in r:
            group.fixate()
        return list(r)

    @classmethod
    def build(klass, objs, hratio, vratio, objtype, func=None):
        """Cluster `objs` by proximity and return the resulting groups.

        For each object the search area is its bbox grown by
        hratio*margin horizontally and vratio*margin vertically, where
        margin is obj.get_margin().  Objects found there by Plane.find()
        are grouped with it; when `func` is given, a neighbor is kept
        only if func(obj, neighbor) is true.  `objtype` is the group
        factory handed to the cluster set.  `objs` is iterated twice, so
        it must be a re-iterable sequence.
        """
        plane = Plane(objs)
        # instantiate the class build() was called on, so that a
        # subclass gets its own type rather than a plain ClusterSet.
        cset = klass(objtype)
        for obj in objs:
            margin = obj.get_margin()
            hmargin = hratio * margin
            vmargin = vratio * margin
            neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
            assert obj in neighbors, obj
            if func:
                neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
            cset.add(neighbors)
        return cset.finish()
## LayoutItem ## LayoutItem
## ##
class LayoutItem(object):
    """Base class of everything that occupies a rectangle on a page.

    Attributes x0, y0, x1, y1, width and height are all set by
    set_bbox(), which normalizes the box so that x0 <= x1 and y0 <= y1.
    """

    def __init__(self, bbox):
        self.set_bbox(bbox)
        return

    def set_bbox(self, bbox):
        """Set the bounding box from bbox = (x0, y0, x1, y1).

        Swapped coordinates are put in order first.  The box is taken
        as one sequence argument and unpacked here, because tuple
        parameters in a signature are not valid Python 3 syntax;
        callers pass the same single tuple as before.
        """
        (x0, y0, x1, y1) = bbox
        if x1 < x0: (x0,x1) = (x1,x0)
        if y1 < y0: (y0,y1) = (y1,y0)
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<item bbox=%s>' % (self.get_bbox()))

    def hoverlap(self, obj):
        """Return a measure of the horizontal overlap with `obj`, 0 if none.

        Touching edges count as no overlap.  NOTE: the value is the
        smaller of the two cross distances between opposite edges; when
        one span lies strictly inside the other this is larger than the
        shared length.
        """
        assert isinstance(obj, LayoutItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        else:
            return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Vertical counterpart of hoverlap(), with the same caveat."""
        assert isinstance(obj, LayoutItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        else:
            return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))

    def get_bbox(self):
        """Return the box as the string 'x0,y0,x1,y1' with three decimals."""
        return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)

    # The defaults below are overridden by some subclasses.
    def get_margin(self):
        return 0

    def get_weight(self):
        return 0

    def get_direction(self):
        return None
## LayoutContainer ## LayoutContainer
## ##
class LayoutContainer(LayoutItem):
    """A layout item that holds other layout objects.

    Members live in self.objs, which starts as a set.  self.weight
    stays None until fixate() is called.
    """

    def __init__(self, id, bbox, objs=None):
        LayoutItem.__init__(self, bbox)
        self.id = id
        self.objs = set(objs) if objs else set()
        self.weight = None
        return

    def __repr__(self):
        return ('<group %s>' % (self.get_bbox()))

    def __iter__(self):
        return iter(self.objs)

    def __len__(self):
        return len(self.objs)

    def add(self, obj):
        """Add one object to the container."""
        self.objs.add(obj)
        return

    def merge(self, group):
        """Add every member of `group` to this container."""
        self.objs.update(iter(group))
        return

    # fixate(): determines its boundary and weight.
    def fixate(self):
        """Finalize the container once all members are in.

        If the container has zero width and is not empty, its bbox
        becomes the union of the members' boxes.  The weight is always
        recomputed as the sum of the members' weights.
        """
        if not self.width and self.objs:
            members = list(self.objs)
            # INF / -INF seed the folds exactly like the running min/max would.
            left = min([INF] + [ m.x0 for m in members ])
            bottom = min([INF] + [ m.y0 for m in members ])
            right = max([-INF] + [ m.x1 for m in members ])
            top = max([-INF] + [ m.y1 for m in members ])
            self.set_bbox((left, bottom, right, top))
        self.weight = sum( m.get_weight() for m in self.objs )
        return

    def get_weight(self):
        return self.weight

    def get_direction(self):
        return None
## LTLine ## LTLine
## ##
class LTLine(LayoutItem):
    """A straight ruler line with a line width and a direction tag."""

    def __init__(self, linewidth, direction, bbox):
        LayoutItem.__init__(self, bbox)
        (self.linewidth, self.direction) = (linewidth, direction)
        return
## LTRect ## LTRect
## ##
class LTRect(LayoutItem):
    """A rectangle outline; carries only its bbox and line width."""

    def __init__(self, linewidth, bbox):
        self.linewidth = linewidth
        LayoutItem.__init__(self, bbox)
        return
## LTText ## LTText
## ##
class LTText(object):
    """Base class of objects that carry a piece of text in `self.text`."""

    def __init__(self, text):
        self.text = text
        return

    def __repr__(self):
        return '<text %r>' % self.text

    def get_weight(self):
        """The weight of a text object is the length of its text."""
        weight = len(self.text)
        return weight

    def is_upright(self):
        """Plain text objects always count as upright."""
        return True
## LTAnon ## LTAnon
## ##
class LTAnon(LTText):
    """Text that contributes nothing to the weight of its container."""

    def get_weight(self):
        # unlike LTText, the length of the text is ignored.
        return 0
## LTTextItem ## LTTextItem
## ##
class LTTextItem(LayoutItem, LTText):
    """A run of characters drawn with one font and one text matrix.

    Besides the bbox it keeps `adv`, the advance of the run as an
    (x, y) pair in untransformed units, and `fontsize`, the font extent
    after the matrix has been applied.
    """

    # when true, __repr__ prints the verbose form.
    debug = 1

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item from `chars`, a non-empty list of (char, cid) pairs."""
        assert chars
        self.matrix = matrix
        self.font = font
        self.vertical = font.is_vertical()
        self.text = ''.join( char for (char,_) in chars )
        glyph_width = sum( font.char_width(cid) for (_,cid) in chars )
        # character spacing applies between characters only, hence len-1.
        adv = (glyph_width * fontsize + (len(chars)-1)*charspace) * scaling
        size = font.get_size() * fontsize
        (_,_,_,_,tx,ty) = self.matrix
        if self.vertical:
            # vertical text
            self.adv = (0, adv)
            (_,cid) = chars[0]
            (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
            (dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
            tx -= dx/2
            ty += disp
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal text
            self.adv = (adv, 0)
            (dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
            (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
            ty += descent
            bbox = (tx, ty, tx+dx, ty+dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
        LayoutItem.__init__(self, bbox)
        return

    def __repr__(self):
        if not self.debug:
            return '<text %r>' % self.text
        matrix = '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix
        adv = '(%.1f, %.1f)' % self.adv
        return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
                (matrix, self.font, self.fontsize, self.get_bbox(), adv, self.text))

    def get_margin(self):
        """The margin unit of a text item is the magnitude of its font size."""
        return abs(self.fontsize)

    def is_vertical(self):
        return self.vertical

    def is_upright(self):
        """True when the matrix satisfies 0 < a*d and b*c <= 0."""
        (a,b,c,d,e,f) = self.matrix
        diagonal_positive = 0 < a*d
        return diagonal_positive and b*c <= 0
## LTFigure ## LTFigure
## ##
class LTFigure(LayoutContainer):
    """A container whose bbox is given as (x, y, w, h) plus a matrix."""

    def __init__(self, id, bbox, matrix):
        """Use the axis-aligned box around the four transformed corners."""
        (x,y,w,h) = bbox
        corners = ((x,y), (x+w,y), (x,y+h), (x+w,y+h))
        mapped = [ apply_matrix_pt(matrix, corner) for corner in corners ]
        xs = [ p for (p,_) in mapped ]
        ys = [ q for (_,q) in mapped ]
        # INF / -INF seed the folds exactly like the running min/max would.
        bbox = (min([INF] + xs), min([INF] + ys),
                max([-INF] + xs), max([-INF] + ys))
        self.matrix = matrix
        LayoutContainer.__init__(self, id, bbox)
        return

    def __repr__(self):
        return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextLine ## LTTextLine
## ##
class LTTextLine(LayoutContainer):
    """A line of text objects with a writing direction ('V' or other)."""

    def __init__(self, id, objs, direction, word_margin):
        # the zero-size bbox makes fixate() compute the real one.
        LayoutContainer.__init__(self, id, (0,0,0,0), objs)
        (self.direction, self.word_margin) = (direction, word_margin)
        return

    def __repr__(self):
        return ('<line %s(%s)>' % (self.get_bbox(), self.direction))

    def get_margin(self):
        """The margin unit of a line is its smaller dimension."""
        return min(self.width, self.height)

    def get_direction(self):
        return self.direction

    def get_text(self):
        """Concatenate the text of all LTText members in stored order."""
        pieces = [ obj.text for obj in self.objs if isinstance(obj, LTText) ]
        return ''.join(pieces)

    def fixate(self):
        """Finalize the bbox, then order the members and insert spaces.

        Members are sorted top-down ('V') or left-to-right (otherwise).
        Before an LTTextItem that lies farther than word_margin times
        its own margin from the previous member, an LTAnon(' ') is
        inserted.  An LTAnon('\\n') ends the line.  self.objs becomes a
        list.
        """
        LayoutContainer.fixate(self)
        vertical = (self.direction == 'V')
        if vertical:
            ordered = sorted(self.objs, key=lambda obj: -obj.y1)
            edge = -INF    # lower edge (y0) of the previous member
        else:
            ordered = sorted(self.objs, key=lambda obj: obj.x0)
            edge = INF     # right edge (x1) of the previous member
        arranged = []
        for obj in ordered:
            if isinstance(obj, LTTextItem) and self.word_margin:
                margin = self.word_margin * obj.get_margin()
                if vertical:
                    gap = obj.y1+margin < edge
                else:
                    gap = edge < obj.x0-margin
                if gap:
                    arranged.append(LTAnon(' '))
            arranged.append(obj)
            if vertical:
                edge = obj.y0
            else:
                edge = obj.x1
        arranged.append(LTAnon('\n'))
        self.objs = arranged
        return
## LTTextBox ## LTTextBox
@ -392,109 +392,109 @@ class LTTextLine(LayoutContainer):
## ##
class LTTextBox(LayoutContainer):
    """A block of text lines with a writing direction ('V' or other)."""

    def __init__(self, id, objs, direction):
        # the zero-size bbox makes fixate() compute the real one.
        LayoutContainer.__init__(self, id, (0,0,0,0), objs)
        self.direction = direction
        return

    def __repr__(self):
        preview = self.get_text()[:20]
        return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, preview))

    def get_text(self):
        """Concatenate the text of all LTTextLine members in stored order."""
        lines = [ obj for obj in self.objs if isinstance(obj, LTTextLine) ]
        return ''.join( line.get_text() for line in lines )

    def fixate(self):
        """Finalize the bbox, then order the members.

        Members are sorted right-to-left ('V') or top-down (otherwise).
        self.objs becomes a list.
        """
        LayoutContainer.fixate(self)
        if self.direction == 'V':
            order = lambda obj: -obj.x1
        else:
            order = lambda obj: -obj.y1
        self.objs = sorted(self.objs, key=order)
        return

    def get_direction(self):
        return self.direction
def tsort(objs, f):
    """Order `objs` using the pairwise relation f(a, b) ("a before b").

    f is evaluated for every ordered pair of distinct objects.  Then,
    repeatedly, the first remaining object that has successors and no
    remaining predecessors is emitted.  When no such object exists, the
    last remaining object is emitted instead; its successors are not
    released by that, so objects without any relation come out in
    reverse input order.  Returns a new list; `objs` is not modified.
    """
    preds = dict( (obj,[]) for obj in objs )
    succs = dict( (obj,[]) for obj in objs )
    for a in objs:
        for b in objs:
            if a is not b and f(a, b):   # a -> b
                succs[a].append(b)
                preds[b].append(a)
    pending = objs[:]
    result = []
    while pending:
        found = False
        for node in pending:
            if succs[node] and not preds[node]:
                found = True
                break
        if found:
            # release the successors of the emitted node.
            for nxt in succs[node]:
                preds[nxt].remove(node)
            pending.remove(node)
        else:
            node = pending.pop()
        del preds[node]
        del succs[node]
        result.append(node)
    return result
## LTPage ## LTPage
## ##
class LTPage(LayoutContainer):
    """The top-level container of one page, with its rotation."""

    def __init__(self, id, bbox, rotate=0):
        LayoutContainer.__init__(self, id, bbox)
        self.rotate = rotate
        return

    def __repr__(self):
        return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))

    def analyze_layout(self, laparams):
        """Group the upright text objects of the page into lines and boxes.

        Upright LTText members are clustered into LTTextLine objects,
        the lines into LTTextBox objects, and the boxes are ordered with
        tsort().  laparams.direction == 'V' selects the vertical rules;
        anything else selects the horizontal ones.  Afterwards self.objs
        is a list: the untouched other objects followed by the boxes.
        """
        textobjs = []
        otherobjs = []
        for obj in self.objs:
            if isinstance(obj, LTText) and obj.is_upright():
                textobjs.append(obj)
            else:
                otherobjs.append(obj)

        # "same line" tests: the overlap must exceed a fraction of obj1's extent.
        def vline(obj1, obj2):
            return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
        def hline(obj1, obj2):
            return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)

        # "obj1 comes before obj2" tests used to order the boxes.
        def vorder(obj1, obj2):
            if obj1.voverlap(obj2):
                return obj2.x1 < obj1.x0
            elif obj1.hoverlap(obj2):
                return obj2.y1 < obj1.y0
            else:
                return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
        def horder(obj1, obj2):
            if obj1.hoverlap(obj2):
                return obj2.y1 < obj1.y0
            elif obj1.voverlap(obj2):
                return obj1.x1 < obj2.x0
            else:
                return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0

        if laparams.direction == 'V':
            (tag, same_line, before) = ('V', vline, vorder)
            line_ratios = (0, laparams.char_margin)
            box_ratios = (laparams.line_margin, 0)
        else:
            (tag, same_line, before) = ('H', hline, horder)
            line_ratios = (laparams.char_margin, 0)
            box_ratios = (0, laparams.line_margin)

        lines = ClusterSet.build(textobjs, line_ratios[0], line_ratios[1],
                                 (lambda id,objs: LTTextLine(id, objs, tag, laparams.word_margin)),
                                 same_line)
        boxes = ClusterSet.build(lines, box_ratios[0], box_ratios[1],
                                 (lambda id,objs: LTTextBox(id, objs, tag)))
        boxes = tsort(boxes, before)
        self.objs = otherobjs + boxes
        return

View File

@ -7,93 +7,93 @@ stderr = sys.stderr
## ##
class LZWDecoder(object): class LZWDecoder(object):
debug = 0 debug = 0
def __init__(self, fp): def __init__(self, fp):
self.fp = fp self.fp = fp
self.buff = 0 self.buff = 0
self.bpos = 8 self.bpos = 8
self.nbits = 9 self.nbits = 9
self.table = None self.table = None
self.prevbuf = None self.prevbuf = None
return return
def readbits(self, bits): def readbits(self, bits):
v = 0 v = 0
while 1: while 1:
# the number of remaining bits we can get from the current buffer. # the number of remaining bits we can get from the current buffer.
r = 8-self.bpos r = 8-self.bpos
if bits <= r: if bits <= r:
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|-bits-| | # |-bpos-|-bits-| |
# | |----r----| # | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1)) v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
self.bpos += bits self.bpos += bits
break break
else: else:
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|---bits----... # |-bpos-|---bits----...
# | |----r----| # | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1)) v = (v<<r) | (self.buff & ((1<<r)-1))
bits -= r bits -= r
x = self.fp.read(1) x = self.fp.read(1)
if not x: raise EOFError if not x: raise EOFError
self.buff = ord(x) self.buff = ord(x)
self.bpos = 0 self.bpos = 0
return v return v
def feed(self, code): def feed(self, code):
x = '' x = ''
if code == 256: if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255 self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256 self.table.append(None) # 256
self.table.append(None) # 257 self.table.append(None) # 257
self.prevbuf = '' self.prevbuf = ''
self.nbits = 9 self.nbits = 9
elif code == 257: elif code == 257:
pass pass
elif not self.prevbuf: elif not self.prevbuf:
x = self.prevbuf = self.table[code] x = self.prevbuf = self.table[code]
else: else:
if code < len(self.table): if code < len(self.table):
x = self.table[code] x = self.table[code]
self.table.append(self.prevbuf+x[0]) self.table.append(self.prevbuf+x[0])
else: else:
self.table.append(self.prevbuf+self.prevbuf[0]) self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code] x = self.table[code]
l = len(self.table) l = len(self.table)
if l == 511: if l == 511:
self.nbits = 10 self.nbits = 10
elif l == 1023: elif l == 1023:
self.nbits = 11 self.nbits = 11
elif l == 2047: elif l == 2047:
self.nbits = 12 self.nbits = 12
self.prevbuf = x self.prevbuf = x
return x return x
def run(self): def run(self):
while 1: while 1:
try: try:
code = self.readbits(self.nbits) code = self.readbits(self.nbits)
except EOFError: except EOFError:
break break
x = self.feed(code) x = self.feed(code)
yield x yield x
if self.debug: if self.debug:
print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' % print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
(self.nbits, code, x, self.table[258:])) (self.nbits, code, x, self.table[258:]))
return return
def main(argv): def main(argv):
import StringIO import StringIO
data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
fp = StringIO.StringIO(data) fp = StringIO.StringIO(data)
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
LZWDecoder.debug = 1 LZWDecoder.debug = 1
output = ''.join(LZWDecoder(fp).run()) output = ''.join(LZWDecoder(fp).run())
print (data, expected, output) print (data, expected, output)
print output == expected print output == expected
return 0 return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -11,13 +11,13 @@ LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class PDFColorSpace(object): class PDFColorSpace(object):
def __init__(self, name, ncomponents): def __init__(self, name, ncomponents):
self.name = name self.name = name
self.ncomponents = ncomponents self.ncomponents = ncomponents
return return
def __repr__(self): def __repr__(self):
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents) return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = dict( PREDEFINED_COLORSPACE = dict(

View File

@ -9,116 +9,116 @@ from pdfminer.pdffont import PDFUnicodeNotDefined
## ##
class PDFDevice(object): class PDFDevice(object):
debug = 0 debug = 0
def __init__(self, rsrc): def __init__(self, rsrc):
self.rsrc = rsrc self.rsrc = rsrc
self.ctm = None self.ctm = None
return return
def __repr__(self): def __repr__(self):
return '<PDFDevice>' return '<PDFDevice>'
def close(self): def close(self):
return return
def set_ctm(self, ctm): def set_ctm(self, ctm):
self.ctm = ctm self.ctm = ctm
return return
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
return return
def end_tag(self): def end_tag(self):
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
return return
def end_page(self, page): def end_page(self, page):
return return
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
return return
def end_figure(self, name): def end_figure(self, name):
return return
def paint_path(self, graphicstate, stroke, fill, evenodd, path): def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return return
def render_image(self, stream, size): def render_image(self, stream, size):
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
return return
## PDFTextDevice ## PDFTextDevice
## ##
class PDFTextDevice(PDFDevice): class PDFTextDevice(PDFDevice):
def handle_undefined_char(self, cidcoding, cid): def handle_undefined_char(self, cidcoding, cid):
if self.debug: if self.debug:
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?' return '?'
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0) return (0, 0)
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm) matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font font = textstate.font
fontsize = textstate.fontsize fontsize = textstate.fontsize
scaling = textstate.scaling * .01 scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling dxscale = .001 * fontsize * scaling
chars = []
needspace = False
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
d = -obj*dxscale
if font.is_vertical():
y += d
else:
x += d
chars = [] chars = []
needspace = False needspace = False
else: (x,y) = textstate.linematrix
for cid in font.decode(obj): for obj in seq:
try: if isinstance(obj, int) or isinstance(obj, float):
char = font.to_unicode(cid) (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
except PDFUnicodeNotDefined, e: fontsize, charspace, scaling, chars)
(cidcoding, cid) = e.args x += dx
char = self.handle_undefined_char(cidcoding, cid) y += dy
chars.append((char, cid)) d = -obj*dxscale
if cid == 32 and textstate.wordspace and not font.is_multibyte(): if font.is_vertical():
y += d
else:
x += d
chars = []
needspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy
if font.is_vertical():
y += wordspace
else:
x += wordspace
chars = []
if chars:
if needspace: if needspace:
if font.is_vertical(): if font.is_vertical():
y += charspace y += charspace
else: else:
x += charspace x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars) fontsize, charspace, scaling, chars)
needspace = True
x += dx x += dx
y += dy y += dy
if font.is_vertical(): textstate.linematrix = (x,y)
y += wordspace return
else:
x += wordspace
chars = []
if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
textstate.linematrix = (x,y)
return

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -26,217 +26,217 @@ class PDFNotImplementedError(PSException): pass
## ##
class PDFObjRef(PDFObject): class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _): def __init__(self, doc, objid, _):
if objid == 0: if objid == 0:
if STRICT: if STRICT:
raise PDFValueError('PDF object id cannot be 0.') raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc self.doc = doc
self.objid = objid self.objid = objid
#self.genno = genno # Never used. #self.genno = genno # Never used.
return return
def __repr__(self): def __repr__(self):
return '<PDFObjRef:%d>' % (self.objid) return '<PDFObjRef:%d>' % (self.objid)
def resolve(self): def resolve(self):
return self.doc.getobj(self.objid) return self.doc.getobj(self.objid)
# resolve # resolve
def resolve1(x): def resolve1(x):
''' '''
Resolve an object. If this is an array or dictionary, Resolve an object. If this is an array or dictionary,
it may still contains some indirect objects inside. it may still contains some indirect objects inside.
''' '''
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
return x return x
def resolve_all(x): def resolve_all(x):
''' '''
Recursively resolve X and all the internals. Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
This procedure might be slow. This procedure might be slow.
''' '''
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
if isinstance(x, list): if isinstance(x, list):
x = [ resolve_all(v) for v in x ] x = [ resolve_all(v) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = resolve_all(v) x[k] = resolve_all(v)
return x return x
def decipher_all(decipher, objid, genno, x): def decipher_all(decipher, objid, genno, x):
''' '''
Recursively decipher X. Recursively decipher X.
''' '''
if isinstance(x, str): if isinstance(x, str):
return decipher(objid, genno, x) return decipher(objid, genno, x)
if isinstance(x, list): if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ] x = [ decipher_all(decipher, objid, genno, v) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v) x[k] = decipher_all(decipher, objid, genno, v)
return x return x
# Type cheking # Type cheking
def int_value(x): def int_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, int): if not isinstance(x, int):
if STRICT: if STRICT:
raise PDFTypeError('Integer required: %r' % x) raise PDFTypeError('Integer required: %r' % x)
return 0 return 0
return x return x
def float_value(x): def float_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, float): if not isinstance(x, float):
if STRICT: if STRICT:
raise PDFTypeError('Float required: %r' % x) raise PDFTypeError('Float required: %r' % x)
return 0.0 return 0.0
return x return x
def num_value(x): def num_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)): if not (isinstance(x, int) or isinstance(x, float)):
if STRICT: if STRICT:
raise PDFTypeError('Int or Float required: %r' % x) raise PDFTypeError('Int or Float required: %r' % x)
return 0 return 0
return x return x
def str_value(x): def str_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, str): if not isinstance(x, str):
if STRICT: if STRICT:
raise PDFTypeError('String required: %r' % x) raise PDFTypeError('String required: %r' % x)
return '' return ''
return x return x
def list_value(x): def list_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)): if not (isinstance(x, list) or isinstance(x, tuple)):
if STRICT: if STRICT:
raise PDFTypeError('List required: %r' % x) raise PDFTypeError('List required: %r' % x)
return [] return []
return x return x
def dict_value(x): def dict_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
if STRICT: if STRICT:
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError('Dict required: %r' % x)
return {} return {}
return x return x
def stream_value(x): def stream_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
if STRICT: if STRICT:
raise PDFTypeError('PDFStream required: %r' % x) raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '') return PDFStream({}, '')
return x return x
## PDFStream type ## PDFStream type
## ##
class PDFStream(PDFObject): class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None): def __init__(self, dic, rawdata, decipher=None):
self.dic = dic self.dic = dic
self.rawdata = rawdata self.rawdata = rawdata
self.decipher = decipher self.decipher = decipher
self.data = None self.data = None
self.objid = None self.objid = None
self.genno = None self.genno = None
return return
def set_objid(self, objid, genno): def set_objid(self, objid, genno):
self.objid = objid self.objid = objid
self.genno = genno self.genno = genno
return return
def __repr__(self): def __repr__(self):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic) return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decomp(self,data): def decomp(self,data):
import zlib import zlib
buf = data buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the # some FlateDecode streams have garbage (newlines, etc) appended to the
# end. remove chars from the end to try and decompress the buffer # end. remove chars from the end to try and decompress the buffer
while 8 <= len(buf): while 8 <= len(buf):
try: try:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
dco = zlib.decompressobj() dco = zlib.decompressobj()
return dco.decompress(buf) return dco.decompress(buf)
except zlib.error: except zlib.error:
buf = buf[:-1] buf = buf[:-1]
raise Exception, "zlib.error while decompressing data" raise Exception, "zlib.error while decompressing data"
def decode(self): def decode(self):
assert self.data == None and self.rawdata != None assert self.data == None and self.rawdata != None
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic: if 'Filter' not in self.dic:
self.data = data self.data = data
self.rawdata = None self.rawdata = None
return return
filters = self.dic['Filter'] filters = self.dic['Filter']
if not isinstance(filters, list): if not isinstance(filters, list):
filters = [ filters ] filters = [ filters ]
for f in filters: for f in filters:
if f in LITERALS_FLATE_DECODE: if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
data = self.decomp(data) data = self.decomp(data)
elif f in LITERALS_LZW_DECODE: elif f in LITERALS_LZW_DECODE:
try: try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run()) data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE: elif f in LITERALS_ASCII85_DECODE:
import ascii85 import ascii85
data = ascii85.ascii85decode(data) data = ascii85.ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE: elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85 import ascii85
data = ascii85.asciihexdecode(data) data = ascii85.asciihexdecode(data)
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError('/Crypt filter is unsupported')
else: else:
raise PDFNotImplementedError('Unsupported filter: %r' % f) raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors # apply predictors
if 'DP' in self.dic: if 'DP' in self.dic:
params = self.dic['DP'] params = self.dic['DP']
else: else:
params = self.dic.get('DecodeParms', {}) params = self.dic.get('DecodeParms', {})
if 'Predictor' in params: if 'Predictor' in params:
pred = int_value(params['Predictor']) pred = int_value(params['Predictor'])
if pred: if pred:
if pred != 12: if pred != 12:
raise PDFNotImplementedError('Unsupported predictor: %r' % pred) raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params: if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12') raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns']) columns = int_value(params['Columns'])
buf = '' buf = ''
ent0 = '\x00' * columns ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1): for i in xrange(0, len(data), columns+1):
pred = data[i] pred = data[i]
ent1 = data[i+1:i+1+columns] ent1 = data[i+1:i+1+columns]
if pred == '\x02': if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1 buf += ent1
ent0 = ent1 ent0 = ent1
data = buf data = buf
self.data = data self.data = data
self.rawdata = None self.rawdata = None
return return
def get_data(self): def get_data(self):
if self.data == None: if self.data == None:
self.decode() self.decode()
return self.data return self.data
def get_rawdata(self): def get_rawdata(self):
return self.rawdata return self.rawdata

File diff suppressed because it is too large Load Diff

View File

@ -13,24 +13,24 @@ from array import array
# calc hash value with a given key # calc hash value with a given key
def cdbhash(s, n=5381L): def cdbhash(s, n=5381L):
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n) return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
if pack('=i',1) == pack('>i',1): if pack('=i',1) == pack('>i',1):
# big endian # big endian
def decode(x): def decode(x):
a = array('I', x) a = array('I', x)
a.byteswap() a.byteswap()
return a return a
def encode(a): def encode(a):
a.byteswap() a.byteswap()
return a.tostring() return a.tostring()
else: else:
# little endian # little endian
def decode(x): def decode(x):
a = array('I', x) a = array('I', x)
return a return a
def encode(a): def encode(a):
return a.tostring() return a.tostring()
## CDB ## CDB
@ -38,234 +38,234 @@ else:
# cdbiter # cdbiter
def cdbiter(fp, eod): def cdbiter(fp, eod):
kloc = 2048 kloc = 2048
while kloc < eod: while kloc < eod:
fp.seek(kloc) fp.seek(kloc)
(klen, vlen) = unpack('<II', fp.read(8)) (klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen) k = fp.read(klen)
v = fp.read(vlen) v = fp.read(vlen)
kloc += 8+klen+vlen kloc += 8+klen+vlen
yield (k,v) yield (k,v)
fp.close() fp.close()
return return
# CDBReader # CDBReader
class CDBReader(object): class CDBReader(object):
def __init__(self, cdbname, docache=1): def __init__(self, cdbname, docache=1):
self.name = cdbname self.name = cdbname
self._fp = file(cdbname, 'rb') self._fp = file(cdbname, 'rb')
hash0 = decode(self._fp.read(2048)) hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ] self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256 self._hash1 = [ None ] * 256
self._eod = hash0[0] self._eod = hash0[0]
self._docache = docache self._docache = docache
self._cache = {} self._cache = {}
self._keyiter = None self._keyiter = None
self._eachiter = None self._eachiter = None
return return
def __repr__(self): def __repr__(self):
return '<CDBReader: %r>' % self.name return '<CDBReader: %r>' % self.name
def __getstate__(self): def __getstate__(self):
raise TypeError raise TypeError
def __setstate__(self, dict): def __setstate__(self, dict):
raise TypeError raise TypeError
def __getitem__(self, k): def __getitem__(self, k):
k = str(k) k = str(k)
if k in self._cache: return self._cache[k] if k in self._cache: return self._cache[k]
h = cdbhash(k) h = cdbhash(k)
h1 = h & 0xff h1 = h & 0xff
(pos_bucket, ncells) = self._hash0[h1] (pos_bucket, ncells) = self._hash0[h1]
if ncells == 0: raise KeyError(k) if ncells == 0: raise KeyError(k)
hs = self._hash1[h1] hs = self._hash1[h1]
if hs == None: if hs == None:
self._fp.seek(pos_bucket) self._fp.seek(pos_bucket)
hs = decode(self._fp.read(ncells * 8)) hs = decode(self._fp.read(ncells * 8))
self._hash1[h1] = hs self._hash1[h1] = hs
i = ((h >> 8) % ncells) * 2 i = ((h >> 8) % ncells) * 2
n = ncells*2 n = ncells*2
for _ in xrange(ncells): for _ in xrange(ncells):
p1 = hs[i+1] p1 = hs[i+1]
if p1 == 0: raise KeyError(k) if p1 == 0: raise KeyError(k)
if hs[i] == h: if hs[i] == h:
self._fp.seek(p1) self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8)) (klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen) k1 = self._fp.read(klen)
if k1 == k: if k1 == k:
v1 = self._fp.read(vlen) v1 = self._fp.read(vlen)
if self._docache: if self._docache:
self._cache[k] = v1 self._cache[k] = v1
return v1 return v1
i = (i+2) % n i = (i+2) % n
raise KeyError(k) raise KeyError(k)
def get(self, k, failed=None): def get(self, k, failed=None):
try: try:
return self.__getitem__(k) return self.__getitem__(k)
except KeyError: except KeyError:
return failed return failed
def has_key(self, k): def has_key(self, k):
try: try:
self.__getitem__(k) self.__getitem__(k)
return True return True
except KeyError: except KeyError:
return False return False
def __contains__(self, k): def __contains__(self, k):
return self.has_key(k) return self.has_key(k)
def firstkey(self): def firstkey(self):
self._keyiter = None self._keyiter = None
return self.nextkey() return self.nextkey()
def nextkey(self): def nextkey(self):
if not self._keyiter: if not self._keyiter:
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) ) self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
try: try:
return self._keyiter.next() return self._keyiter.next()
except StopIteration: except StopIteration:
return None return None
def each(self): def each(self):
if not self._eachiter: if not self._eachiter:
self._eachiter = cdbiter(self._fp, self._eod) self._eachiter = cdbiter(self._fp, self._eod)
try: try:
return self._eachiter.next() return self._eachiter.next()
except StopIteration: except StopIteration:
return None return None
def iterkeys(self): def iterkeys(self):
return ( k for (k,v) in cdbiter(self._fp, self._eod) ) return ( k for (k,v) in cdbiter(self._fp, self._eod) )
def itervalues(self): def itervalues(self):
return ( v for (k,v) in cdbiter(self._fp, self._eod) ) return ( v for (k,v) in cdbiter(self._fp, self._eod) )
def iteritems(self): def iteritems(self):
return cdbiter(self._fp, self._eod) return cdbiter(self._fp, self._eod)
# CDBMaker # CDBMaker
class CDBMaker(object): class CDBMaker(object):
def __init__(self, cdbname, tmpname): def __init__(self, cdbname, tmpname):
self.fn = cdbname self.fn = cdbname
self.fntmp = tmpname self.fntmp = tmpname
self.numentries = 0 self.numentries = 0
self._fp = file(tmpname, 'wb') self._fp = file(tmpname, 'wb')
self._pos = 2048 # sizeof((h,p))*256 self._pos = 2048 # sizeof((h,p))*256
self._bucket = [ array('I') for _ in xrange(256) ] self._bucket = [ array('I') for _ in xrange(256) ]
return return
def __repr__(self): def __repr__(self):
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries) return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
def __len__(self): def __len__(self):
return self.numentries return self.numentries
def __getstate__(self): def __getstate__(self):
raise TypeError raise TypeError
def __setstate__(self, dict): def __setstate__(self, dict):
raise TypeError raise TypeError
def add(self, k, v): def add(self, k, v):
(k, v) = (str(k), str(v)) (k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v)) (klen, vlen) = (len(k), len(v))
self._fp.seek(self._pos) self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen)) self._fp.write(pack('<II', klen, vlen))
self._fp.write(k) self._fp.write(k)
self._fp.write(v) self._fp.write(v)
h = cdbhash(k) h = cdbhash(k)
b = self._bucket[h % 256] b = self._bucket[h % 256]
b.append(h) b.append(h)
b.append(self._pos) b.append(self._pos)
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data) # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._pos += 8+klen+vlen self._pos += 8+klen+vlen
self.numentries += 1 self.numentries += 1
return self return self
def finish(self): def finish(self):
self._fp.seek(self._pos) self._fp.seek(self._pos)
pos_hash = self._pos pos_hash = self._pos
# write hashes # write hashes
for b1 in self._bucket: for b1 in self._bucket:
if not b1: continue if not b1: continue
blen = len(b1) blen = len(b1)
a = array('I', [0]*blen*2) a = array('I', [0]*blen*2)
for j in xrange(0, blen, 2): for j in xrange(0, blen, 2):
(h,p) = (b1[j],b1[j+1]) (h,p) = (b1[j],b1[j+1])
i = ((h >> 8) % blen)*2 i = ((h >> 8) % blen)*2
while a[i+1]: # is cell[i] already occupied? while a[i+1]: # is cell[i] already occupied?
i = (i+2) % len(a) i = (i+2) % len(a)
a[i] = h a[i] = h
a[i+1] = p a[i+1] = p
self._fp.write(encode(a)) self._fp.write(encode(a))
# write header # write header
self._fp.seek(0) self._fp.seek(0)
a = array('I') a = array('I')
for b1 in self._bucket: for b1 in self._bucket:
a.append(pos_hash) a.append(pos_hash)
a.append(len(b1)) a.append(len(b1))
pos_hash += len(b1)*8 pos_hash += len(b1)*8
self._fp.write(encode(a)) self._fp.write(encode(a))
# close # close
self._fp.close() self._fp.close()
os.rename(self.fntmp, self.fn) os.rename(self.fntmp, self.fn)
return return
# txt2cdb # txt2cdb
def txt2cdb(self, lines): def txt2cdb(self, lines):
import re import re
HEAD = re.compile(r'^\+(\d+),(\d+):') HEAD = re.compile(r'^\+(\d+),(\d+):')
for line in lines: for line in lines:
m = HEAD.match(line) m = HEAD.match(line)
if not m: break if not m: break
(klen, vlen) = (int(m.group(1)), int(m.group(2))) (klen, vlen) = (int(m.group(1)), int(m.group(2)))
i = len(m.group(0)) i = len(m.group(0))
k = line[i:i+klen] k = line[i:i+klen]
i += klen i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2 i += 2
v = line[i:i+vlen] v = line[i:i+vlen]
self.add(k, v) self.add(k, v)
return self return self
# cdbdump # cdbdump
def cdbdump(cdbname): def cdbdump(cdbname):
fp = file(cdbname, 'rb') fp = file(cdbname, 'rb')
(eor,) = unpack('<I', fp.read(4)) (eor,) = unpack('<I', fp.read(4))
return cdbiter(fp, eor) return cdbiter(fp, eor)
# cdbmerge # cdbmerge
def cdbmerge(iters): def cdbmerge(iters):
q = [] q = []
for it in iters: for it in iters:
try: try:
q.append((it.next(),it)) q.append((it.next(),it))
except StopIteration: except StopIteration:
pass pass
k0 = None k0 = None
vs = None vs = None
while q: while q:
q.sort() q.sort()
((k,v),it) = q.pop(0) ((k,v),it) = q.pop(0)
if k0 != k: if k0 != k:
if vs: yield (k0,vs) if vs: yield (k0,vs)
vs = [] vs = []
vs.append(v) vs.append(v)
k0 = k k0 = k
try: try:
q.append((it.next(),it)) q.append((it.next(),it))
except StopIteration: except StopIteration:
continue continue
if vs: yield (k0,vs) if vs: yield (k0,vs)
return return
# aliases # aliases
@ -278,132 +278,132 @@ init = CDBReader
# tcdbiter # tcdbiter
def tcdbiter(fp, eor): def tcdbiter(fp, eor):
locs = {} locs = {}
fp.seek(eor) fp.seek(eor)
while 1: while 1:
x = fp.read(8) x = fp.read(8)
if not x: break if not x: break
(h, pos) = unpack('<II', x) (h, pos) = unpack('<II', x)
if pos: locs[pos] = h if pos: locs[pos] = h
pos = 2048 pos = 2048
fp.seek(pos) fp.seek(pos)
key = () key = ()
parents = [0] parents = [0]
while pos < eor: while pos < eor:
(klen, vlen) = unpack('<II', fp.read(8)) (klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen) k = fp.read(klen)
v = fp.read(vlen) v = fp.read(vlen)
h = locs[pos] h = locs[pos]
for (i,p) in enumerate(parents): for (i,p) in enumerate(parents):
if cdbhash(k, p+5381L) == h: if cdbhash(k, p+5381L) == h:
parents = parents[:i+1] parents = parents[:i+1]
key = key[:i] key = key[:i]
break break
key += (k,) key += (k,)
yield (key, v) yield (key, v)
parents.append(pos) parents.append(pos)
pos += 8+klen+vlen pos += 8+klen+vlen
fp.close() fp.close()
return return
# TCDBMaker # TCDBMaker
class TCDBMaker(CDBMaker): class TCDBMaker(CDBMaker):
def __init__(self, cdbname, tmpname): def __init__(self, cdbname, tmpname):
CDBMaker.__init__(self, cdbname, tmpname) CDBMaker.__init__(self, cdbname, tmpname)
self._parent = 0 self._parent = 0
self._stack = [self._parent] self._stack = [self._parent]
return return
def put(self, depth, k, v): def put(self, depth, k, v):
if depth == len(self._stack)+1: if depth == len(self._stack)+1:
self._stack.append(self._parent) self._stack.append(self._parent)
elif depth < len(self._stack): elif depth < len(self._stack):
self._stack = self._stack[:depth] self._stack = self._stack[:depth]
elif depth != len(self._stack): elif depth != len(self._stack):
raise ValueError('invalid depth: %d' % depth) raise ValueError('invalid depth: %d' % depth)
# #
(k, v) = (str(k), str(v)) (k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v)) (klen, vlen) = (len(k), len(v))
self._parent = self._pos self._parent = self._pos
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data) # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._fp.seek(self._pos) self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen)) self._fp.write(pack('<II', klen, vlen))
self._fp.write(k) self._fp.write(k)
self._fp.write(v) self._fp.write(v)
self._pos += 4+4+klen+vlen self._pos += 4+4+klen+vlen
h = cdbhash(k, self._stack[-1]+5381L) h = cdbhash(k, self._stack[-1]+5381L)
b = self._bucket[h % 256] b = self._bucket[h % 256]
b.append(h) b.append(h)
b.append(self._parent) b.append(self._parent)
self.numentries += 1 self.numentries += 1
return self return self
def txt2tcdb(self, lines): def txt2tcdb(self, lines):
import re import re
HEAD = re.compile(r'^(\++)(\d+),(\d+):') HEAD = re.compile(r'^(\++)(\d+),(\d+):')
for line in lines: for line in lines:
m = HEAD.match(line) m = HEAD.match(line)
if not m: break if not m: break
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3))) (depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
i = len(m.group(0)) i = len(m.group(0))
k = line[i:i+klen] k = line[i:i+klen]
i += klen i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2 i += 2
v = line[i:i+vlen] v = line[i:i+vlen]
self.put(depth, k, v) self.put(depth, k, v)
return self return self
# TCDBReader # TCDBReader
class TCDBReader(CDBReader): class TCDBReader(CDBReader):
def lookup(self, seq, parent=0L): def lookup(self, seq, parent=0L):
r = [] r = []
for k in seq: for k in seq:
(v, parent) = self.lookup1(k, parent) (v, parent) = self.lookup1(k, parent)
r.append(v) r.append(v)
return r return r
def lookup1(self, k, parent=0L): def lookup1(self, k, parent=0L):
k = str(k) k = str(k)
if self._docache and (parent,k) in self._cache: if self._docache and (parent,k) in self._cache:
return self._cache[(parent,k)] return self._cache[(parent,k)]
h = cdbhash(k, parent+5381L) h = cdbhash(k, parent+5381L)
self._fp.seek((h % 256) << 3) self._fp.seek((h % 256) << 3)
(pos_bucket, ncells) = unpack('<II', self._fp.read(8)) (pos_bucket, ncells) = unpack('<II', self._fp.read(8))
if ncells == 0: raise KeyError(k) if ncells == 0: raise KeyError(k)
start = (h >> 8) % ncells start = (h >> 8) % ncells
for i in xrange(ncells): for i in xrange(ncells):
self._fp.seek(pos_bucket + ((start+i) % ncells << 3)) self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
(h1, p1) = unpack('<II', self._fp.read(8)) (h1, p1) = unpack('<II', self._fp.read(8))
if p1 == 0: raise KeyError(k) if p1 == 0: raise KeyError(k)
if h1 == h: if h1 == h:
self._fp.seek(p1) self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8)) (klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen) k1 = self._fp.read(klen)
if k1 == k: if k1 == k:
v1 = self._fp.read(vlen) v1 = self._fp.read(vlen)
if self._docache: if self._docache:
self._cache[(parent,k)] = (v1,p1) self._cache[(parent,k)] = (v1,p1)
return (v1,p1) return (v1,p1)
raise KeyError(k) raise KeyError(k)
def iterkeys(self): def iterkeys(self):
return ( k for (k,v) in tcdbiter(self._fp, self._eod) ) return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
def itervalues(self): def itervalues(self):
return ( v for (k,v) in tcdbiter(self._fp, self._eod) ) return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
def iteritems(self): def iteritems(self):
return tcdbiter(self._fp, self._eod) return tcdbiter(self._fp, self._eod)
# tcdbdump
def tcdbdump(cdbname):
    '''Opens the tcdb file cdbname and returns tcdbiter() over its records.

    The first 4 bytes of the file (a little-endian unsigned integer) are
    read and handed to tcdbiter() as the end-of-records position.
    '''
    # open() is the documented way to open files; file() was a
    # discouraged alias.  The handle is deliberately left open because
    # the returned iterator reads from it lazily.
    fp = open(cdbname, 'rb')
    (eor,) = unpack('<I', fp.read(4))
    return tcdbiter(fp, eor)
# aliases # aliases
@ -414,64 +414,64 @@ tcdbmerge = cdbmerge
# main
def main(argv):
    '''Command line driver for the cdb/tcdb tools.

    Usage: argv[0] COMMAND [-k|-v|-2] dbname [args ...]

      cmake/tmake    build a database from text records (files or stdin)
      cget/tget      look up a key (cget) or a key path (tget)
      cdump/tdump    dump records; -k keys only, -v values only,
                     -2 tab-separated key and value
      cmerge/tmerge  merge the databases given as args into dbname

    Returns 100 after printing the usage message, None otherwise.
    '''
    import getopt
    import fileinput
    def usage():
        # print(x) with a single argument behaves the same as the
        # Python 2 print statement.
        print('usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0])
        print('usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0])
        return 100
    args = argv[1:]
    if not args: return usage()
    cmd = args.pop(0)
    try:
        (opts, args) = getopt.getopt(args, 'kv2')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    dbname = args.pop(0)
    # cdb
    if cmd == 'cmake':
        CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
    elif cmd == 'cget':
        # a key is required; previously a missing key raised IndexError
        if not args: return usage()
        print(repr(CDBReader(dbname).get(args[0])))
    elif cmd == 'cdump':
        f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
        for (k, v) in opts:
            if k == '-k': f = (lambda k,_: k)
            elif k == '-v': f = (lambda _,v: v)
            elif k == '-2': f = (lambda k,v: k+'\t'+v)
        for (k,v) in cdbdump(dbname):
            print(f(k,v))
        # the dump ends with an empty line
        print('')
    elif cmd == 'cmerge':
        dbs = [ cdbdump(fname) for fname in args ]
        m = CDBMaker(dbname, dbname+'.tmp')
        for (k,vs) in tcdbmerge(dbs):
            m.add(k, ' '.join(vs))
        m.finish()
    # tcdb
    elif cmd == 'tmake':
        TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
    elif cmd == 'tget':
        print(repr(TCDBReader(dbname).lookup(args)))
    elif cmd == 'tdump':
        # k is the key path; its length is the depth of the record
        f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
        for (k, v) in opts:
            if k == '-k': f = (lambda k,_: '/'.join(k))
            elif k == '-v': f = (lambda _,v: v)
            elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
        for (k,v) in tcdbdump(dbname):
            print(f(k,v))
        print('')
    elif cmd == 'tmerge':
        dbs = [ tcdbdump(fname) for fname in args ]
        m = TCDBMaker(dbname, dbname+'.tmp')
        for (k,vs) in tcdbmerge(dbs):
            m.put(len(k), k[-1], ' '.join(vs))
        m.finish()
    else:
        return usage()
    return

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -691,88 +691,88 @@ rcon = [
] ]
# With an explicit byte-order prefix ('>') struct uses standard sizes,
# so both 'L' and 'I' are exactly 4 bytes on every platform.  A check of
# the native size of 'L' is therefore unnecessary.
def GETU32(x):
    '''Decodes a 4-byte big-endian string into an unsigned 32-bit integer.'''
    return unpack('>I', x)[0]

def PUTU32(x):
    '''Encodes an unsigned 32-bit integer as a 4-byte big-endian string.'''
    return pack('>I', x)
# Expand the cipher key into the encryption key schedule.
#
# @return a tuple (rk, nrounds): the list of round-key words and the
#         number of rounds for the given cipher key size.
def rijndaelSetupEncrypt(key, keybits):
    '''Builds the encryption key schedule for a 128, 192 or 256 bit key.

    key is read as big-endian 32-bit words via GETU32; keybits selects
    the expansion.  Returns (rk, nrounds) with nrounds being 10, 12 or 14.
    Raises ValueError(keybits) if keybits is none of the supported sizes
    (assuming RKLENGTH and the key reads above it did not fail first).
    '''
    # i indexes rcon; p is the offset of the key words being expanded
    i = p = 0
    rk = [0]*RKLENGTH(keybits)
    rk[0] = GETU32(key[0:4])
    rk[1] = GETU32(key[4:8])
    rk[2] = GETU32(key[8:12])
    rk[3] = GETU32(key[12:16])
    if keybits == 128:
        # each pass derives 4 new words; finished after 10 passes
        while 1:
            temp = rk[p+3]
            rk[p+4] = (rk[p+0] ^
                       (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
                       (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
                       (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
                       (Te4[(temp >> 24)       ] & 0x000000ff) ^
                       rcon[i])
            rk[p+5] = rk[p+1] ^ rk[p+4]
            rk[p+6] = rk[p+2] ^ rk[p+5]
            rk[p+7] = rk[p+3] ^ rk[p+6]
            i += 1
            if i == 10: return (rk, 10)
            p += 4
    rk[4] = GETU32(key[16:20])
    rk[5] = GETU32(key[20:24])
    if keybits == 192:
        # each pass derives up to 6 new words; the 8th pass stops after 4
        while 1:
            temp = rk[p+5]
            rk[p+6] = (rk[p+0] ^
                       (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
                       (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
                       (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
                       (Te4[(temp >> 24)       ] & 0x000000ff) ^
                       rcon[i])
            rk[p+7] = rk[p+1] ^ rk[p+6]
            rk[p+8] = rk[p+2] ^ rk[p+7]
            rk[p+9] = rk[p+3] ^ rk[p+8]
            i += 1
            if i == 8: return (rk, 12)
            rk[p+10] = rk[p+4] ^ rk[p+9]
            rk[p+11] = rk[p+5] ^ rk[p+10]
            p += 6
    rk[6] = GETU32(key[24:28])
    rk[7] = GETU32(key[28:32])
    if keybits == 256:
        # each pass derives up to 8 new words; the 7th pass stops after 4
        while 1:
            temp = rk[p+7]
            rk[p+8] = (rk[p+0] ^
                       (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
                       (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
                       (Te4[(temp      ) & 0xff] & 0x0000ff00) ^
                       (Te4[(temp >> 24)       ] & 0x000000ff) ^
                       rcon[i])
            rk[p+9] = rk[p+1] ^ rk[p+8]
            rk[p+10] = rk[p+2] ^ rk[p+9]
            rk[p+11] = rk[p+3] ^ rk[p+10]
            i += 1
            if i == 7: return (rk, 14)
            # second half of the pass: same table lookup, but without
            # the byte rotation and without rcon
            temp = rk[p+11]
            rk[p+12] = (rk[p+4] ^
                        (Te4[(temp >> 24)       ] & 0xff000000) ^
                        (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
                        (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^
                        (Te4[(temp      ) & 0xff] & 0x000000ff))
            rk[p+13] = rk[p+5] ^ rk[p+12]
            rk[p+14] = rk[p+6] ^ rk[p+13]
            rk[p+15] = rk[p+7] ^ rk[p+14]
            p += 8
    raise ValueError(keybits)
# Expand the cipher key into the decryption key schedule. # Expand the cipher key into the decryption key schedule.
@ -780,291 +780,291 @@ def rijndaelSetupEncrypt(key, keybits):
# @return a tuple (rk, nrounds): the list of round-key words and the
#         number of rounds for the given cipher key size.
def rijndaelSetupDecrypt(key, keybits):
    '''Builds the decryption key schedule for the given cipher key.

    Starts from the encryption schedule, reverses the order of the
    4-word round keys and applies the inverse MixColumn transform to
    every round key except the first and the last.
    '''
    # expand the cipher key:
    (rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
    # invert the order of the round keys (swap 4-word groups from both ends):
    lo = 0
    hi = 4*nrounds
    while lo < hi:
        (rk[lo:lo+4], rk[hi:hi+4]) = (rk[hi:hi+4], rk[lo:lo+4])
        lo += 4
        hi -= 4
    # apply the inverse MixColumn transform to all round keys but the first and the last:
    for n in xrange(4, 4*nrounds):
        w = rk[n]
        rk[n] = (
            Td0[Te4[(w >> 24)       ] & 0xff] ^
            Td1[Te4[(w >> 16) & 0xff] & 0xff] ^
            Td2[Te4[(w >>  8) & 0xff] & 0xff] ^
            Td3[Te4[(w      ) & 0xff] & 0xff])
    return (rk, nrounds)
def rijndaelEncrypt(rk, nrounds, plaintext):
    '''Encrypts one 16-byte block.

    rk is the list of round-key words and nrounds the round count, as
    returned by rijndaelSetupEncrypt.  plaintext must be exactly 16
    bytes long (checked with an assertion).  Returns the 16-byte
    ciphertext string.
    '''
    assert len(plaintext) == 16

    # map byte array block to cipher state
    # and add initial round key:
    s0 = GETU32(plaintext[0:4]) ^ rk[0]
    s1 = GETU32(plaintext[4:8]) ^ rk[1]
    s2 = GETU32(plaintext[8:12]) ^ rk[2]
    s3 = GETU32(plaintext[12:16]) ^ rk[3]

    # nrounds - 1 full rounds:
    # every pass performs two rounds (s -> t -> s); the final pass
    # leaves the loop after its first round with the state in t0..t3.
    r = nrounds >> 1
    p = 0
    while 1:
        t0 = (
            Te0[(s0 >> 24)       ] ^
            Te1[(s1 >> 16) & 0xff] ^
            Te2[(s2 >>  8) & 0xff] ^
            Te3[(s3      ) & 0xff] ^
            rk[p+4])
        t1 = (
            Te0[(s1 >> 24)       ] ^
            Te1[(s2 >> 16) & 0xff] ^
            Te2[(s3 >>  8) & 0xff] ^
            Te3[(s0      ) & 0xff] ^
            rk[p+5])
        t2 = (
            Te0[(s2 >> 24)       ] ^
            Te1[(s3 >> 16) & 0xff] ^
            Te2[(s0 >>  8) & 0xff] ^
            Te3[(s1      ) & 0xff] ^
            rk[p+6])
        t3 = (
            Te0[(s3 >> 24)       ] ^
            Te1[(s0 >> 16) & 0xff] ^
            Te2[(s1 >>  8) & 0xff] ^
            Te3[(s2      ) & 0xff] ^
            rk[p+7])
        p += 8
        r -= 1
        if r == 0: break
        s0 = (
            Te0[(t0 >> 24)       ] ^
            Te1[(t1 >> 16) & 0xff] ^
            Te2[(t2 >>  8) & 0xff] ^
            Te3[(t3      ) & 0xff] ^
            rk[p+0])
        s1 = (
            Te0[(t1 >> 24)       ] ^
            Te1[(t2 >> 16) & 0xff] ^
            Te2[(t3 >>  8) & 0xff] ^
            Te3[(t0      ) & 0xff] ^
            rk[p+1])
        s2 = (
            Te0[(t2 >> 24)       ] ^
            Te1[(t3 >> 16) & 0xff] ^
            Te2[(t0 >>  8) & 0xff] ^
            Te3[(t1      ) & 0xff] ^
            rk[p+2])
        s3 = (
            Te0[(t3 >> 24)       ] ^
            Te1[(t0 >> 16) & 0xff] ^
            Te2[(t1 >>  8) & 0xff] ^
            Te3[(t2      ) & 0xff] ^
            rk[p+3])

    ciphertext = ''

    # apply last round and
    # map cipher state to byte array block:
    # (p now points at the last round key)
    s0 = (
        (Te4[(t0 >> 24)       ] & 0xff000000) ^
        (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
        (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
        (Te4[(t3      ) & 0xff] & 0x000000ff) ^
        rk[p+0])
    ciphertext += PUTU32(s0)
    s1 = (
        (Te4[(t1 >> 24)       ] & 0xff000000) ^
        (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
        (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
        (Te4[(t0      ) & 0xff] & 0x000000ff) ^
        rk[p+1])
    ciphertext += PUTU32(s1)
    s2 = (
        (Te4[(t2 >> 24)       ] & 0xff000000) ^
        (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
        (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
        (Te4[(t1      ) & 0xff] & 0x000000ff) ^
        rk[p+2])
    ciphertext += PUTU32(s2)
    s3 = (
        (Te4[(t3 >> 24)       ] & 0xff000000) ^
        (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
        (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
        (Te4[(t2      ) & 0xff] & 0x000000ff) ^
        rk[p+3])
    ciphertext += PUTU32(s3)

    assert len(ciphertext) == 16
    return ciphertext
def rijndaelDecrypt(rk, nrounds, ciphertext):
    '''Decrypts one 16-byte block.

    rk is the list of round-key words and nrounds the round count, as
    returned by rijndaelSetupDecrypt.  ciphertext must be exactly 16
    bytes long (checked with an assertion).  Returns the 16-byte
    plaintext string.
    '''
    assert len(ciphertext) == 16

    # map byte array block to cipher state
    # and add initial round key:
    s0 = GETU32(ciphertext[0:4]) ^ rk[0]
    s1 = GETU32(ciphertext[4:8]) ^ rk[1]
    s2 = GETU32(ciphertext[8:12]) ^ rk[2]
    s3 = GETU32(ciphertext[12:16]) ^ rk[3]

    # nrounds - 1 full rounds:
    # every pass performs two rounds (s -> t -> s); the final pass
    # leaves the loop after its first round with the state in t0..t3.
    r = nrounds >> 1
    p = 0
    while 1:
        t0 = (
            Td0[(s0 >> 24)       ] ^
            Td1[(s3 >> 16) & 0xff] ^
            Td2[(s2 >>  8) & 0xff] ^
            Td3[(s1      ) & 0xff] ^
            rk[p+4])
        t1 = (
            Td0[(s1 >> 24)       ] ^
            Td1[(s0 >> 16) & 0xff] ^
            Td2[(s3 >>  8) & 0xff] ^
            Td3[(s2      ) & 0xff] ^
            rk[p+5])
        t2 = (
            Td0[(s2 >> 24)       ] ^
            Td1[(s1 >> 16) & 0xff] ^
            Td2[(s0 >>  8) & 0xff] ^
            Td3[(s3      ) & 0xff] ^
            rk[p+6])
        t3 = (
            Td0[(s3 >> 24)       ] ^
            Td1[(s2 >> 16) & 0xff] ^
            Td2[(s1 >>  8) & 0xff] ^
            Td3[(s0      ) & 0xff] ^
            rk[p+7])
        p += 8
        r -= 1
        if r == 0: break
        s0 = (
            Td0[(t0 >> 24)       ] ^
            Td1[(t3 >> 16) & 0xff] ^
            Td2[(t2 >>  8) & 0xff] ^
            Td3[(t1      ) & 0xff] ^
            rk[p+0])
        s1 = (
            Td0[(t1 >> 24)       ] ^
            Td1[(t0 >> 16) & 0xff] ^
            Td2[(t3 >>  8) & 0xff] ^
            Td3[(t2      ) & 0xff] ^
            rk[p+1])
        s2 = (
            Td0[(t2 >> 24)       ] ^
            Td1[(t1 >> 16) & 0xff] ^
            Td2[(t0 >>  8) & 0xff] ^
            Td3[(t3      ) & 0xff] ^
            rk[p+2])
        s3 = (
            Td0[(t3 >> 24)       ] ^
            Td1[(t2 >> 16) & 0xff] ^
            Td2[(t1 >>  8) & 0xff] ^
            Td3[(t0      ) & 0xff] ^
            rk[p+3])

    plaintext = ''

    # apply last round and
    # map cipher state to byte array block:
    # (p now points at the last round key)
    s0 = (
        (Td4[(t0 >> 24)       ] & 0xff000000) ^
        (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
        (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
        (Td4[(t1      ) & 0xff] & 0x000000ff) ^
        rk[p+0])
    plaintext += PUTU32(s0)
    s1 = (
        (Td4[(t1 >> 24)       ] & 0xff000000) ^
        (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
        (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
        (Td4[(t2      ) & 0xff] & 0x000000ff) ^
        rk[p+1])
    plaintext += PUTU32(s1)
    s2 = (
        (Td4[(t2 >> 24)       ] & 0xff000000) ^
        (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
        (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
        (Td4[(t3      ) & 0xff] & 0x000000ff) ^
        rk[p+2])
    plaintext += PUTU32(s2)
    s3 = (
        (Td4[(t3 >> 24)       ] & 0xff000000) ^
        (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
        (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
        (Td4[(t0      ) & 0xff] & 0x000000ff) ^
        rk[p+3])
    plaintext += PUTU32(s3)

    assert len(plaintext) == 16
    return plaintext
# RijndaelDecryptor
class RijndaelDecryptor(object):
    '''Decrypts single 16-byte blocks with a fixed key.

    keybits is passed to rijndaelSetupDecrypt; len(key) must equal
    KEYLENGTH(keybits).  All checks are assertions.
    '''

    def __init__(self, key, keybits=256):
        assert len(key) == KEYLENGTH(keybits)
        schedule = rijndaelSetupDecrypt(key, keybits)
        (self.rk, self.nrounds) = schedule
        # sanity-check the generated schedule against the size helpers
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)

    def decrypt(self, ciphertext):
        '''Returns the 16-byte plaintext for a 16-byte ciphertext block.'''
        assert len(ciphertext) == 16
        return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
# RijndaelEncryptor
class RijndaelEncryptor(object):
    '''Encrypts single 16-byte blocks with a fixed key.

    keybits is passed to rijndaelSetupEncrypt; len(key) must equal
    KEYLENGTH(keybits).  All checks are assertions.
    '''

    def __init__(self, key, keybits=256):
        assert len(key) == KEYLENGTH(keybits)
        schedule = rijndaelSetupEncrypt(key, keybits)
        (self.rk, self.nrounds) = schedule
        # sanity-check the generated schedule against the size helpers
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)

    def encrypt(self, plaintext):
        '''Returns the 16-byte ciphertext for a 16-byte plaintext block.'''
        assert len(plaintext) == 16
        return rijndaelEncrypt(self.rk, self.nrounds, plaintext)
def main(argv):
    '''Self-test: encrypts and decrypts one known 128-bit test vector.

    Raises AssertionError when either direction produces an unexpected
    result; returns 0 otherwise.  argv is not used.
    '''
    # binascii.unhexlify is the portable spelling of str.decode('hex'),
    # which only exists in Python 2.
    from binascii import unhexlify
    # test
    key = unhexlify('00010203050607080A0B0C0D0F101112')
    plaintext = unhexlify('506812A45F08C889B97F5980038B8359')
    ciphertext = unhexlify('D8F532538289EF7D06B506A4FD5BE9C9')
    e = RijndaelEncryptor(key, 128)
    text = e.encrypt(plaintext)
    assert text == ciphertext
    d = RijndaelDecryptor(key, 128)
    text = d.decrypt(ciphertext)
    assert text == plaintext
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -7,21 +7,21 @@ from struct import unpack
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)

def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Both arguments are 6-tuples (a, b, c, d, e, f).  With points
    transformed as (a*x+c*y+e, b*x+d*y+f), the result is the matrix
    that applies m1 first and m0 second.
    '''
    # Unpacked in the body rather than in the signature: tuple
    # parameters were removed from the language by PEP 3113.
    (a1,b1,c1,d1,e1,f1) = m1
    (a0,b0,c0,d0,e0,f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Returns matrix m with the offset v = (x, y) folded into its translation.

    The linear part (a, b, c, d) is unchanged; (x, y) is pushed through
    it and added to the translation part (e, f).
    '''
    # Unpacked in the body rather than in the signature: tuple
    # parameters were removed from the language by PEP 3113.
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt(m, v):
    '''Applies a matrix to a point.

    m is a 6-tuple (a, b, c, d, e, f) and v a point (x, y); the result
    is (a*x+c*y+e, b*x+d*y+f).
    '''
    # Unpacked in the body rather than in the signature: tuple
    # parameters were removed from the language by PEP 3113.
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    That is, only the linear part (a, b, c, d) of the matrix is applied
    and the translation (e, f) is ignored.
    '''
    # Unpacked in the body rather than in the signature: tuple
    # parameters were removed from the language by PEP 3113.
    (a,b,c,d,e,f) = m
    (p,q) = v
    return (a*p+c*q, b*p+d*q)
## Utility functions ## Utility functions
@ -29,62 +29,62 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
# pick
def pick(seq, func, maxobj=None):
    '''Picks the object that has the highest value of func(obj).

    When several objects share the highest score the first one wins.
    maxobj is returned unchanged when seq is empty.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # identity test: "== None" would invoke the score's own __eq__
        if maxscore is None or maxscore < score:
            (maxscore,maxobj) = (score,obj)
    return maxobj
# bsearch
def bsearch(objs, v0):
    '''Tries to find the closest value to v0.

    objs is a list of (value, obj) tuples; the binary search assumes it
    is sorted by value.  Returns a half-open index range (i0, i1): when
    v0 is present, objs[i0:i1] are all entries whose value equals v0;
    otherwise i0 == i1 is the position where v0 would be inserted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # floor division keeps the index an integer
        i = (i0+i1)//2
        v = objs[i][0]
        if v0 == v:
            (i0,i1) = (i,i+1)
            # widen the range over all neighbours with the same value
            while 0 < i0 and objs[i0-1][0] == v0:
                i0 -= 1
            # i1 is exclusive, so it may grow up to len(objs); stopping
            # at len(objs)-1 would drop a trailing equal entry.
            while i1 < len(objs) and objs[i1][0] == v0:
                i1 += 1
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i+1
    return (i0,i1)
# choplist
def choplist(n, seq):
    '''Groups every n elements of the list.

    Generates tuples of n consecutive elements of seq.  Elements left
    over at the end that do not fill a whole group are dropped.
    '''
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
# nunpack
def nunpack(s, default=0):
    '''Unpacks up to 4 bytes big endian.

    Returns default for an empty string and the unsigned integer value
    for 1 to 4 bytes.  Raises TypeError for longer strings.
    '''
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return unpack('>H', s)[0]
    elif l == 3:
        # one byte plus a 16-bit word; avoids concatenating a padding
        # literal, so byte strings work the same in Python 2 and 3
        (hi, lo) = unpack('>BH', s)
        return (hi << 16) | lo
    elif l == 4:
        return unpack('>L', s)[0]
    else:
        # was "return TypeError(...)", which handed the exception object
        # to the caller as if it were the decoded number
        raise TypeError('invalid length: %d' % l)
# decode_text # decode_text
PDFDocEncoding = ''.join( unichr(x) for x in ( PDFDocEncoding = ''.join( unichr(x) for x in (
@ -122,14 +122,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
)) ))
def decode_text(s):
    '''Decodes a PDF text string to Unicode.

    Strings starting with the bytes FE FF are decoded as UTF-16BE
    (undecodable data is ignored); every other string is mapped byte by
    byte through the PDFDocEncoding table.
    '''
    if not s.startswith('\xfe\xff'):
        return ''.join( PDFDocEncoding[ord(c)] for c in s )
    # skip the two marker bytes
    return unicode(s[2:], 'utf-16be', 'ignore')
# enc
def enc(x, codec='ascii'):
    '''Encodes a string for SGML/XML/HTML

    The characters &, >, < and " are replaced by entities, then the
    string is encoded with codec; characters the codec cannot represent
    become numeric character references.
    '''
    # '&' must be handled first so the entities added below stay intact
    for (char, entity) in (('&','&amp;'), ('>','&gt;'), ('<','&lt;'), ('"','&quot;')):
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')

View File

@ -3,10 +3,10 @@ from distutils.core import setup
from pdfminer import __version__ from pdfminer import __version__
setup( setup(
name='pdfminer', name='pdfminer',
version=__version__, version=__version__,
description='PDF parser and analyzer', description='PDF parser and analyzer',
long_description='''PDFMiner is a suite of programs that help long_description='''PDFMiner is a suite of programs that help
extracting and analyzing text data of PDF documents. extracting and analyzing text data of PDF documents.
Unlike other PDF-related tools, it allows to obtain Unlike other PDF-related tools, it allows to obtain
the exact location of texts in a page, as well as the exact location of texts in a page, as well as
@ -14,23 +14,23 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''', PDF parser that can be used for other purposes instead of text analysis.''',
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama', author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu', author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html', url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[ packages=[
'pdfminer' 'pdfminer'
], ],
scripts=[ scripts=[
'tools/pdf2txt.py', 'tools/pdf2txt.py',
'tools/dumppdf.py' 'tools/dumppdf.py'
], ],
keywords=['pdf parser', 'pdf converter', 'text mining'], keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'Environment :: Console', 'Environment :: Console',
'Intended Audience :: Developers', 'Intended Audience :: Developers',
'Intended Audience :: Science/Research', 'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
], ],
) )

View File

@ -5,38 +5,38 @@ stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
def main(argv): def main(argv):
fonts = {} fonts = {}
for line in fileinput.input(): for line in fileinput.input():
f = line.strip().split(' ') f = line.strip().split(' ')
if not f: continue if not f: continue
k = f[0] k = f[0]
if k == 'FontName': if k == 'FontName':
fontname = f[1] fontname = f[1]
props = {'FontName': fontname, 'Flags': 0} props = {'FontName': fontname, 'Flags': 0}
chars = {} chars = {}
fonts[fontname] = (props, chars) fonts[fontname] = (props, chars)
elif k == 'C': elif k == 'C':
cid = int(f[1]) cid = int(f[1])
if 0 <= cid and cid <= 255: if 0 <= cid and cid <= 255:
width = int(f[4]) width = int(f[4])
chars[cid] = width chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle', elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
'Ascender', 'Descender'): 'Ascender', 'Descender'):
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k) k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
props[k] = float(f[1]) props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'): elif k in ('FontName', 'FamilyName', 'Weight'):
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k) k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
props[k] = f[1] props[k] = f[1]
elif k == 'IsFixedPitch': elif k == 'IsFixedPitch':
if f[1].lower() == 'true': if f[1].lower() == 'true':
props['Flags'] = 64 props['Flags'] = 64
elif k == 'FontBBox': elif k == 'FontBBox':
props[k] = tuple(map(float, f[1:5])) props[k] = tuple(map(float, f[1:5]))
print '# -*- python -*-' print '# -*- python -*-'
print 'FONT_METRICS = {' print 'FONT_METRICS = {'
for (fontname,(props,chars)) in fonts.iteritems(): for (fontname,(props,chars)) in fonts.iteritems():
print ' %r: %r,' % (fontname, (props,chars)) print ' %r: %r,' % (fontname, (props,chars))
print '}' print '}'
return 0 return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -13,173 +13,173 @@ from pdfminer.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolv
# Characters that are written as numeric character references in the XML
# dump: control characters, & < > ( ) " ' backslash, DEL and \200-\377.
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')

def esc(s):
    """Return `s` with every character matched by ESC_PAT replaced by ``&#N;``,
    where N is the decimal character code."""
    return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
# dumpxml # dumpxml
def dumpxml(out, obj, codec=None):
    """Write an XML rendering of a parsed PDF object to `out`.

    out   -- object with a write() method.
    obj   -- None, dict, list, str, PDFStream, PDFObjRef, PSKeyword,
             PSLiteral, int or float; dicts and lists are dumped recursively.
    codec -- when 'text', the data of a PDFStream passed as `obj` is included
             (escaped with esc()); nested values are dumped without it, as in
             the original.

    Raises TypeError for any other kind of object.
    """
    if obj is None:
        # A null value inside a dict or list used to fall through to the
        # TypeError at the bottom and abort the whole dump.
        out.write('<null />')
        return
    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            out.write('<key>%s</key>\n' % k)
            out.write('<value>')
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return
    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return
    if isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
        return
    if isinstance(obj, PDFStream):
        out.write('<stream>\n<props>\n')
        dumpxml(out, obj.dic)
        out.write('\n</props>\n')
        if codec == 'text':
            data = obj.get_data()
            out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
        out.write('</stream>')
        return
    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d"/>' % obj.objid)
        return
    if isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
        return
    if isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
        return
    if isinstance(obj, int) or isinstance(obj, float):
        out.write('<number>%s</number>' % obj)
        return
    raise TypeError(obj)
# dumptrailers # dumptrailers
def dumptrailers(out, doc):
    """Write the trailer of every xref in `doc.xrefs` to `out`, each wrapped
    in a <trailer> element and rendered with dumpxml()."""
    for xref in doc.xrefs:
        out.write('<trailer>\n')
        dumpxml(out, xref.trailer)
        out.write('\n</trailer>\n\n')
    return
# dumpallobjs # dumpallobjs
def dumpallobjs(out, doc, codec=None):
    """Dump every object reachable through `doc.xrefs`, then the trailers.

    The output is wrapped in a <pdf> element.  Each object is fetched with
    doc.getobj() and rendered with dumpxml(); ids for which getobj() returns
    None are skipped.  `codec` is passed through to dumpxml().
    """
    out.write('<pdf>')
    # The same object id can be listed by more than one xref; dump it once.
    visited = set()
    for xref in doc.xrefs:
        for objid in xref.objids():
            if objid in visited:
                continue
            visited.add(objid)
            obj = doc.getobj(objid)
            if obj is None:
                continue
            out.write('<object id="%d">\n' % objid)
            dumpxml(out, obj, codec=codec)
            out.write('\n</object>\n\n')
    dumptrailers(out, doc)
    out.write('</pdf>')
    return
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline entries of the PDF file `fname` to `outfp`.

    One line is written per entry: the repr() of
    ``(level, title, dest, pageno)``.  `pageno` is the index of the
    destination page in doc.get_pages(), or None when the entry has no
    destination.

    `objids`, `pagenos`, `dumpall` and `codec` are accepted only so that the
    signature matches dumppdf(); they are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # page object id -> zero-based position of the page
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            # Release the parser and the file even when the document is broken.
            parser.close()
    finally:
        fp.close()
    return
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump parts of the PDF file `fname` to `outfp`.

    objids  -- object ids to dump.  A PDFStream is written as raw data when
               codec is 'raw', as get_data() output when codec is 'binary',
               and through dumpxml() otherwise.
    pagenos -- zero-based page indexes whose attributes are dumped.
    dumpall -- dump every object (see dumpallobjs()).

    When none of the three is given, only the trailers are dumped.  A final
    newline is written unless codec is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            if objids:
                for objid in objids:
                    obj = doc.getobj(objid)
                    if isinstance(obj, PDFStream) and codec == 'raw':
                        outfp.write(obj.get_rawdata())
                    elif isinstance(obj, PDFStream) and codec == 'binary':
                        outfp.write(obj.get_data())
                    else:
                        dumpxml(outfp, obj, codec=codec)
            if pagenos:
                for (pageno, page) in enumerate(doc.get_pages()):
                    if pageno in pagenos:
                        dumpxml(outfp, page.attrs)
            if dumpall:
                dumpallobjs(outfp, doc, codec=codec)
            if (not objids) and (not pagenos) and (not dumpall):
                dumptrailers(outfp, doc)
        finally:
            # dumpoutline() closes its parser; do the same here.
            parser.close()
    finally:
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
# main # main
def main(argv):
    """Command-line entry point of the PDF dumper.

    Parses the options in `argv`, then calls dumppdf() -- or dumpoutline()
    when -T is given -- for every file argument.  Returns 100 after printing
    the usage message when the options are invalid or no file is given,
    and None otherwise.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] [-o output] file ...' % argv[0])
        return 100
    try:
        # 'o:' was missing, so the -o branch below could never be reached.
        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:o:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    objids = []
    pagenos = set()
    codec = None
    password = ''
    dumpall = False
    proc = dumppdf
    outfp = sys.stdout
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-i': objids.extend( int(x) for x in v.split(',') )
        # page numbers are 1-based on the command line, 0-based internally
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-P': password = v
        elif k == '-a': dumpall = True
        elif k == '-r': codec = 'raw'
        elif k == '-b': codec = 'binary'
        elif k == '-t': codec = 'text'
        elif k == '-T': proc = dumpoutline
        elif k == '-o': outfp = open(v, 'wb')
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    #
    try:
        for fname in args:
            proc(outfp, fname, objids, pagenos, password=password,
                 dumpall=dumpall, codec=codec)
    finally:
        # Flush and release a file opened by -o; leave stdout alone.
        if outfp is not sys.stdout:
            outfp.close()
    return

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -27,16 +27,16 @@ from pdfminer.cmap import CMapDB
# quote HTML metacharacters # quote HTML metacharacters
def q(x):
    """Return `x` with the HTML metacharacters & > < " replaced by entities."""
    # '&' must be handled first so the entities added afterwards are not
    # escaped a second time.
    return (x.replace('&', '&amp;')
             .replace('>', '&gt;')
             .replace('<', '&lt;')
             .replace('"', '&quot;'))
# encode parameters as a URL # encode parameters as a URL
# Characters that must be percent-encoded in a parameter value.  The '-' is
# escaped on purpose: written as '.-=' it formed a range from '.' to '=',
# which silently exempted '/', ':', ';' and '<' from encoding.
Q = re.compile(r'[^a-zA-Z0-9_.\-=]')

def url(base, **kw):
    """Return `base` followed by the keyword arguments as ``k=v`` pairs
    joined with '&'.

    Each value is passed through q(), encoded with the module-level
    `encoder`, and every character matched by Q is replaced by ``%XX``.
    """
    r = []
    for (k, v) in kw.items():
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
        r.append('%s=%s' % (k, v))
    return base+'&'.join(r)
## convert ## convert
@ -44,156 +44,156 @@ def url(base, **kw):
class FileSizeExceeded(ValueError):
    """Raised by convert() when the uploaded data is larger than allowed."""
    pass

def convert(outfp, infp, path, codec='utf-8', maxpages=10,
            maxfilesize=5000000, pagenos=None, html=True):
    """Save an uploaded PDF to `path` and write its conversion to `outfp`.

    infp        -- file-like object the PDF is read from; it is closed here.
    path        -- where the upload is stored before processing.
    maxfilesize -- upper bound on the upload size in bytes; 0 or None
                   disables the check.
    html        -- use HTMLConverter when true, TextConverter otherwise.
    codec, maxpages, pagenos are handed to the converter / process_pdf().

    Raises FileSizeExceeded(maxfilesize) as soon as more than `maxfilesize`
    bytes have been read.
    """
    # save the input file.
    nbytes = 0
    src = open(path, 'wb')
    try:
        while 1:
            data = infp.read(4096)
            if not data: break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    finally:
        # Close both ends even when the size limit or a read error aborts the
        # copy, so the caller can always remove the temporary file.
        src.close()
        infp.close()
    # perform conversion and
    # send the results over the network.
    CMapDB.initialize()
    rsrc = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
    else:
        device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
    # NOTE(review): the command-line converter in this project calls
    # device.close() after processing; this function never did -- confirm
    # whether the converters need it to finish their output.
    fp = open(path, 'rb')
    try:
        process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
    finally:
        fp.close()
    return
## PDF2HTMLApp ## PDF2HTMLApp
## ##
class PDF2HTMLApp(object):
    """CGI application that converts an uploaded PDF file to HTML or text.

    GET '/' shows the upload form; a request to APPURL with fields 'f' (the
    file), 'c' (the submit button pressed) and optionally 'p' (page numbers)
    runs the conversion and sends the result to `outfp`.
    """

    APPURL = '/convert'          # path that accepts the upload
    TMPDIR = './var/'            # directory for temporary copies of uploads
    LOGPATH = './var/log'        # default log file
    MAXFILESIZE = 5000000        # upload limit in bytes
    MAXPAGES = 10                # maximum number of pages converted

    def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
        """Read the CGI environment and the submitted form; configure logging."""
        self.outfp = outfp
        self.codec = codec
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                            level=loglevel, filename=logpath, filemode='a')
        self.remote_addr = os.environ.get('REMOTE_ADDR')
        self.path_info = os.environ.get('PATH_INFO')
        self.method = os.environ.get('REQUEST_METHOD', 'GET')
        self.server = os.environ.get('SERVER_SOFTWARE', '')
        self.content_type = 'text/html; charset=%s' % codec
        self.cur_time = time.time()
        self.form = cgi.FieldStorage()
        return

    def put(self, *args):
        """Write each argument to the output; unicode strings are encoded with
        self.codec first, anything that is neither str nor unicode is ignored."""
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def http_200(self):
        """Send the headers of a successful response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 200 OK\r\n')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def http_404(self):
        """Send a complete "not found" response."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 404 Not Found\r\n')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        # the closing tag used to be a second </body>
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def http_301(self, url):
        """Send a redirect to `url`."""
        if self.server.startswith('cgi-httpd'):
            # required for cgi-httpd
            self.outfp.write('HTTP/1.0 301 Moved\r\n')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def bummer(self, msg):
        """Log `msg` and send it to the client as a minimal HTML error page.

        run() has always called this method, but the class never defined it,
        so a missing TMPDIR ended in AttributeError.
        """
        logging.error('bummer: %s' % msg)
        self.http_200()
        self.put('<html><body><p>Sorry, an error has occured: %s</body></html>\n' % q(msg))
        return

    def coverpage(self):
        """Send the body of the upload form page."""
        self.put(
            '<html><head><title>pdf2html demo</title></head><body>\n',
            '<h1>pdf2html demo</h1><hr>\n',
            '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
            '<p>Upload PDF File: <input name="f" type="file" value="">\n',
            '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
            '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
            'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
            '<p><input type="submit" name="c" value="Convert to HTML">\n',
            '<input type="submit" name="c" value="Convert to TEXT">\n',
            '<input type="reset" value="Reset">\n',
            '</form><hr>\n',
            '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
            '</body></html>\n',
            )
        return

    def run(self, argv):
        """Handle the current request.  `argv` is not used."""
        if self.path_info == '/':
            self.http_200()
            self.coverpage()
            return
        if self.path_info != self.APPURL:
            self.http_404()
            return
        if not os.path.isdir(self.TMPDIR):
            self.bummer('error')
            return
        # Incomplete submissions are sent back to the form.
        if 'f' not in self.form:
            self.http_301('/')
            return
        if 'c' not in self.form:
            self.http_301('/')
            return
        item = self.form['f']
        if not (item.file and item.filename):
            self.http_301('/')
            return
        cmd = self.form.getvalue('c')
        html = (cmd == 'Convert to HTML')
        # NOTE(review): the command-line tools in this project subtract 1 from
        # user-supplied page numbers before passing them on; here they are
        # passed unchanged -- confirm which base process_pdf() expects.
        pagenos = []
        if 'p' in self.form:
            for m in re.finditer(r'\d+', self.form.getvalue('p')):
                pagenos.append(int(m.group(0)))
        logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # cur_time is a float; '%x' needs an integer.
        tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (int(self.cur_time), h))
        try:
            try:
                if not html:
                    self.content_type = 'text/plain; charset=%s' % self.codec
                self.http_200()
                # Write to the stream this application was given rather than
                # to sys.stdout directly, like every other method does.
                convert(self.outfp, item.file, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception:
                # sys.exc_info() is valid syntax on old and new interpreters.
                e = sys.exc_info()[1]
                self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
                logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
        finally:
            try:
                os.remove(tmppath)
            except OSError:
                # best effort: the file may never have been created
                pass
        return
# main # main

View File

@ -9,85 +9,85 @@ from pdfminer.layout import LAParams
# main # main
def main(argv):
    """Command-line entry point of the PDF text extractor.

    Parses the options in `argv`, builds the converter selected by -t (or,
    without -t, by the extension of the -o file; plain text by default) and
    runs process_pdf() on every file argument.  Returns 100 after printing
    the usage message when the options are invalid, no file is given or the
    output type is unknown, and None otherwise.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-c codec] '
               '[-C cmapdir] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-s scale] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
        return 100
    try:
        # 's:' was missing although -s is handled below; 'D:' was listed twice.
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:m:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # path option
    cmapdir = find_cmap_path()
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-C': cmapdir = v
        elif k == '-P': password = v
        # page numbers are 1-based on the command line, 0-based internally
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-o': outfile = v
        elif k == '-s': scale = float(v)
        elif k == '-D': laparams.direction = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    CMapDB.initialize(cmapdir)
    rsrc = PDFResourceManager()
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.sgml'):
                outtype = 'sgml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created.
    if outtype not in ('text', 'sgml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                fp = open(fname, 'rb')
                try:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        # Close a file opened for -o; leave stdout alone.
        if outfp is not sys.stdout:
            outfp.close()
    return

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -2,29 +2,29 @@
import sys import sys
def prof_main(argv):
    """Profile a function with hotshot, or print a previously saved profile.

    ``argv[1]`` names the target as ``module.function``.  With further
    arguments the function is called once as ``function([argv[0]] + args)``
    under the profiler and the data is saved to ``<module.function>.prof``.
    With no further arguments that file is loaded and its statistics are
    printed, sorted by time and call count.

    Returns 100 after printing the usage message when the target is missing
    or contains no '.', and None otherwise.
    """
    def usage():
        print('usage: %s module.function [args ...]' % argv[0])
        return 100
    args = argv[1:]
    if len(args) < 1: return usage()
    name = args.pop(0)
    # A bare name used to escape as ValueError from rindex().
    if '.' not in name: return usage()
    prof = name+'.prof'
    (modname, funcname) = name.rsplit('.', 1)
    # fromlist must be a sequence of names; a non-empty one makes __import__
    # return the innermost module of a dotted path.
    module = __import__(modname, fromlist=[funcname])
    func = getattr(module, funcname)
    # Imported only once they are needed, so the usage paths above do not
    # depend on the profiler being available.
    import hotshot, hotshot.stats
    if args:
        args.insert(0, argv[0])
        prof = hotshot.Profile(prof)
        try:
            prof.runcall(lambda : func(args))
        finally:
            # close the profile log even if the profiled call fails
            prof.close()
    else:
        stats = hotshot.stats.load(prof)
        stats.strip_dirs()
        stats.sort_stats('time', 'calls')
        stats.print_stats(1000)
    return

if __name__ == '__main__': sys.exit(prof_main(sys.argv))