to 4-space indentation
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
a09b71d89d
commit
7790808560
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Oct 24 12:42:25 JST 2009
|
Last Modified: Sat Oct 24 13:40:19 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -352,7 +352,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
<li> 2009/10/24: Charspace bug fixed.
|
<li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
|
||||||
<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
|
<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
|
||||||
<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
|
<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
|
||||||
<li> 2009/08/30: Fixed page rotation handling.
|
<li> 2009/08/30: Fixed page rotation handling.
|
||||||
|
|
|
@ -8,37 +8,37 @@
|
||||||
## Arcfour
|
## Arcfour
|
||||||
##
|
##
|
||||||
class Arcfour(object):
|
class Arcfour(object):
|
||||||
|
|
||||||
def __init__(self, key):
|
|
||||||
s = range(256)
|
|
||||||
j = 0
|
|
||||||
klen = len(key)
|
|
||||||
for i in xrange(256):
|
|
||||||
j = (j + s[i] + ord(key[i % klen])) % 256
|
|
||||||
(s[i], s[j]) = (s[j], s[i])
|
|
||||||
self.s = s
|
|
||||||
(self.i, self.j) = (0, 0)
|
|
||||||
return
|
|
||||||
|
|
||||||
def process(self, data):
|
def __init__(self, key):
|
||||||
(i, j) = (self.i, self.j)
|
s = range(256)
|
||||||
s = self.s
|
j = 0
|
||||||
r = ''
|
klen = len(key)
|
||||||
for c in data:
|
for i in xrange(256):
|
||||||
i = (i+1) % 256
|
j = (j + s[i] + ord(key[i % klen])) % 256
|
||||||
j = (j+s[i]) % 256
|
(s[i], s[j]) = (s[j], s[i])
|
||||||
(s[i], s[j]) = (s[j], s[i])
|
self.s = s
|
||||||
k = s[(s[i]+s[j]) % 256]
|
(self.i, self.j) = (0, 0)
|
||||||
r += chr(ord(c) ^ k)
|
return
|
||||||
(self.i, self.j) = (i, j)
|
|
||||||
return r
|
def process(self, data):
|
||||||
|
(i, j) = (self.i, self.j)
|
||||||
|
s = self.s
|
||||||
|
r = ''
|
||||||
|
for c in data:
|
||||||
|
i = (i+1) % 256
|
||||||
|
j = (j+s[i]) % 256
|
||||||
|
(s[i], s[j]) = (s[j], s[i])
|
||||||
|
k = s[(s[i]+s[j]) % 256]
|
||||||
|
r += chr(ord(c) ^ k)
|
||||||
|
(self.i, self.j) = (i, j)
|
||||||
|
return r
|
||||||
|
|
||||||
# test
|
# test
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
def doit(key, data):
|
def doit(key, data):
|
||||||
cipher = Arcfour(key)
|
cipher = Arcfour(key)
|
||||||
return ''.join( '%02X' % ord(c) for c in cipher.process(data) )
|
return ''.join( '%02X' % ord(c) for c in cipher.process(data) )
|
||||||
assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3'
|
assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3'
|
||||||
assert doit("Wiki", "pedia") == '1021BF0420'
|
assert doit("Wiki", "pedia") == '1021BF0420'
|
||||||
assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5'
|
assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5'
|
||||||
print 'test succeeded'
|
print 'test succeeded'
|
||||||
|
|
|
@ -6,72 +6,72 @@
|
||||||
|
|
||||||
# ascii85decode(data)
|
# ascii85decode(data)
|
||||||
def ascii85decode(data):
|
def ascii85decode(data):
|
||||||
import struct
|
import struct
|
||||||
n = b = 0
|
n = b = 0
|
||||||
out = ''
|
out = ''
|
||||||
for c in data:
|
for c in data:
|
||||||
if '!' <= c and c <= 'u':
|
if '!' <= c and c <= 'u':
|
||||||
n += 1
|
n += 1
|
||||||
b = b*85+(ord(c)-33)
|
b = b*85+(ord(c)-33)
|
||||||
if n == 5:
|
if n == 5:
|
||||||
out += struct.pack('>L',b)
|
out += struct.pack('>L',b)
|
||||||
n = b = 0
|
n = b = 0
|
||||||
elif c == 'z':
|
elif c == 'z':
|
||||||
assert n == 0
|
assert n == 0
|
||||||
out += '\0\0\0\0'
|
out += '\0\0\0\0'
|
||||||
elif c == '~':
|
elif c == '~':
|
||||||
if n:
|
if n:
|
||||||
for _ in range(5-n):
|
for _ in range(5-n):
|
||||||
b = b*85+84
|
b = b*85+84
|
||||||
out += struct.pack('>L',b)[:n-1]
|
out += struct.pack('>L',b)[:n-1]
|
||||||
break
|
break
|
||||||
return out
|
return out
|
||||||
|
|
||||||
# asciihexdecode(data)
|
# asciihexdecode(data)
|
||||||
def asciihexdecode(data):
|
def asciihexdecode(data):
|
||||||
"""
|
"""
|
||||||
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
||||||
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
|
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
|
||||||
ASCIIHexDecode filter produces one byte of binary data. All white-space
|
ASCIIHexDecode filter produces one byte of binary data. All white-space
|
||||||
characters are ignored. A right angle bracket character (>) indicates
|
characters are ignored. A right angle bracket character (>) indicates
|
||||||
EOD. Any other characters will cause an error. If the filter encounters
|
EOD. Any other characters will cause an error. If the filter encounters
|
||||||
the EOD marker after reading an odd number of hexadecimal digits, it
|
the EOD marker after reading an odd number of hexadecimal digits, it
|
||||||
will behave as if a 0 followed the last digit.
|
will behave as if a 0 followed the last digit.
|
||||||
>>> asciihexdecode("61 62 2e6364 65")
|
>>> asciihexdecode("61 62 2e6364 65")
|
||||||
'ab.cde'
|
'ab.cde'
|
||||||
>>> asciihexdecode("61 62 2e6364 657>")
|
>>> asciihexdecode("61 62 2e6364 657>")
|
||||||
'ab.cdep'
|
'ab.cdep'
|
||||||
>>> asciihexdecode("7>")
|
>>> asciihexdecode("7>")
|
||||||
'p'
|
'p'
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
||||||
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||||
decode = (lambda hx: chr(int(hx, 16)))
|
decode = (lambda hx: chr(int(hx, 16)))
|
||||||
out = map(decode, hex_re.findall(data))
|
out = map(decode, hex_re.findall(data))
|
||||||
m = trail_re.search(data)
|
m = trail_re.search(data)
|
||||||
if m:
|
if m:
|
||||||
out.append(decode("%c0" % m.group(1)))
|
out.append(decode("%c0" % m.group(1)))
|
||||||
return ''.join(out)
|
return ''.join(out)
|
||||||
|
|
||||||
|
|
||||||
# test
|
# test
|
||||||
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
|
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
orig = r'''
|
orig = r'''
|
||||||
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
|
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
|
||||||
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
|
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
|
||||||
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
|
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
|
||||||
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
|
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
|
||||||
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
|
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
|
||||||
'''
|
'''
|
||||||
data = \
|
data = \
|
||||||
'Man is distinguished, not only by his reason, but by this singular passion from '\
|
'Man is distinguished, not only by his reason, but by this singular passion from '\
|
||||||
'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
|
'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
|
||||||
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
|
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
|
||||||
'any carnal pleasure.'
|
'any carnal pleasure.'
|
||||||
assert ascii85decode(orig) == data
|
assert ascii85decode(orig) == data
|
||||||
print 'ascii85decode test succeeded'
|
print 'ascii85decode test succeeded'
|
||||||
|
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
|
764
pdfminer/cmap.py
764
pdfminer/cmap.py
|
@ -10,9 +10,9 @@ from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser
|
||||||
try:
|
try:
|
||||||
import cdb
|
import cdb
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pdfminer.pycdb as cdb
|
import pdfminer.pycdb as cdb
|
||||||
|
|
||||||
|
|
||||||
class CMapError(Exception): pass
|
class CMapError(Exception): pass
|
||||||
|
@ -21,449 +21,449 @@ class CMapError(Exception): pass
|
||||||
## find_cmap_path
|
## find_cmap_path
|
||||||
##
|
##
|
||||||
def find_cmap_path():
|
def find_cmap_path():
|
||||||
try:
|
try:
|
||||||
return os.environ['CMAP_PATH']
|
return os.environ['CMAP_PATH']
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
basedir = os.path.dirname(__file__)
|
basedir = os.path.dirname(__file__)
|
||||||
return os.path.join(basedir, 'CMap')
|
return os.path.join(basedir, 'CMap')
|
||||||
|
|
||||||
|
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
STRIP_NAME = re.compile(r'[0-9]+')
|
||||||
def name2unicode(name):
|
def name2unicode(name):
|
||||||
if name in charname2unicode:
|
if name in charname2unicode:
|
||||||
return charname2unicode[name]
|
return charname2unicode[name]
|
||||||
m = STRIP_NAME.search(name)
|
m = STRIP_NAME.search(name)
|
||||||
if not m: raise KeyError(name)
|
if not m: raise KeyError(name)
|
||||||
return int(m.group(0))
|
return int(m.group(0))
|
||||||
|
|
||||||
|
|
||||||
## CMap
|
## CMap
|
||||||
##
|
##
|
||||||
class CMap(object):
|
class CMap(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.code2cid = {}
|
|
||||||
self.cid2code = {}
|
|
||||||
self.attrs = {}
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self):
|
||||||
return '<CMap: %s>' % self.attrs.get('CMapName')
|
self.code2cid = {}
|
||||||
|
self.cid2code = {}
|
||||||
|
self.attrs = {}
|
||||||
|
return
|
||||||
|
|
||||||
def update(self, code2cid=None, cid2code=None):
|
def __repr__(self):
|
||||||
if code2cid:
|
return '<CMap: %s>' % self.attrs.get('CMapName')
|
||||||
self.code2cid.update(code2cid)
|
|
||||||
if cid2code:
|
|
||||||
self.cid2code.update(cid2code)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def copycmap(self, cmap):
|
|
||||||
self.code2cid.update(cmap.getall_code2cid())
|
|
||||||
self.cid2code.update(cmap.getall_cid2code())
|
|
||||||
return self
|
|
||||||
|
|
||||||
def register_code2cid(self, code, cid):
|
def update(self, code2cid=None, cid2code=None):
|
||||||
if isinstance(code, str) and isinstance(cid, int):
|
if code2cid:
|
||||||
self.code2cid[code] = cid
|
self.code2cid.update(code2cid)
|
||||||
return self
|
if cid2code:
|
||||||
|
self.cid2code.update(cid2code)
|
||||||
|
return self
|
||||||
|
|
||||||
def register_cid2code(self, cid, code):
|
def copycmap(self, cmap):
|
||||||
if isinstance(cid, int):
|
self.code2cid.update(cmap.getall_code2cid())
|
||||||
if isinstance(code, PSLiteral):
|
self.cid2code.update(cmap.getall_cid2code())
|
||||||
self.cid2code[cid] = pack('>H', name2unicode(code.name))
|
return self
|
||||||
elif isinstance(code, str):
|
|
||||||
self.cid2code[cid] = code
|
|
||||||
return self
|
|
||||||
|
|
||||||
def decode(self, bytes):
|
def register_code2cid(self, code, cid):
|
||||||
if self.debug:
|
if isinstance(code, str) and isinstance(cid, int):
|
||||||
print >>stderr, 'decode: %r, %r' % (self, bytes)
|
self.code2cid[code] = cid
|
||||||
x = ''
|
return self
|
||||||
for c in bytes:
|
|
||||||
if x:
|
def register_cid2code(self, cid, code):
|
||||||
if x+c in self.code2cid:
|
if isinstance(cid, int):
|
||||||
yield self.code2cid[x+c]
|
if isinstance(code, PSLiteral):
|
||||||
|
self.cid2code[cid] = pack('>H', name2unicode(code.name))
|
||||||
|
elif isinstance(code, str):
|
||||||
|
self.cid2code[cid] = code
|
||||||
|
return self
|
||||||
|
|
||||||
|
def decode(self, bytes):
|
||||||
|
if self.debug:
|
||||||
|
print >>stderr, 'decode: %r, %r' % (self, bytes)
|
||||||
x = ''
|
x = ''
|
||||||
elif c in self.code2cid:
|
for c in bytes:
|
||||||
yield self.code2cid[c]
|
if x:
|
||||||
else:
|
if x+c in self.code2cid:
|
||||||
x = c
|
yield self.code2cid[x+c]
|
||||||
return
|
x = ''
|
||||||
|
elif c in self.code2cid:
|
||||||
def is_vertical(self):
|
yield self.code2cid[c]
|
||||||
return self.attrs.get('WMode', 0)
|
else:
|
||||||
|
x = c
|
||||||
|
return
|
||||||
|
|
||||||
def tocid(self, code):
|
def is_vertical(self):
|
||||||
return self.code2cid.get(code)
|
return self.attrs.get('WMode', 0)
|
||||||
def tocode(self, cid):
|
|
||||||
return self.cid2code.get(cid)
|
def tocid(self, code):
|
||||||
|
return self.code2cid.get(code)
|
||||||
|
def tocode(self, cid):
|
||||||
|
return self.cid2code.get(cid)
|
||||||
|
|
||||||
|
def getall_attrs(self):
|
||||||
|
return self.attrs.iteritems()
|
||||||
|
def getall_code2cid(self):
|
||||||
|
return self.code2cid.iteritems()
|
||||||
|
def getall_cid2code(self):
|
||||||
|
return self.cid2code.iteritems()
|
||||||
|
|
||||||
def getall_attrs(self):
|
|
||||||
return self.attrs.iteritems()
|
|
||||||
def getall_code2cid(self):
|
|
||||||
return self.code2cid.iteritems()
|
|
||||||
def getall_cid2code(self):
|
|
||||||
return self.cid2code.iteritems()
|
|
||||||
|
|
||||||
|
|
||||||
## CDBCMap
|
## CDBCMap
|
||||||
##
|
##
|
||||||
class CDBCMap(CMap):
|
class CDBCMap(CMap):
|
||||||
|
|
||||||
def __init__(self, cdbname):
|
|
||||||
CMap.__init__(self)
|
|
||||||
self.cdbname = cdbname
|
|
||||||
self.db = cdb.init(cdbname)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, cdbname):
|
||||||
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
|
CMap.__init__(self)
|
||||||
|
self.cdbname = cdbname
|
||||||
|
self.db = cdb.init(cdbname)
|
||||||
|
return
|
||||||
|
|
||||||
def tocid(self, code):
|
def __repr__(self):
|
||||||
k = 'c'+code
|
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
|
||||||
if not self.db.has_key(k):
|
|
||||||
return None
|
|
||||||
return unpack('>L', self.db[k])
|
|
||||||
def tocode(self, cid):
|
|
||||||
k = 'i'+pack('>L', cid)
|
|
||||||
if not self.db.has_key(k):
|
|
||||||
return None
|
|
||||||
return self.db[k]
|
|
||||||
|
|
||||||
def is_vertical(self):
|
|
||||||
return (self.db.has_key('/WMode') and
|
|
||||||
self.db['/WMode'] == '1')
|
|
||||||
|
|
||||||
def getall(self, c):
|
def tocid(self, code):
|
||||||
while 1:
|
k = 'c'+code
|
||||||
x = self.db.each()
|
if not self.db.has_key(k):
|
||||||
if not x: break
|
return None
|
||||||
(k,v) = x
|
return unpack('>L', self.db[k])
|
||||||
if k.startswith(c):
|
def tocode(self, cid):
|
||||||
yield (k[1:], unpack('>L', v)[0])
|
k = 'i'+pack('>L', cid)
|
||||||
return
|
if not self.db.has_key(k):
|
||||||
|
return None
|
||||||
|
return self.db[k]
|
||||||
|
|
||||||
def getall_attrs(self):
|
def is_vertical(self):
|
||||||
while 1:
|
return (self.db.has_key('/WMode') and
|
||||||
x = self.db.each()
|
self.db['/WMode'] == '1')
|
||||||
if not x: break
|
|
||||||
(k,v) = x
|
|
||||||
if k.startswith('/'):
|
|
||||||
yield (k[1:], eval(v)[0])
|
|
||||||
return
|
|
||||||
|
|
||||||
def getall_cid2code(self):
|
|
||||||
return self.getall('i')
|
|
||||||
def getall_code2cid(self):
|
|
||||||
return self.getall('c')
|
|
||||||
|
|
||||||
def decode(self, bytes):
|
def getall(self, c):
|
||||||
if self.debug:
|
while 1:
|
||||||
print >>stderr, 'decode: %r, %r' % (self, bytes)
|
x = self.db.each()
|
||||||
x = ''
|
if not x: break
|
||||||
for c in bytes:
|
(k,v) = x
|
||||||
if x:
|
if k.startswith(c):
|
||||||
if x+c in self.code2cid:
|
yield (k[1:], unpack('>L', v)[0])
|
||||||
yield self.code2cid[x+c]
|
return
|
||||||
elif self.db.has_key('c'+x+c):
|
|
||||||
(dest,) = unpack('>L', self.db['c'+x+c])
|
def getall_attrs(self):
|
||||||
self.code2cid[x+c] = dest
|
while 1:
|
||||||
yield dest
|
x = self.db.each()
|
||||||
|
if not x: break
|
||||||
|
(k,v) = x
|
||||||
|
if k.startswith('/'):
|
||||||
|
yield (k[1:], eval(v)[0])
|
||||||
|
return
|
||||||
|
|
||||||
|
def getall_cid2code(self):
|
||||||
|
return self.getall('i')
|
||||||
|
def getall_code2cid(self):
|
||||||
|
return self.getall('c')
|
||||||
|
|
||||||
|
def decode(self, bytes):
|
||||||
|
if self.debug:
|
||||||
|
print >>stderr, 'decode: %r, %r' % (self, bytes)
|
||||||
x = ''
|
x = ''
|
||||||
elif c in self.code2cid:
|
for c in bytes:
|
||||||
yield self.code2cid[c]
|
if x:
|
||||||
elif self.db.has_key('c'+c):
|
if x+c in self.code2cid:
|
||||||
(dest,) = unpack('>L', self.db['c'+c])
|
yield self.code2cid[x+c]
|
||||||
self.code2cid[c] = dest
|
elif self.db.has_key('c'+x+c):
|
||||||
yield dest
|
(dest,) = unpack('>L', self.db['c'+x+c])
|
||||||
else:
|
self.code2cid[x+c] = dest
|
||||||
x = c
|
yield dest
|
||||||
return
|
x = ''
|
||||||
|
elif c in self.code2cid:
|
||||||
|
yield self.code2cid[c]
|
||||||
|
elif self.db.has_key('c'+c):
|
||||||
|
(dest,) = unpack('>L', self.db['c'+c])
|
||||||
|
self.code2cid[c] = dest
|
||||||
|
yield dest
|
||||||
|
else:
|
||||||
|
x = c
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## CMapDB
|
## CMapDB
|
||||||
##
|
##
|
||||||
class CMapDB(object):
|
class CMapDB(object):
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
class CMapNotFound(CMapError): pass
|
||||||
|
|
||||||
CMAP_ALIAS = {
|
|
||||||
}
|
|
||||||
|
|
||||||
debug = 0
|
|
||||||
dirname = None
|
|
||||||
cdbdirname = None
|
|
||||||
cmapdb = {}
|
|
||||||
|
|
||||||
@classmethod
|
CMAP_ALIAS = {
|
||||||
def initialize(klass, dirname=None, cdbdirname=None):
|
}
|
||||||
if not dirname:
|
|
||||||
dirname = find_cmap_path()
|
|
||||||
klass.dirname = dirname
|
|
||||||
klass.cdbdirname = cdbdirname or dirname
|
|
||||||
return
|
|
||||||
|
|
||||||
@classmethod
|
debug = 0
|
||||||
def get_cmap(klass, cmapname, strict=True):
|
dirname = None
|
||||||
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
|
cdbdirname = None
|
||||||
if cmapname in klass.cmapdb:
|
cmapdb = {}
|
||||||
cmap = klass.cmapdb[cmapname]
|
|
||||||
else:
|
@classmethod
|
||||||
fname = os.path.join(klass.dirname, cmapname)
|
def initialize(klass, dirname=None, cdbdirname=None):
|
||||||
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
|
if not dirname:
|
||||||
if os.path.exists(cdbname):
|
dirname = find_cmap_path()
|
||||||
if 1 <= klass.debug:
|
klass.dirname = dirname
|
||||||
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
|
klass.cdbdirname = cdbdirname or dirname
|
||||||
cmap = CDBCMap(cdbname)
|
return
|
||||||
elif os.path.exists(fname):
|
|
||||||
if 1 <= klass.debug:
|
@classmethod
|
||||||
print >>stderr, 'Reading: CMap %r...' % fname
|
def get_cmap(klass, cmapname, strict=True):
|
||||||
cmap = CMap()
|
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
|
||||||
fp = file(fname, 'rb')
|
if cmapname in klass.cmapdb:
|
||||||
CMapParser(cmap, fp).run()
|
cmap = klass.cmapdb[cmapname]
|
||||||
fp.close()
|
else:
|
||||||
elif not strict:
|
fname = os.path.join(klass.dirname, cmapname)
|
||||||
cmap = CMap() # just create empty cmap
|
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
|
||||||
else:
|
if os.path.exists(cdbname):
|
||||||
raise CMapDB.CMapNotFound(cmapname)
|
if 1 <= klass.debug:
|
||||||
klass.cmapdb[cmapname] = cmap
|
print >>stderr, 'Opening: CDBCMap %r...' % cdbname
|
||||||
return cmap
|
cmap = CDBCMap(cdbname)
|
||||||
|
elif os.path.exists(fname):
|
||||||
|
if 1 <= klass.debug:
|
||||||
|
print >>stderr, 'Reading: CMap %r...' % fname
|
||||||
|
cmap = CMap()
|
||||||
|
fp = file(fname, 'rb')
|
||||||
|
CMapParser(cmap, fp).run()
|
||||||
|
fp.close()
|
||||||
|
elif not strict:
|
||||||
|
cmap = CMap() # just create empty cmap
|
||||||
|
else:
|
||||||
|
raise CMapDB.CMapNotFound(cmapname)
|
||||||
|
klass.cmapdb[cmapname] = cmap
|
||||||
|
return cmap
|
||||||
|
|
||||||
|
|
||||||
## CMapParser
|
## CMapParser
|
||||||
##
|
##
|
||||||
class CMapParser(PSStackParser):
|
class CMapParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, cmap, fp):
|
def __init__(self, cmap, fp):
|
||||||
PSStackParser.__init__(self, fp)
|
PSStackParser.__init__(self, fp)
|
||||||
self.cmap = cmap
|
self.cmap = cmap
|
||||||
self.in_cmap = False
|
self.in_cmap = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
self.nextobject()
|
self.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
name = token.name
|
name = token.name
|
||||||
if name == 'begincmap':
|
if name == 'begincmap':
|
||||||
self.in_cmap = True
|
self.in_cmap = True
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
elif name == 'endcmap':
|
elif name == 'endcmap':
|
||||||
self.in_cmap = False
|
self.in_cmap = False
|
||||||
return
|
return
|
||||||
if not self.in_cmap: return
|
if not self.in_cmap: return
|
||||||
#
|
#
|
||||||
if name == 'def':
|
if name == 'def':
|
||||||
try:
|
try:
|
||||||
((_,k),(_,v)) = self.pop(2)
|
((_,k),(_,v)) = self.pop(2)
|
||||||
self.cmap.attrs[literal_name(k)] = v
|
self.cmap.attrs[literal_name(k)] = v
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if name == 'usecmap':
|
|
||||||
try:
|
|
||||||
((_,cmapname),) = self.pop(1)
|
|
||||||
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
|
|
||||||
except PSSyntaxError:
|
|
||||||
pass
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'begincodespacerange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endcodespacerange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'begincidrange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endcidrange':
|
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
|
||||||
for (s,e,cid) in choplist(3, objs):
|
|
||||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
|
||||||
not isinstance(cid, int) or len(s) != len(e)): continue
|
|
||||||
sprefix = s[:-4]
|
|
||||||
eprefix = e[:-4]
|
|
||||||
if sprefix != eprefix: continue
|
|
||||||
svar = s[-4:]
|
|
||||||
evar = e[-4:]
|
|
||||||
s1 = nunpack(svar)
|
|
||||||
e1 = nunpack(evar)
|
|
||||||
vlen = len(svar)
|
|
||||||
#assert s1 <= e1
|
|
||||||
for i in xrange(e1-s1+1):
|
|
||||||
x = sprefix+pack('>L',s1+i)[-vlen:]
|
|
||||||
self.cmap.register_code2cid(x, cid+i)
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'begincidchar':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endcidchar':
|
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
|
||||||
for (cid,code) in choplist(2, objs):
|
|
||||||
if isinstance(code, str) and isinstance(cid, str):
|
|
||||||
self.cmap.register_code2cid(code, nunpack(cid))
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'beginbfrange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endbfrange':
|
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
|
||||||
for (s,e,code) in choplist(3, objs):
|
|
||||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
|
||||||
len(s) != len(e)): continue
|
|
||||||
s1 = nunpack(s)
|
|
||||||
e1 = nunpack(e)
|
|
||||||
#assert s1 <= e1
|
|
||||||
if isinstance(code, list):
|
|
||||||
for i in xrange(e1-s1+1):
|
|
||||||
self.cmap.register_cid2code(s1+i, code[i])
|
|
||||||
else:
|
|
||||||
var = code[-4:]
|
|
||||||
base = nunpack(var)
|
|
||||||
prefix = code[:-4]
|
|
||||||
vlen = len(var)
|
|
||||||
for i in xrange(e1-s1+1):
|
|
||||||
x = prefix+pack('>L',base+i)[-vlen:]
|
|
||||||
self.cmap.register_cid2code(s1+i, x)
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'beginbfchar':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endbfchar':
|
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
|
||||||
for (cid,code) in choplist(2, objs):
|
|
||||||
if isinstance(cid, str) and isinstance(code, str):
|
|
||||||
self.cmap.register_cid2code(nunpack(cid), code)
|
|
||||||
return
|
|
||||||
|
|
||||||
if name == 'beginnotdefrange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
if name == 'endnotdefrange':
|
|
||||||
self.popall()
|
|
||||||
return
|
|
||||||
|
|
||||||
self.push((pos, token))
|
if name == 'usecmap':
|
||||||
return
|
try:
|
||||||
|
((_,cmapname),) = self.pop(1)
|
||||||
|
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||||
|
except PSSyntaxError:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'begincodespacerange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endcodespacerange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'begincidrange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endcidrange':
|
||||||
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
|
for (s,e,cid) in choplist(3, objs):
|
||||||
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
|
not isinstance(cid, int) or len(s) != len(e)): continue
|
||||||
|
sprefix = s[:-4]
|
||||||
|
eprefix = e[:-4]
|
||||||
|
if sprefix != eprefix: continue
|
||||||
|
svar = s[-4:]
|
||||||
|
evar = e[-4:]
|
||||||
|
s1 = nunpack(svar)
|
||||||
|
e1 = nunpack(evar)
|
||||||
|
vlen = len(svar)
|
||||||
|
#assert s1 <= e1
|
||||||
|
for i in xrange(e1-s1+1):
|
||||||
|
x = sprefix+pack('>L',s1+i)[-vlen:]
|
||||||
|
self.cmap.register_code2cid(x, cid+i)
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'begincidchar':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endcidchar':
|
||||||
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
|
for (cid,code) in choplist(2, objs):
|
||||||
|
if isinstance(code, str) and isinstance(cid, str):
|
||||||
|
self.cmap.register_code2cid(code, nunpack(cid))
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'beginbfrange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endbfrange':
|
||||||
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
|
for (s,e,code) in choplist(3, objs):
|
||||||
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
|
len(s) != len(e)): continue
|
||||||
|
s1 = nunpack(s)
|
||||||
|
e1 = nunpack(e)
|
||||||
|
#assert s1 <= e1
|
||||||
|
if isinstance(code, list):
|
||||||
|
for i in xrange(e1-s1+1):
|
||||||
|
self.cmap.register_cid2code(s1+i, code[i])
|
||||||
|
else:
|
||||||
|
var = code[-4:]
|
||||||
|
base = nunpack(var)
|
||||||
|
prefix = code[:-4]
|
||||||
|
vlen = len(var)
|
||||||
|
for i in xrange(e1-s1+1):
|
||||||
|
x = prefix+pack('>L',base+i)[-vlen:]
|
||||||
|
self.cmap.register_cid2code(s1+i, x)
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'beginbfchar':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endbfchar':
|
||||||
|
objs = [ obj for (_,obj) in self.popall() ]
|
||||||
|
for (cid,code) in choplist(2, objs):
|
||||||
|
if isinstance(cid, str) and isinstance(code, str):
|
||||||
|
self.cmap.register_cid2code(nunpack(cid), code)
|
||||||
|
return
|
||||||
|
|
||||||
|
if name == 'beginnotdefrange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
if name == 'endnotdefrange':
|
||||||
|
self.popall()
|
||||||
|
return
|
||||||
|
|
||||||
|
self.push((pos, token))
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## FontMetricsDB
|
## FontMetricsDB
|
||||||
##
|
##
|
||||||
class FontMetricsDB(object):
|
class FontMetricsDB(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_metrics(klass, fontname):
|
def get_metrics(klass, fontname):
|
||||||
return FONT_METRICS[fontname]
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
## EncodingDB
|
||||||
##
|
##
|
||||||
class EncodingDB(object):
|
class EncodingDB(object):
|
||||||
|
|
||||||
std2unicode = {}
|
std2unicode = {}
|
||||||
mac2unicode = {}
|
mac2unicode = {}
|
||||||
win2unicode = {}
|
win2unicode = {}
|
||||||
pdf2unicode = {}
|
pdf2unicode = {}
|
||||||
for (name,std,mac,win,pdf) in ENCODING:
|
for (name,std,mac,win,pdf) in ENCODING:
|
||||||
c = unichr(name2unicode(name))
|
c = unichr(name2unicode(name))
|
||||||
if std: std2unicode[std] = c
|
if std: std2unicode[std] = c
|
||||||
if mac: mac2unicode[mac] = c
|
if mac: mac2unicode[mac] = c
|
||||||
if win: win2unicode[win] = c
|
if win: win2unicode[win] = c
|
||||||
if pdf: pdf2unicode[pdf] = c
|
if pdf: pdf2unicode[pdf] = c
|
||||||
|
|
||||||
encodings = {
|
encodings = {
|
||||||
'StandardEncoding': std2unicode,
|
'StandardEncoding': std2unicode,
|
||||||
'MacRomanEncoding': mac2unicode,
|
'MacRomanEncoding': mac2unicode,
|
||||||
'WinAnsiEncoding': win2unicode,
|
'WinAnsiEncoding': win2unicode,
|
||||||
'PDFDocEncoding': pdf2unicode,
|
'PDFDocEncoding': pdf2unicode,
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_encoding(klass, name, diff=None):
|
def get_encoding(klass, name, diff=None):
|
||||||
cid2unicode = klass.encodings.get(name, klass.std2unicode)
|
cid2unicode = klass.encodings.get(name, klass.std2unicode)
|
||||||
if diff:
|
if diff:
|
||||||
cid2unicode = cid2unicode.copy()
|
cid2unicode = cid2unicode.copy()
|
||||||
cid = 0
|
cid = 0
|
||||||
for x in diff:
|
for x in diff:
|
||||||
if isinstance(x, int):
|
if isinstance(x, int):
|
||||||
cid = x
|
cid = x
|
||||||
elif isinstance(x, PSLiteral):
|
elif isinstance(x, PSLiteral):
|
||||||
try:
|
try:
|
||||||
cid2unicode[cid] = unichr(name2unicode(x.name))
|
cid2unicode[cid] = unichr(name2unicode(x.name))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
cid += 1
|
cid += 1
|
||||||
return cid2unicode
|
return cid2unicode
|
||||||
|
|
||||||
|
|
||||||
## CMap -> CMapCDB conversion
|
## CMap -> CMapCDB conversion
|
||||||
##
|
##
|
||||||
def dumpcdb(cmap, cdbfile, verbose=1):
|
def dumpcdb(cmap, cdbfile, verbose=1):
|
||||||
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
|
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
|
||||||
if verbose:
|
if verbose:
|
||||||
print >>stderr, 'Writing: %r...' % cdbfile
|
print >>stderr, 'Writing: %r...' % cdbfile
|
||||||
for (k,v) in cmap.getall_attrs():
|
for (k,v) in cmap.getall_attrs():
|
||||||
m.add('/'+k, repr(v))
|
m.add('/'+k, repr(v))
|
||||||
for (code,cid) in cmap.getall_code2cid():
|
for (code,cid) in cmap.getall_code2cid():
|
||||||
m.add('c'+code, pack('>L',cid))
|
m.add('c'+code, pack('>L',cid))
|
||||||
for (cid,code) in cmap.getall_cid2code():
|
for (cid,code) in cmap.getall_cid2code():
|
||||||
m.add('i'+pack('>L',cid), code)
|
m.add('i'+pack('>L',cid), code)
|
||||||
m.finish()
|
m.finish()
|
||||||
return
|
return
|
||||||
|
|
||||||
def convert_cmap(cmapdir, outputdir, force=False):
|
def convert_cmap(cmapdir, outputdir, force=False):
|
||||||
CMapDB.initialize(cmapdir)
|
CMapDB.initialize(cmapdir)
|
||||||
for fname in os.listdir(cmapdir):
|
for fname in os.listdir(cmapdir):
|
||||||
if '.' in fname: continue
|
if '.' in fname: continue
|
||||||
cmapname = os.path.basename(fname)
|
cmapname = os.path.basename(fname)
|
||||||
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
|
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
|
||||||
if not force and os.path.exists(cdbname):
|
if not force and os.path.exists(cdbname):
|
||||||
print >>stderr, 'Skipping: %r' % cmapname
|
print >>stderr, 'Skipping: %r' % cmapname
|
||||||
continue
|
continue
|
||||||
print >>stderr, 'Reading: %r...' % cmapname
|
print >>stderr, 'Reading: %r...' % cmapname
|
||||||
cmap = CMapDB.get_cmap(cmapname)
|
cmap = CMapDB.get_cmap(cmapname)
|
||||||
dumpcdb(cmap, cdbname)
|
dumpcdb(cmap, cdbname)
|
||||||
return
|
return
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0]
|
print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
|
(opts, args) = getopt.getopt(argv[1:], 'C:D:f')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if args:
|
if args:
|
||||||
cmapdir = args.pop(0)
|
cmapdir = args.pop(0)
|
||||||
else:
|
else:
|
||||||
cmapdir = find_cmap_path()
|
cmapdir = find_cmap_path()
|
||||||
outputdir = cmapdir
|
outputdir = cmapdir
|
||||||
force = False
|
force = False
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-f': force = True
|
if k == '-f': force = True
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-D': outputdir = v
|
elif k == '-D': outputdir = v
|
||||||
if not os.path.isdir(cmapdir):
|
if not os.path.isdir(cmapdir):
|
||||||
print >>stderr, 'directory does not exist: %r' % cmapdir
|
print >>stderr, 'directory does not exist: %r' % cmapdir
|
||||||
return 111
|
return 111
|
||||||
if not os.path.isdir(outputdir):
|
if not os.path.isdir(outputdir):
|
||||||
print >>stderr, 'directory does not exist: %r' % outputdir
|
print >>stderr, 'directory does not exist: %r' % outputdir
|
||||||
return 111
|
return 111
|
||||||
return convert_cmap(cmapdir, outputdir, force=force)
|
return convert_cmap(cmapdir, outputdir, force=force)
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -10,298 +10,298 @@ from pdfminer.utils import apply_matrix_pt, mult_matrix, enc
|
||||||
##
|
##
|
||||||
class TagExtractor(PDFDevice):
|
class TagExtractor(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8'):
|
def __init__(self, rsrc, outfp, codec='utf-8'):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.pageno = 0
|
self.pageno = 0
|
||||||
self.tag = None
|
self.tag = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
|
||||||
font = textstate.font
|
|
||||||
text = ''
|
|
||||||
for obj in seq:
|
|
||||||
if not isinstance(obj, str): continue
|
|
||||||
chars = font.decode(obj)
|
|
||||||
for cid in chars:
|
|
||||||
try:
|
|
||||||
char = font.to_unicode(cid)
|
|
||||||
text += char
|
|
||||||
except PDFUnicodeNotDefined:
|
|
||||||
pass
|
|
||||||
self.outfp.write(enc(text, self.codec))
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def render_string(self, textstate, seq):
|
||||||
(x0, y0, x1, y1) = page.mediabox
|
font = textstate.font
|
||||||
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
text = ''
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
for obj in seq:
|
||||||
(self.pageno, bbox, page.rotate))
|
if not isinstance(obj, str): continue
|
||||||
return
|
chars = font.decode(obj)
|
||||||
|
for cid in chars:
|
||||||
def end_page(self, page):
|
try:
|
||||||
self.outfp.write('</page>\n')
|
char = font.to_unicode(cid)
|
||||||
self.pageno += 1
|
text += char
|
||||||
return
|
except PDFUnicodeNotDefined:
|
||||||
|
pass
|
||||||
def begin_tag(self, tag, props=None):
|
self.outfp.write(enc(text, self.codec))
|
||||||
s = ''
|
return
|
||||||
if props:
|
|
||||||
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
def begin_page(self, page, ctm):
|
||||||
in sorted(props.iteritems()) )
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||||
self.tag = tag
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
||||||
return
|
(self.pageno, bbox, page.rotate))
|
||||||
|
return
|
||||||
def end_tag(self):
|
|
||||||
assert self.tag
|
def end_page(self, page):
|
||||||
self.outfp.write('</%s>' % enc(self.tag.name))
|
self.outfp.write('</page>\n')
|
||||||
self.tag = None
|
self.pageno += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
self.begin_tag(tag, props)
|
s = ''
|
||||||
self.tag = None
|
if props:
|
||||||
return
|
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
||||||
|
in sorted(props.iteritems()) )
|
||||||
|
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||||
|
self.tag = tag
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_tag(self):
|
||||||
|
assert self.tag
|
||||||
|
self.outfp.write('</%s>' % enc(self.tag.name))
|
||||||
|
self.tag = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def do_tag(self, tag, props=None):
|
||||||
|
self.begin_tag(tag, props)
|
||||||
|
self.tag = None
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFPageAggregator
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFTextDevice):
|
class PDFPageAggregator(PDFTextDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1, laparams=None):
|
def __init__(self, rsrc, pageno=1, laparams=None):
|
||||||
PDFTextDevice.__init__(self, rsrc)
|
PDFTextDevice.__init__(self, rsrc)
|
||||||
self.laparams = laparams
|
self.laparams = laparams
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
(x0,y0,x1,y1) = page.mediabox
|
(x0,y0,x1,y1) = page.mediabox
|
||||||
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
|
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
|
||||||
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
|
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
|
||||||
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
||||||
self.cur_item = LTPage(self.pageno, mediabox)
|
self.cur_item = LTPage(self.pageno, mediabox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, _):
|
|
||||||
assert not self.stack
|
|
||||||
assert isinstance(self.cur_item, LTPage)
|
|
||||||
self.cur_item.fixate()
|
|
||||||
if self.laparams:
|
|
||||||
self.cur_item.analyze_layout(self.laparams)
|
|
||||||
self.pageno += 1
|
|
||||||
return self.cur_item
|
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def end_page(self, _):
|
||||||
self.stack.append(self.cur_item)
|
assert not self.stack
|
||||||
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
assert isinstance(self.cur_item, LTPage)
|
||||||
return
|
self.cur_item.fixate()
|
||||||
|
if self.laparams:
|
||||||
def end_figure(self, _):
|
self.cur_item.analyze_layout(self.laparams)
|
||||||
fig = self.cur_item
|
self.pageno += 1
|
||||||
self.cur_item.fixate()
|
return self.cur_item
|
||||||
self.cur_item = self.stack.pop()
|
|
||||||
self.cur_item.add(fig)
|
|
||||||
return
|
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def begin_figure(self, name, bbox, matrix):
|
||||||
shape = ''.join(x[0] for x in path)
|
self.stack.append(self.cur_item)
|
||||||
if shape == 'ml': # horizontal/vertical line
|
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
||||||
(_,x0,y0) = path[0]
|
return
|
||||||
(_,x1,y1) = path[1]
|
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
def end_figure(self, _):
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
fig = self.cur_item
|
||||||
if y0 == y1:
|
self.cur_item.fixate()
|
||||||
# horizontal ruler
|
self.cur_item = self.stack.pop()
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
self.cur_item.add(fig)
|
||||||
elif x0 == x1:
|
return
|
||||||
# vertical ruler
|
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
elif shape == 'mlllh':
|
shape = ''.join(x[0] for x in path)
|
||||||
# rectangle
|
if shape == 'ml': # horizontal/vertical line
|
||||||
(_,x0,y0) = path[0]
|
(_,x0,y0) = path[0]
|
||||||
(_,x1,y1) = path[1]
|
(_,x1,y1) = path[1]
|
||||||
(_,x2,y2) = path[2]
|
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||||
(_,x3,y3) = path[3]
|
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
if y0 == y1:
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
# horizontal ruler
|
||||||
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
||||||
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
|
elif x0 == x1:
|
||||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
# vertical ruler
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
||||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
elif shape == 'mlllh':
|
||||||
return
|
# rectangle
|
||||||
|
(_,x0,y0) = path[0]
|
||||||
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
(_,x1,y1) = path[1]
|
||||||
if not chars: return (0, 0)
|
(_,x2,y2) = path[2]
|
||||||
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
|
(_,x3,y3) = path[3]
|
||||||
self.cur_item.add(item)
|
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||||
return item.adv
|
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||||
|
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
||||||
|
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
|
||||||
|
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||||
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||||
|
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||||
|
return
|
||||||
|
|
||||||
|
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
|
if not chars: return (0, 0)
|
||||||
|
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
|
||||||
|
self.cur_item.add(item)
|
||||||
|
return item.adv
|
||||||
|
|
||||||
|
|
||||||
## PDFConverter
|
## PDFConverter
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
|
||||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
|
||||||
self.outfp = outfp
|
|
||||||
self.codec = codec
|
|
||||||
return
|
|
||||||
|
|
||||||
def write(self, text):
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||||
self.outfp.write(enc(text, self.codec))
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
||||||
return
|
self.outfp = outfp
|
||||||
|
self.codec = codec
|
||||||
|
return
|
||||||
|
|
||||||
|
def write(self, text):
|
||||||
|
self.outfp.write(enc(text, self.codec))
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## SGMLConverter
|
## SGMLConverter
|
||||||
##
|
##
|
||||||
class SGMLConverter(PDFConverter):
|
class SGMLConverter(PDFConverter):
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
(item.id, item.get_bbox(), item.rotate))
|
(item.id, item.get_bbox(), item.rotate))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, LTLine):
|
elif isinstance(item, LTLine):
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
elif isinstance(item, LTTextLine):
|
elif isinstance(item, LTTextLine):
|
||||||
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
|
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textline>\n')
|
self.outfp.write('</textline>\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
elif isinstance(item, LTTextItem):
|
elif isinstance(item, LTTextItem):
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
(enc(item.font.fontname), item.is_vertical(),
|
(enc(item.font.fontname), item.is_vertical(),
|
||||||
item.get_bbox(), item.fontsize))
|
item.get_bbox(), item.fontsize))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## HTMLConverter
|
## HTMLConverter
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, showpageno=True, pagepad=50):
|
scale=1, showpageno=True, pagepad=50):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.outfp.write('<html><head>\n')
|
self.outfp.write('<html><head>\n')
|
||||||
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
||||||
self.codec)
|
self.codec)
|
||||||
self.outfp.write('</head><body>\n')
|
self.outfp.write('</head><body>\n')
|
||||||
self.yoffset = self.pagepad
|
self.yoffset = self.pagepad
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_rect(self, color, width, x, y, w, h):
|
def write_rect(self, color, width, x, y, w, h):
|
||||||
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.yoffset += item.y1
|
self.yoffset += item.y1
|
||||||
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-item.y1)*self.scale))
|
((self.yoffset-item.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTTextItem):
|
elif isinstance(item, LTTextItem):
|
||||||
if item.vertical:
|
if item.vertical:
|
||||||
wmode = 'tb-rl'
|
wmode = 'tb-rl'
|
||||||
else:
|
else:
|
||||||
wmode = 'lr-tb'
|
wmode = 'lr-tb'
|
||||||
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
|
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
|
||||||
' left:%dpx; top:%dpx; font-size:%dpx;">' %
|
' left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||||
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||||
item.fontsize*self.scale))
|
item.fontsize*self.scale))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextLine):
|
elif isinstance(item, LTTextLine):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagepad
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||||
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
||||||
self.outfp.write('</body></html>\n')
|
self.outfp.write('</body></html>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## TextConverter
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
showpageno=False):
|
showpageno=False):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
self.outfp.write(text.encode(self.codec, 'ignore'))
|
self.outfp.write(text.encode(self.codec, 'ignore'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTText):
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write('\n')
|
self.write('\n')
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write('Page %d\n' % page.id)
|
self.write('Page %d\n' % page.id)
|
||||||
render(page)
|
render(page)
|
||||||
self.write('\f')
|
self.write('\f')
|
||||||
return
|
return
|
||||||
|
|
|
@ -8,9 +8,9 @@
|
||||||
|
|
||||||
### BEGIN Verbatim copy of the license part
|
### BEGIN Verbatim copy of the license part
|
||||||
|
|
||||||
#
|
#
|
||||||
# Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe
|
# Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe
|
||||||
#
|
#
|
||||||
# This file and the 35 PostScript(R) AFM files it accompanies may be
|
# This file and the 35 PostScript(R) AFM files it accompanies may be
|
||||||
# used, copied, and distributed for any purpose and without charge,
|
# used, copied, and distributed for any purpose and without charge,
|
||||||
# with or without modification, provided that all copyright notices
|
# with or without modification, provided that all copyright notices
|
||||||
|
|
|
@ -7,23 +7,23 @@ INF = sys.maxint
|
||||||
## LAParams
|
## LAParams
|
||||||
##
|
##
|
||||||
class LAParams(object):
|
class LAParams(object):
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
direction=None,
|
|
||||||
line_overlap=0.5,
|
|
||||||
char_margin=1.0,
|
|
||||||
line_margin=0.5,
|
|
||||||
word_margin=0.1):
|
|
||||||
self.direction = direction
|
|
||||||
self.line_overlap = line_overlap
|
|
||||||
self.char_margin = char_margin
|
|
||||||
self.line_margin = line_margin
|
|
||||||
self.word_margin = word_margin
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self,
|
||||||
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
|
direction=None,
|
||||||
(self.direction, self.char_margin, self.line_margin, self.word_margin))
|
line_overlap=0.5,
|
||||||
|
char_margin=1.0,
|
||||||
|
line_margin=0.5,
|
||||||
|
word_margin=0.1):
|
||||||
|
self.direction = direction
|
||||||
|
self.line_overlap = line_overlap
|
||||||
|
self.char_margin = char_margin
|
||||||
|
self.line_margin = line_margin
|
||||||
|
self.word_margin = word_margin
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
|
||||||
|
(self.direction, self.char_margin, self.line_margin, self.word_margin))
|
||||||
|
|
||||||
|
|
||||||
## Plane
|
## Plane
|
||||||
|
@ -35,354 +35,354 @@ class LAParams(object):
|
||||||
##
|
##
|
||||||
class Plane(object):
|
class Plane(object):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
self.xobjs = []
|
self.xobjs = []
|
||||||
self.yobjs = []
|
self.yobjs = []
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
self.place(obj)
|
self.place(obj)
|
||||||
self.xobjs.sort()
|
self.xobjs.sort()
|
||||||
self.yobjs.sort()
|
self.yobjs.sort()
|
||||||
return
|
return
|
||||||
|
|
||||||
# place(obj): place an object in a certain area.
|
# place(obj): place an object in a certain area.
|
||||||
def place(self, obj):
|
def place(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
self.xobjs.append((obj.x0, obj))
|
self.xobjs.append((obj.x0, obj))
|
||||||
self.xobjs.append((obj.x1, obj))
|
self.xobjs.append((obj.x1, obj))
|
||||||
self.yobjs.append((obj.y0, obj))
|
self.yobjs.append((obj.y0, obj))
|
||||||
self.yobjs.append((obj.y1, obj))
|
self.yobjs.append((obj.y1, obj))
|
||||||
return
|
return
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
(i0,_) = bsearch(self.xobjs, x0)
|
(i0,_) = bsearch(self.xobjs, x0)
|
||||||
(_,i1) = bsearch(self.xobjs, x1)
|
(_,i1) = bsearch(self.xobjs, x1)
|
||||||
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
||||||
(i0,_) = bsearch(self.yobjs, y0)
|
(i0,_) = bsearch(self.yobjs, y0)
|
||||||
(_,i1) = bsearch(self.yobjs, y1)
|
(_,i1) = bsearch(self.yobjs, y1)
|
||||||
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
||||||
objs = xobjs.intersection(yobjs)
|
objs = xobjs.intersection(yobjs)
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
|
|
||||||
## ClusterSet
|
## ClusterSet
|
||||||
##
|
##
|
||||||
class ClusterSet(object):
|
class ClusterSet(object):
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, klass):
|
||||||
self.clusters = {}
|
self.clusters = {}
|
||||||
self.klass = klass
|
self.klass = klass
|
||||||
self.i = 0
|
self.i = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
# add(objs): groups text objects if necessary.
|
# add(objs): groups text objects if necessary.
|
||||||
def add(self, objs):
|
def add(self, objs):
|
||||||
group = self.klass(self.i, objs)
|
group = self.klass(self.i, objs)
|
||||||
self.i += 1
|
self.i += 1
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
if obj in self.clusters:
|
if obj in self.clusters:
|
||||||
group.merge(self.clusters[obj])
|
group.merge(self.clusters[obj])
|
||||||
for obj in group:
|
for obj in group:
|
||||||
self.clusters[obj] = group
|
self.clusters[obj] = group
|
||||||
return
|
return
|
||||||
|
|
||||||
# finish(): returns all the LTTextBoxes in a page.
|
# finish(): returns all the LTTextBoxes in a page.
|
||||||
def finish(self):
|
def finish(self):
|
||||||
r = set(self.clusters.itervalues())
|
r = set(self.clusters.itervalues())
|
||||||
for group in r:
|
for group in r:
|
||||||
group.fixate()
|
group.fixate()
|
||||||
return list(r)
|
return list(r)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def build(klass, objs, hratio, vratio, objtype, func=None):
|
def build(klass, objs, hratio, vratio, objtype, func=None):
|
||||||
plane = Plane(objs)
|
plane = Plane(objs)
|
||||||
cset = ClusterSet(objtype)
|
cset = ClusterSet(objtype)
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
margin = obj.get_margin()
|
margin = obj.get_margin()
|
||||||
hmargin = hratio * margin
|
hmargin = hratio * margin
|
||||||
vmargin = vratio * margin
|
vmargin = vratio * margin
|
||||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||||
assert obj in neighbors, obj
|
assert obj in neighbors, obj
|
||||||
if func:
|
if func:
|
||||||
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
|
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
|
||||||
cset.add(neighbors)
|
cset.add(neighbors)
|
||||||
return cset.finish()
|
return cset.finish()
|
||||||
|
|
||||||
|
|
||||||
## LayoutItem
|
## LayoutItem
|
||||||
##
|
##
|
||||||
class LayoutItem(object):
|
class LayoutItem(object):
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox):
|
||||||
self.set_bbox(bbox)
|
self.set_bbox(bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_bbox(self, (x0,y0,x1,y1)):
|
def set_bbox(self, (x0,y0,x1,y1)):
|
||||||
if x1 < x0: (x0,x1) = (x1,x0)
|
if x1 < x0: (x0,x1) = (x1,x0)
|
||||||
if y1 < y0: (y0,y1) = (y1,y0)
|
if y1 < y0: (y0,y1) = (y1,y0)
|
||||||
self.x0 = x0
|
self.x0 = x0
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.x1 = x1
|
self.x1 = x1
|
||||||
self.y1 = y1
|
self.y1 = y1
|
||||||
self.width = x1-x0
|
self.width = x1-x0
|
||||||
self.height = y1-y0
|
self.height = y1-y0
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<item bbox=%s>' % (self.get_bbox()))
|
return ('<item bbox=%s>' % (self.get_bbox()))
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
|
||||||
assert isinstance(obj, LayoutItem)
|
|
||||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
|
||||||
return 0
|
|
||||||
else:
|
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
|
||||||
|
|
||||||
def voverlap(self, obj):
|
def hoverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
|
||||||
def get_bbox(self):
|
def voverlap(self, obj):
|
||||||
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
assert isinstance(obj, LayoutItem)
|
||||||
|
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
||||||
def get_margin(self):
|
return 0
|
||||||
return 0
|
else:
|
||||||
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
|
||||||
def get_weight(self):
|
def get_bbox(self):
|
||||||
return 0
|
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
||||||
|
|
||||||
def get_direction(self):
|
def get_margin(self):
|
||||||
return None
|
return 0
|
||||||
|
|
||||||
|
def get_weight(self):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def get_direction(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
## LayoutContainer
|
## LayoutContainer
|
||||||
##
|
##
|
||||||
class LayoutContainer(LayoutItem):
|
class LayoutContainer(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, id, bbox, objs=None):
|
|
||||||
LayoutItem.__init__(self, bbox)
|
|
||||||
self.id = id
|
|
||||||
if objs:
|
|
||||||
self.objs = set(objs)
|
|
||||||
else:
|
|
||||||
self.objs = set()
|
|
||||||
self.weight = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, id, bbox, objs=None):
|
||||||
return ('<group %s>' % (self.get_bbox()))
|
LayoutItem.__init__(self, bbox)
|
||||||
|
self.id = id
|
||||||
|
if objs:
|
||||||
|
self.objs = set(objs)
|
||||||
|
else:
|
||||||
|
self.objs = set()
|
||||||
|
self.weight = None
|
||||||
|
return
|
||||||
|
|
||||||
def __iter__(self):
|
def __repr__(self):
|
||||||
return iter(self.objs)
|
return ('<group %s>' % (self.get_bbox()))
|
||||||
|
|
||||||
def __len__(self):
|
def __iter__(self):
|
||||||
return len(self.objs)
|
return iter(self.objs)
|
||||||
|
|
||||||
def add(self, obj):
|
|
||||||
self.objs.add(obj)
|
|
||||||
return
|
|
||||||
|
|
||||||
def merge(self, group):
|
def __len__(self):
|
||||||
self.objs.update(iter(group))
|
return len(self.objs)
|
||||||
return
|
|
||||||
|
|
||||||
# fixate(): determines its boundery and writing direction.
|
def add(self, obj):
|
||||||
def fixate(self):
|
self.objs.add(obj)
|
||||||
if not self.width and self.objs:
|
return
|
||||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
|
||||||
for obj in self.objs:
|
|
||||||
bx0 = min(bx0, obj.x0)
|
|
||||||
by0 = min(by0, obj.y0)
|
|
||||||
bx1 = max(bx1, obj.x1)
|
|
||||||
by1 = max(by1, obj.y1)
|
|
||||||
self.set_bbox((bx0, by0, bx1, by1))
|
|
||||||
self.weight = sum( obj.get_weight() for obj in self.objs )
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_weight(self):
|
def merge(self, group):
|
||||||
return self.weight
|
self.objs.update(iter(group))
|
||||||
|
return
|
||||||
def get_direction(self):
|
|
||||||
return None
|
# fixate(): determines its boundery and writing direction.
|
||||||
|
def fixate(self):
|
||||||
|
if not self.width and self.objs:
|
||||||
|
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||||
|
for obj in self.objs:
|
||||||
|
bx0 = min(bx0, obj.x0)
|
||||||
|
by0 = min(by0, obj.y0)
|
||||||
|
bx1 = max(bx1, obj.x1)
|
||||||
|
by1 = max(by1, obj.y1)
|
||||||
|
self.set_bbox((bx0, by0, bx1, by1))
|
||||||
|
self.weight = sum( obj.get_weight() for obj in self.objs )
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_weight(self):
|
||||||
|
return self.weight
|
||||||
|
|
||||||
|
def get_direction(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
## LTLine
|
## LTLine
|
||||||
##
|
##
|
||||||
class LTLine(LayoutItem):
|
class LTLine(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, linewidth, direction, bbox):
|
def __init__(self, linewidth, direction, bbox):
|
||||||
LayoutItem.__init__(self, bbox)
|
LayoutItem.__init__(self, bbox)
|
||||||
self.linewidth = linewidth
|
self.linewidth = linewidth
|
||||||
self.direction = direction
|
self.direction = direction
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTRect
|
## LTRect
|
||||||
##
|
##
|
||||||
class LTRect(LayoutItem):
|
class LTRect(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, linewidth, bbox):
|
def __init__(self, linewidth, bbox):
|
||||||
LayoutItem.__init__(self, bbox)
|
LayoutItem.__init__(self, bbox)
|
||||||
self.linewidth = linewidth
|
self.linewidth = linewidth
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTText
|
||||||
##
|
##
|
||||||
class LTText(object):
|
class LTText(object):
|
||||||
|
|
||||||
def __init__(self, text):
|
def __init__(self, text):
|
||||||
self.text = text
|
self.text = text
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<text %r>' % self.text
|
return '<text %r>' % self.text
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return len(self.text)
|
return len(self.text)
|
||||||
|
|
||||||
def is_upright(self):
|
def is_upright(self):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
## LTAnon
|
## LTAnon
|
||||||
##
|
##
|
||||||
class LTAnon(LTText):
|
class LTAnon(LTText):
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
## LTTextItem
|
## LTTextItem
|
||||||
##
|
##
|
||||||
class LTTextItem(LayoutItem, LTText):
|
class LTTextItem(LayoutItem, LTText):
|
||||||
|
|
||||||
debug = 1
|
debug = 1
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
|
||||||
assert chars
|
|
||||||
self.matrix = matrix
|
|
||||||
self.font = font
|
|
||||||
self.vertical = font.is_vertical()
|
|
||||||
self.text = ''.join( char for (char,_) in chars )
|
|
||||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
|
||||||
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
|
|
||||||
#size = (font.get_ascent() - font.get_descent()) * fontsize
|
|
||||||
size = font.get_size() * fontsize
|
|
||||||
(_,_,_,_,tx,ty) = self.matrix
|
|
||||||
if not self.vertical:
|
|
||||||
# horizontal text
|
|
||||||
self.adv = (adv, 0)
|
|
||||||
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
|
|
||||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
|
||||||
ty += descent
|
|
||||||
bbox = (tx, ty, tx+dx, ty+dy)
|
|
||||||
else:
|
|
||||||
# vertical text
|
|
||||||
self.adv = (0, adv)
|
|
||||||
(_,cid) = chars[0]
|
|
||||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
|
|
||||||
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
|
|
||||||
tx -= dx/2
|
|
||||||
ty += disp
|
|
||||||
bbox = (tx, ty+dy, tx+dx, ty)
|
|
||||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
|
||||||
LayoutItem.__init__(self, bbox)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
if self.debug:
|
assert chars
|
||||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
self.matrix = matrix
|
||||||
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
self.font = font
|
||||||
self.font, self.fontsize, self.get_bbox(),
|
self.vertical = font.is_vertical()
|
||||||
'(%.1f, %.1f)' % self.adv,
|
self.text = ''.join( char for (char,_) in chars )
|
||||||
self.text))
|
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||||
else:
|
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
|
||||||
return '<text %r>' % self.text
|
#size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||||
|
size = font.get_size() * fontsize
|
||||||
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
|
if not self.vertical:
|
||||||
|
# horizontal text
|
||||||
|
self.adv = (adv, 0)
|
||||||
|
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
|
||||||
|
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||||
|
ty += descent
|
||||||
|
bbox = (tx, ty, tx+dx, ty+dy)
|
||||||
|
else:
|
||||||
|
# vertical text
|
||||||
|
self.adv = (0, adv)
|
||||||
|
(_,cid) = chars[0]
|
||||||
|
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
|
||||||
|
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
|
||||||
|
tx -= dx/2
|
||||||
|
ty += disp
|
||||||
|
bbox = (tx, ty+dy, tx+dx, ty)
|
||||||
|
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||||
|
LayoutItem.__init__(self, bbox)
|
||||||
|
return
|
||||||
|
|
||||||
def get_margin(self):
|
def __repr__(self):
|
||||||
return abs(self.fontsize)
|
if self.debug:
|
||||||
|
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||||
|
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
||||||
|
self.font, self.fontsize, self.get_bbox(),
|
||||||
|
'(%.1f, %.1f)' % self.adv,
|
||||||
|
self.text))
|
||||||
|
else:
|
||||||
|
return '<text %r>' % self.text
|
||||||
|
|
||||||
def is_vertical(self):
|
def get_margin(self):
|
||||||
return self.vertical
|
return abs(self.fontsize)
|
||||||
|
|
||||||
def is_upright(self):
|
def is_vertical(self):
|
||||||
(a,b,c,d,e,f) = self.matrix
|
return self.vertical
|
||||||
return 0 < a*d and b*c <= 0
|
|
||||||
|
def is_upright(self):
|
||||||
|
(a,b,c,d,e,f) = self.matrix
|
||||||
|
return 0 < a*d and b*c <= 0
|
||||||
|
|
||||||
|
|
||||||
## LTFigure
|
## LTFigure
|
||||||
##
|
##
|
||||||
class LTFigure(LayoutContainer):
|
class LTFigure(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, matrix):
|
|
||||||
(x,y,w,h) = bbox
|
|
||||||
x0 = y0 = INF
|
|
||||||
x1 = y1 = -INF
|
|
||||||
for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
|
|
||||||
(p,q) = apply_matrix_pt(matrix, (p,q))
|
|
||||||
x0 = min(x0, p)
|
|
||||||
x1 = max(x1, p)
|
|
||||||
y0 = min(y0, q)
|
|
||||||
y1 = max(y1, q)
|
|
||||||
bbox = (x0,y0,x1,y1)
|
|
||||||
self.matrix = matrix
|
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, id, bbox, matrix):
|
||||||
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
(x,y,w,h) = bbox
|
||||||
|
x0 = y0 = INF
|
||||||
|
x1 = y1 = -INF
|
||||||
|
for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
|
||||||
|
(p,q) = apply_matrix_pt(matrix, (p,q))
|
||||||
|
x0 = min(x0, p)
|
||||||
|
x1 = max(x1, p)
|
||||||
|
y0 = min(y0, q)
|
||||||
|
y1 = max(y1, q)
|
||||||
|
bbox = (x0,y0,x1,y1)
|
||||||
|
self.matrix = matrix
|
||||||
|
LayoutContainer.__init__(self, id, bbox)
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
||||||
|
|
||||||
|
|
||||||
## LTTextLine
|
## LTTextLine
|
||||||
##
|
##
|
||||||
class LTTextLine(LayoutContainer):
|
class LTTextLine(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, objs, direction, word_margin):
|
def __init__(self, id, objs, direction, word_margin):
|
||||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||||
self.direction = direction
|
self.direction = direction
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
|
return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
|
||||||
|
|
||||||
def get_margin(self):
|
def get_margin(self):
|
||||||
return min(self.width, self.height)
|
return min(self.width, self.height)
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return self.direction
|
return self.direction
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||||
|
|
||||||
def fixate(self):
|
def fixate(self):
|
||||||
LayoutContainer.fixate(self)
|
LayoutContainer.fixate(self)
|
||||||
objs = []
|
objs = []
|
||||||
if self.direction == 'V':
|
if self.direction == 'V':
|
||||||
y0 = -INF
|
y0 = -INF
|
||||||
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||||
if isinstance(obj, LTTextItem) and self.word_margin:
|
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||||
margin = self.word_margin * obj.get_margin()
|
margin = self.word_margin * obj.get_margin()
|
||||||
if obj.y1+margin < y0:
|
if obj.y1+margin < y0:
|
||||||
objs.append(LTAnon(' '))
|
objs.append(LTAnon(' '))
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
y0 = obj.y0
|
y0 = obj.y0
|
||||||
else:
|
else:
|
||||||
x1 = INF
|
x1 = INF
|
||||||
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||||
if isinstance(obj, LTTextItem) and self.word_margin:
|
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||||
margin = self.word_margin * obj.get_margin()
|
margin = self.word_margin * obj.get_margin()
|
||||||
if x1 < obj.x0-margin:
|
if x1 < obj.x0-margin:
|
||||||
objs.append(LTAnon(' '))
|
objs.append(LTAnon(' '))
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
x1 = obj.x1
|
x1 = obj.x1
|
||||||
objs.append(LTAnon('\n'))
|
objs.append(LTAnon('\n'))
|
||||||
self.objs = objs
|
self.objs = objs
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTTextBox
|
## LTTextBox
|
||||||
|
@ -392,109 +392,109 @@ class LTTextLine(LayoutContainer):
|
||||||
##
|
##
|
||||||
class LTTextBox(LayoutContainer):
|
class LTTextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, objs, direction):
|
def __init__(self, id, objs, direction):
|
||||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||||
self.direction = direction
|
self.direction = direction
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
|
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||||
|
|
||||||
def fixate(self):
|
|
||||||
LayoutContainer.fixate(self)
|
|
||||||
if self.direction == 'V':
|
|
||||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
|
||||||
else:
|
|
||||||
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_direction(self):
|
def fixate(self):
|
||||||
return self.direction
|
LayoutContainer.fixate(self)
|
||||||
|
if self.direction == 'V':
|
||||||
|
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||||
|
else:
|
||||||
|
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_direction(self):
|
||||||
|
return self.direction
|
||||||
|
|
||||||
|
|
||||||
def tsort(objs, f):
|
def tsort(objs, f):
|
||||||
gi = dict( (obj,[]) for obj in objs )
|
gi = dict( (obj,[]) for obj in objs )
|
||||||
go = dict( (obj,[]) for obj in objs )
|
go = dict( (obj,[]) for obj in objs )
|
||||||
for obj1 in objs:
|
for obj1 in objs:
|
||||||
for obj2 in objs:
|
for obj2 in objs:
|
||||||
if obj1 is obj2: continue
|
if obj1 is obj2: continue
|
||||||
if f(obj1, obj2): # obj1 -> obj2
|
if f(obj1, obj2): # obj1 -> obj2
|
||||||
go[obj1].append(obj2)
|
go[obj1].append(obj2)
|
||||||
gi[obj2].append(obj1)
|
gi[obj2].append(obj1)
|
||||||
r = objs[:]
|
r = objs[:]
|
||||||
s = []
|
s = []
|
||||||
while r:
|
while r:
|
||||||
for obj in r:
|
for obj in r:
|
||||||
if not go[obj] or gi[obj]: continue
|
if not go[obj] or gi[obj]: continue
|
||||||
for c in go[obj]:
|
for c in go[obj]:
|
||||||
gi[c].remove(obj)
|
gi[c].remove(obj)
|
||||||
del gi[obj]
|
del gi[obj]
|
||||||
del go[obj]
|
del go[obj]
|
||||||
r.remove(obj)
|
r.remove(obj)
|
||||||
s.append(obj)
|
s.append(obj)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
obj = r.pop()
|
obj = r.pop()
|
||||||
del gi[obj]
|
del gi[obj]
|
||||||
del go[obj]
|
del go[obj]
|
||||||
s.append(obj)
|
s.append(obj)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
## LTPage
|
## LTPage
|
||||||
##
|
##
|
||||||
class LTPage(LayoutContainer):
|
class LTPage(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, rotate=0):
|
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
|
||||||
self.rotate = rotate
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
|
||||||
|
|
||||||
def analyze_layout(self, laparams):
|
def __init__(self, id, bbox, rotate=0):
|
||||||
textobjs = []
|
LayoutContainer.__init__(self, id, bbox)
|
||||||
otherobjs = []
|
self.rotate = rotate
|
||||||
for obj in self.objs:
|
return
|
||||||
if isinstance(obj, LTText) and obj.is_upright():
|
|
||||||
textobjs.append(obj)
|
def __repr__(self):
|
||||||
else:
|
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||||
otherobjs.append(obj)
|
|
||||||
if laparams.direction == 'V':
|
def analyze_layout(self, laparams):
|
||||||
def vline(obj1, obj2):
|
textobjs = []
|
||||||
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
otherobjs = []
|
||||||
def vorder(obj1, obj2):
|
for obj in self.objs:
|
||||||
if obj1.voverlap(obj2):
|
if isinstance(obj, LTText) and obj.is_upright():
|
||||||
return obj2.x1 < obj1.x0
|
textobjs.append(obj)
|
||||||
elif obj1.hoverlap(obj2):
|
else:
|
||||||
return obj2.y1 < obj1.y0
|
otherobjs.append(obj)
|
||||||
|
if laparams.direction == 'V':
|
||||||
|
def vline(obj1, obj2):
|
||||||
|
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
||||||
|
def vorder(obj1, obj2):
|
||||||
|
if obj1.voverlap(obj2):
|
||||||
|
return obj2.x1 < obj1.x0
|
||||||
|
elif obj1.hoverlap(obj2):
|
||||||
|
return obj2.y1 < obj1.y0
|
||||||
|
else:
|
||||||
|
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
|
||||||
|
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
||||||
|
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
|
||||||
|
vline)
|
||||||
|
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
||||||
|
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
||||||
|
boxes = tsort(boxes, vorder)
|
||||||
else:
|
else:
|
||||||
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
|
def hline(obj1, obj2):
|
||||||
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
|
def horder(obj1, obj2):
|
||||||
vline)
|
if obj1.hoverlap(obj2):
|
||||||
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
return obj2.y1 < obj1.y0
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
elif obj1.voverlap(obj2):
|
||||||
boxes = tsort(boxes, vorder)
|
return obj1.x1 < obj2.x0
|
||||||
else:
|
else:
|
||||||
def hline(obj1, obj2):
|
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||||
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||||
def horder(obj1, obj2):
|
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
||||||
if obj1.hoverlap(obj2):
|
hline)
|
||||||
return obj2.y1 < obj1.y0
|
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
||||||
elif obj1.voverlap(obj2):
|
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
||||||
return obj1.x1 < obj2.x0
|
boxes = tsort(boxes, horder)
|
||||||
else:
|
self.objs = otherobjs + boxes
|
||||||
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
return
|
||||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
|
||||||
hline)
|
|
||||||
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
|
||||||
boxes = tsort(boxes, horder)
|
|
||||||
self.objs = otherobjs + boxes
|
|
||||||
return
|
|
||||||
|
|
168
pdfminer/lzw.py
168
pdfminer/lzw.py
|
@ -7,93 +7,93 @@ stderr = sys.stderr
|
||||||
##
|
##
|
||||||
class LZWDecoder(object):
|
class LZWDecoder(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, fp):
|
|
||||||
self.fp = fp
|
|
||||||
self.buff = 0
|
|
||||||
self.bpos = 8
|
|
||||||
self.nbits = 9
|
|
||||||
self.table = None
|
|
||||||
self.prevbuf = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def readbits(self, bits):
|
def __init__(self, fp):
|
||||||
v = 0
|
self.fp = fp
|
||||||
while 1:
|
self.buff = 0
|
||||||
# the number of remaining bits we can get from the current buffer.
|
self.bpos = 8
|
||||||
r = 8-self.bpos
|
self.nbits = 9
|
||||||
if bits <= r:
|
self.table = None
|
||||||
# |-----8-bits-----|
|
self.prevbuf = None
|
||||||
# |-bpos-|-bits-| |
|
return
|
||||||
# | |----r----|
|
|
||||||
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
|
def readbits(self, bits):
|
||||||
self.bpos += bits
|
v = 0
|
||||||
break
|
while 1:
|
||||||
else:
|
# the number of remaining bits we can get from the current buffer.
|
||||||
# |-----8-bits-----|
|
r = 8-self.bpos
|
||||||
# |-bpos-|---bits----...
|
if bits <= r:
|
||||||
# | |----r----|
|
# |-----8-bits-----|
|
||||||
v = (v<<r) | (self.buff & ((1<<r)-1))
|
# |-bpos-|-bits-| |
|
||||||
bits -= r
|
# | |----r----|
|
||||||
x = self.fp.read(1)
|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
|
||||||
if not x: raise EOFError
|
self.bpos += bits
|
||||||
self.buff = ord(x)
|
break
|
||||||
self.bpos = 0
|
else:
|
||||||
return v
|
# |-----8-bits-----|
|
||||||
|
# |-bpos-|---bits----...
|
||||||
|
# | |----r----|
|
||||||
|
v = (v<<r) | (self.buff & ((1<<r)-1))
|
||||||
|
bits -= r
|
||||||
|
x = self.fp.read(1)
|
||||||
|
if not x: raise EOFError
|
||||||
|
self.buff = ord(x)
|
||||||
|
self.bpos = 0
|
||||||
|
return v
|
||||||
|
|
||||||
|
def feed(self, code):
|
||||||
|
x = ''
|
||||||
|
if code == 256:
|
||||||
|
self.table = [ chr(c) for c in xrange(256) ] # 0-255
|
||||||
|
self.table.append(None) # 256
|
||||||
|
self.table.append(None) # 257
|
||||||
|
self.prevbuf = ''
|
||||||
|
self.nbits = 9
|
||||||
|
elif code == 257:
|
||||||
|
pass
|
||||||
|
elif not self.prevbuf:
|
||||||
|
x = self.prevbuf = self.table[code]
|
||||||
|
else:
|
||||||
|
if code < len(self.table):
|
||||||
|
x = self.table[code]
|
||||||
|
self.table.append(self.prevbuf+x[0])
|
||||||
|
else:
|
||||||
|
self.table.append(self.prevbuf+self.prevbuf[0])
|
||||||
|
x = self.table[code]
|
||||||
|
l = len(self.table)
|
||||||
|
if l == 511:
|
||||||
|
self.nbits = 10
|
||||||
|
elif l == 1023:
|
||||||
|
self.nbits = 11
|
||||||
|
elif l == 2047:
|
||||||
|
self.nbits = 12
|
||||||
|
self.prevbuf = x
|
||||||
|
return x
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while 1:
|
||||||
|
try:
|
||||||
|
code = self.readbits(self.nbits)
|
||||||
|
except EOFError:
|
||||||
|
break
|
||||||
|
x = self.feed(code)
|
||||||
|
yield x
|
||||||
|
if self.debug:
|
||||||
|
print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
|
||||||
|
(self.nbits, code, x, self.table[258:]))
|
||||||
|
return
|
||||||
|
|
||||||
def feed(self, code):
|
|
||||||
x = ''
|
|
||||||
if code == 256:
|
|
||||||
self.table = [ chr(c) for c in xrange(256) ] # 0-255
|
|
||||||
self.table.append(None) # 256
|
|
||||||
self.table.append(None) # 257
|
|
||||||
self.prevbuf = ''
|
|
||||||
self.nbits = 9
|
|
||||||
elif code == 257:
|
|
||||||
pass
|
|
||||||
elif not self.prevbuf:
|
|
||||||
x = self.prevbuf = self.table[code]
|
|
||||||
else:
|
|
||||||
if code < len(self.table):
|
|
||||||
x = self.table[code]
|
|
||||||
self.table.append(self.prevbuf+x[0])
|
|
||||||
else:
|
|
||||||
self.table.append(self.prevbuf+self.prevbuf[0])
|
|
||||||
x = self.table[code]
|
|
||||||
l = len(self.table)
|
|
||||||
if l == 511:
|
|
||||||
self.nbits = 10
|
|
||||||
elif l == 1023:
|
|
||||||
self.nbits = 11
|
|
||||||
elif l == 2047:
|
|
||||||
self.nbits = 12
|
|
||||||
self.prevbuf = x
|
|
||||||
return x
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
while 1:
|
|
||||||
try:
|
|
||||||
code = self.readbits(self.nbits)
|
|
||||||
except EOFError:
|
|
||||||
break
|
|
||||||
x = self.feed(code)
|
|
||||||
yield x
|
|
||||||
if self.debug:
|
|
||||||
print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
|
|
||||||
(self.nbits, code, x, self.table[258:]))
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import StringIO
|
import StringIO
|
||||||
data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
|
data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
|
||||||
fp = StringIO.StringIO(data)
|
fp = StringIO.StringIO(data)
|
||||||
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
||||||
LZWDecoder.debug = 1
|
LZWDecoder.debug = 1
|
||||||
output = ''.join(LZWDecoder(fp).run())
|
output = ''.join(LZWDecoder(fp).run())
|
||||||
print (data, expected, output)
|
print (data, expected, output)
|
||||||
print output == expected
|
print output == expected
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -10,14 +10,14 @@ LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||||
|
|
||||||
class PDFColorSpace(object):
|
class PDFColorSpace(object):
|
||||||
|
|
||||||
def __init__(self, name, ncomponents):
|
def __init__(self, name, ncomponents):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.ncomponents = ncomponents
|
self.ncomponents = ncomponents
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = dict(
|
PREDEFINED_COLORSPACE = dict(
|
||||||
|
|
|
@ -9,116 +9,116 @@ from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||||
##
|
##
|
||||||
class PDFDevice(object):
|
class PDFDevice(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, rsrc):
|
|
||||||
self.rsrc = rsrc
|
|
||||||
self.ctm = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFDevice>'
|
|
||||||
|
|
||||||
def close(self):
|
def __init__(self, rsrc):
|
||||||
return
|
self.rsrc = rsrc
|
||||||
|
self.ctm = None
|
||||||
|
return
|
||||||
|
|
||||||
def set_ctm(self, ctm):
|
def __repr__(self):
|
||||||
self.ctm = ctm
|
return '<PDFDevice>'
|
||||||
return
|
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def close(self):
|
||||||
return
|
return
|
||||||
def end_tag(self):
|
|
||||||
return
|
|
||||||
def do_tag(self, tag, props=None):
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def set_ctm(self, ctm):
|
||||||
return
|
self.ctm = ctm
|
||||||
def end_page(self, page):
|
return
|
||||||
return
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
|
||||||
return
|
|
||||||
def end_figure(self, name):
|
|
||||||
return
|
|
||||||
|
|
||||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
def begin_tag(self, tag, props=None):
|
||||||
return
|
return
|
||||||
def render_image(self, stream, size):
|
def end_tag(self):
|
||||||
return
|
return
|
||||||
def render_string(self, textstate, seq):
|
def do_tag(self, tag, props=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def begin_page(self, page, ctm):
|
||||||
|
return
|
||||||
|
def end_page(self, page):
|
||||||
|
return
|
||||||
|
def begin_figure(self, name, bbox, matrix):
|
||||||
|
return
|
||||||
|
def end_figure(self, name):
|
||||||
|
return
|
||||||
|
|
||||||
|
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||||
|
return
|
||||||
|
def render_image(self, stream, size):
|
||||||
|
return
|
||||||
|
def render_string(self, textstate, seq):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDFTextDevice
|
## PDFTextDevice
|
||||||
##
|
##
|
||||||
class PDFTextDevice(PDFDevice):
|
class PDFTextDevice(PDFDevice):
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
return '?'
|
return '?'
|
||||||
|
|
||||||
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
return (0, 0)
|
return (0, 0)
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
def render_string(self, textstate, seq):
|
||||||
matrix = mult_matrix(textstate.matrix, self.ctm)
|
matrix = mult_matrix(textstate.matrix, self.ctm)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
fontsize = textstate.fontsize
|
fontsize = textstate.fontsize
|
||||||
scaling = textstate.scaling * .01
|
scaling = textstate.scaling * .01
|
||||||
charspace = textstate.charspace * scaling
|
charspace = textstate.charspace * scaling
|
||||||
wordspace = textstate.wordspace * scaling
|
wordspace = textstate.wordspace * scaling
|
||||||
dxscale = .001 * fontsize * scaling
|
dxscale = .001 * fontsize * scaling
|
||||||
chars = []
|
|
||||||
needspace = False
|
|
||||||
(x,y) = textstate.linematrix
|
|
||||||
for obj in seq:
|
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
|
||||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
|
||||||
fontsize, charspace, scaling, chars)
|
|
||||||
x += dx
|
|
||||||
y += dy
|
|
||||||
d = -obj*dxscale
|
|
||||||
if font.is_vertical():
|
|
||||||
y += d
|
|
||||||
else:
|
|
||||||
x += d
|
|
||||||
chars = []
|
chars = []
|
||||||
needspace = False
|
needspace = False
|
||||||
else:
|
(x,y) = textstate.linematrix
|
||||||
for cid in font.decode(obj):
|
for obj in seq:
|
||||||
try:
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
char = font.to_unicode(cid)
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
except PDFUnicodeNotDefined, e:
|
fontsize, charspace, scaling, chars)
|
||||||
(cidcoding, cid) = e.args
|
x += dx
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
y += dy
|
||||||
chars.append((char, cid))
|
d = -obj*dxscale
|
||||||
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
if font.is_vertical():
|
||||||
|
y += d
|
||||||
|
else:
|
||||||
|
x += d
|
||||||
|
chars = []
|
||||||
|
needspace = False
|
||||||
|
else:
|
||||||
|
for cid in font.decode(obj):
|
||||||
|
try:
|
||||||
|
char = font.to_unicode(cid)
|
||||||
|
except PDFUnicodeNotDefined, e:
|
||||||
|
(cidcoding, cid) = e.args
|
||||||
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
|
chars.append((char, cid))
|
||||||
|
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
||||||
|
if needspace:
|
||||||
|
if font.is_vertical():
|
||||||
|
y += charspace
|
||||||
|
else:
|
||||||
|
x += charspace
|
||||||
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
|
fontsize, charspace, scaling, chars)
|
||||||
|
needspace = True
|
||||||
|
x += dx
|
||||||
|
y += dy
|
||||||
|
if font.is_vertical():
|
||||||
|
y += wordspace
|
||||||
|
else:
|
||||||
|
x += wordspace
|
||||||
|
chars = []
|
||||||
|
if chars:
|
||||||
if needspace:
|
if needspace:
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
y += charspace
|
y += charspace
|
||||||
else:
|
else:
|
||||||
x += charspace
|
x += charspace
|
||||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
fontsize, charspace, scaling, chars)
|
fontsize, charspace, scaling, chars)
|
||||||
needspace = True
|
|
||||||
x += dx
|
x += dx
|
||||||
y += dy
|
y += dy
|
||||||
if font.is_vertical():
|
textstate.linematrix = (x,y)
|
||||||
y += wordspace
|
return
|
||||||
else:
|
|
||||||
x += wordspace
|
|
||||||
chars = []
|
|
||||||
if chars:
|
|
||||||
if needspace:
|
|
||||||
if font.is_vertical():
|
|
||||||
y += charspace
|
|
||||||
else:
|
|
||||||
x += charspace
|
|
||||||
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
|
||||||
fontsize, charspace, scaling, chars)
|
|
||||||
x += dx
|
|
||||||
y += dy
|
|
||||||
textstate.linematrix = (x,y)
|
|
||||||
return
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -25,218 +25,218 @@ class PDFNotImplementedError(PSException): pass
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
##
|
##
|
||||||
class PDFObjRef(PDFObject):
|
class PDFObjRef(PDFObject):
|
||||||
|
|
||||||
def __init__(self, doc, objid, _):
|
|
||||||
if objid == 0:
|
|
||||||
if STRICT:
|
|
||||||
raise PDFValueError('PDF object id cannot be 0.')
|
|
||||||
self.doc = doc
|
|
||||||
self.objid = objid
|
|
||||||
#self.genno = genno # Never used.
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, doc, objid, _):
|
||||||
return '<PDFObjRef:%d>' % (self.objid)
|
if objid == 0:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFValueError('PDF object id cannot be 0.')
|
||||||
|
self.doc = doc
|
||||||
|
self.objid = objid
|
||||||
|
#self.genno = genno # Never used.
|
||||||
|
return
|
||||||
|
|
||||||
def resolve(self):
|
def __repr__(self):
|
||||||
return self.doc.getobj(self.objid)
|
return '<PDFObjRef:%d>' % (self.objid)
|
||||||
|
|
||||||
|
def resolve(self):
|
||||||
|
return self.doc.getobj(self.objid)
|
||||||
|
|
||||||
|
|
||||||
# resolve
|
# resolve
|
||||||
def resolve1(x):
|
def resolve1(x):
|
||||||
'''
|
'''
|
||||||
Resolve an object. If this is an array or dictionary,
|
Resolve an object. If this is an array or dictionary,
|
||||||
it may still contains some indirect objects inside.
|
it may still contains some indirect objects inside.
|
||||||
'''
|
'''
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve()
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def resolve_all(x):
|
def resolve_all(x):
|
||||||
'''
|
'''
|
||||||
Recursively resolve X and all the internals.
|
Recursively resolve X and all the internals.
|
||||||
Make sure there is no indirect reference within the nested object.
|
Make sure there is no indirect reference within the nested object.
|
||||||
This procedure might be slow.
|
This procedure might be slow.
|
||||||
'''
|
'''
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve()
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ resolve_all(v) for v in x ]
|
x = [ resolve_all(v) for v in x ]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k,v) in x.iteritems():
|
||||||
x[k] = resolve_all(v)
|
x[k] = resolve_all(v)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def decipher_all(decipher, objid, genno, x):
|
def decipher_all(decipher, objid, genno, x):
|
||||||
'''
|
'''
|
||||||
Recursively decipher X.
|
Recursively decipher X.
|
||||||
'''
|
'''
|
||||||
if isinstance(x, str):
|
if isinstance(x, str):
|
||||||
return decipher(objid, genno, x)
|
return decipher(objid, genno, x)
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k,v) in x.iteritems():
|
||||||
x[k] = decipher_all(decipher, objid, genno, v)
|
x[k] = decipher_all(decipher, objid, genno, v)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
# Type cheking
|
# Type cheking
|
||||||
def int_value(x):
|
def int_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, int):
|
if not isinstance(x, int):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('Integer required: %r' % x)
|
raise PDFTypeError('Integer required: %r' % x)
|
||||||
return 0
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def float_value(x):
|
def float_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, float):
|
if not isinstance(x, float):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('Float required: %r' % x)
|
raise PDFTypeError('Float required: %r' % x)
|
||||||
return 0.0
|
return 0.0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def num_value(x):
|
def num_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, int) or isinstance(x, float)):
|
if not (isinstance(x, int) or isinstance(x, float)):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('Int or Float required: %r' % x)
|
raise PDFTypeError('Int or Float required: %r' % x)
|
||||||
return 0
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def str_value(x):
|
def str_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, str):
|
if not isinstance(x, str):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('String required: %r' % x)
|
raise PDFTypeError('String required: %r' % x)
|
||||||
return ''
|
return ''
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def list_value(x):
|
def list_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('List required: %r' % x)
|
raise PDFTypeError('List required: %r' % x)
|
||||||
return []
|
return []
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def dict_value(x):
|
def dict_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, dict):
|
if not isinstance(x, dict):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('Dict required: %r' % x)
|
raise PDFTypeError('Dict required: %r' % x)
|
||||||
return {}
|
return {}
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def stream_value(x):
|
def stream_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, PDFStream):
|
if not isinstance(x, PDFStream):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('PDFStream required: %r' % x)
|
raise PDFTypeError('PDFStream required: %r' % x)
|
||||||
return PDFStream({}, '')
|
return PDFStream({}, '')
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
## PDFStream type
|
## PDFStream type
|
||||||
##
|
##
|
||||||
class PDFStream(PDFObject):
|
class PDFStream(PDFObject):
|
||||||
|
|
||||||
def __init__(self, dic, rawdata, decipher=None):
|
|
||||||
self.dic = dic
|
|
||||||
self.rawdata = rawdata
|
|
||||||
self.decipher = decipher
|
|
||||||
self.data = None
|
|
||||||
self.objid = None
|
|
||||||
self.genno = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def set_objid(self, objid, genno):
|
def __init__(self, dic, rawdata, decipher=None):
|
||||||
self.objid = objid
|
self.dic = dic
|
||||||
self.genno = genno
|
self.rawdata = rawdata
|
||||||
return
|
self.decipher = decipher
|
||||||
|
self.data = None
|
||||||
def __repr__(self):
|
self.objid = None
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
self.genno = None
|
||||||
|
return
|
||||||
|
|
||||||
def decomp(self,data):
|
def set_objid(self, objid, genno):
|
||||||
import zlib
|
self.objid = objid
|
||||||
buf = data
|
self.genno = genno
|
||||||
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
return
|
||||||
# end. remove chars from the end to try and decompress the buffer
|
|
||||||
while 8 <= len(buf):
|
|
||||||
try:
|
|
||||||
# will get errors if the document is encrypted.
|
|
||||||
dco = zlib.decompressobj()
|
|
||||||
return dco.decompress(buf)
|
|
||||||
except zlib.error:
|
|
||||||
buf = buf[:-1]
|
|
||||||
raise Exception, "zlib.error while decompressing data"
|
|
||||||
|
|
||||||
def decode(self):
|
def __repr__(self):
|
||||||
assert self.data == None and self.rawdata != None
|
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||||
data = self.rawdata
|
|
||||||
if self.decipher:
|
|
||||||
# Handle encryption
|
|
||||||
data = self.decipher(self.objid, self.genno, data)
|
|
||||||
if 'Filter' not in self.dic:
|
|
||||||
self.data = data
|
|
||||||
self.rawdata = None
|
|
||||||
return
|
|
||||||
filters = self.dic['Filter']
|
|
||||||
if not isinstance(filters, list):
|
|
||||||
filters = [ filters ]
|
|
||||||
for f in filters:
|
|
||||||
if f in LITERALS_FLATE_DECODE:
|
|
||||||
# will get errors if the document is encrypted.
|
|
||||||
data = self.decomp(data)
|
|
||||||
elif f in LITERALS_LZW_DECODE:
|
|
||||||
try:
|
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
from StringIO import StringIO
|
|
||||||
data = ''.join(LZWDecoder(StringIO(data)).run())
|
|
||||||
elif f in LITERALS_ASCII85_DECODE:
|
|
||||||
import ascii85
|
|
||||||
data = ascii85.ascii85decode(data)
|
|
||||||
elif f in LITERALS_ASCIIHEX_DECODE:
|
|
||||||
import ascii85
|
|
||||||
data = ascii85.asciihexdecode(data)
|
|
||||||
elif f == LITERAL_CRYPT:
|
|
||||||
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
|
||||||
else:
|
|
||||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
|
||||||
# apply predictors
|
|
||||||
if 'DP' in self.dic:
|
|
||||||
params = self.dic['DP']
|
|
||||||
else:
|
|
||||||
params = self.dic.get('DecodeParms', {})
|
|
||||||
if 'Predictor' in params:
|
|
||||||
pred = int_value(params['Predictor'])
|
|
||||||
if pred:
|
|
||||||
if pred != 12:
|
|
||||||
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
|
||||||
if 'Columns' not in params:
|
|
||||||
raise PDFValueError('Columns undefined for predictor=12')
|
|
||||||
columns = int_value(params['Columns'])
|
|
||||||
buf = ''
|
|
||||||
ent0 = '\x00' * columns
|
|
||||||
for i in xrange(0, len(data), columns+1):
|
|
||||||
pred = data[i]
|
|
||||||
ent1 = data[i+1:i+1+columns]
|
|
||||||
if pred == '\x02':
|
|
||||||
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
|
||||||
buf += ent1
|
|
||||||
ent0 = ent1
|
|
||||||
data = buf
|
|
||||||
self.data = data
|
|
||||||
self.rawdata = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_data(self):
|
def decomp(self,data):
|
||||||
if self.data == None:
|
import zlib
|
||||||
self.decode()
|
buf = data
|
||||||
return self.data
|
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
||||||
|
# end. remove chars from the end to try and decompress the buffer
|
||||||
|
while 8 <= len(buf):
|
||||||
|
try:
|
||||||
|
# will get errors if the document is encrypted.
|
||||||
|
dco = zlib.decompressobj()
|
||||||
|
return dco.decompress(buf)
|
||||||
|
except zlib.error:
|
||||||
|
buf = buf[:-1]
|
||||||
|
raise Exception, "zlib.error while decompressing data"
|
||||||
|
|
||||||
def get_rawdata(self):
|
def decode(self):
|
||||||
return self.rawdata
|
assert self.data == None and self.rawdata != None
|
||||||
|
data = self.rawdata
|
||||||
|
if self.decipher:
|
||||||
|
# Handle encryption
|
||||||
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
|
if 'Filter' not in self.dic:
|
||||||
|
self.data = data
|
||||||
|
self.rawdata = None
|
||||||
|
return
|
||||||
|
filters = self.dic['Filter']
|
||||||
|
if not isinstance(filters, list):
|
||||||
|
filters = [ filters ]
|
||||||
|
for f in filters:
|
||||||
|
if f in LITERALS_FLATE_DECODE:
|
||||||
|
# will get errors if the document is encrypted.
|
||||||
|
data = self.decomp(data)
|
||||||
|
elif f in LITERALS_LZW_DECODE:
|
||||||
|
try:
|
||||||
|
from cStringIO import StringIO
|
||||||
|
except ImportError:
|
||||||
|
from StringIO import StringIO
|
||||||
|
data = ''.join(LZWDecoder(StringIO(data)).run())
|
||||||
|
elif f in LITERALS_ASCII85_DECODE:
|
||||||
|
import ascii85
|
||||||
|
data = ascii85.ascii85decode(data)
|
||||||
|
elif f in LITERALS_ASCIIHEX_DECODE:
|
||||||
|
import ascii85
|
||||||
|
data = ascii85.asciihexdecode(data)
|
||||||
|
elif f == LITERAL_CRYPT:
|
||||||
|
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
||||||
|
else:
|
||||||
|
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||||
|
# apply predictors
|
||||||
|
if 'DP' in self.dic:
|
||||||
|
params = self.dic['DP']
|
||||||
|
else:
|
||||||
|
params = self.dic.get('DecodeParms', {})
|
||||||
|
if 'Predictor' in params:
|
||||||
|
pred = int_value(params['Predictor'])
|
||||||
|
if pred:
|
||||||
|
if pred != 12:
|
||||||
|
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||||
|
if 'Columns' not in params:
|
||||||
|
raise PDFValueError('Columns undefined for predictor=12')
|
||||||
|
columns = int_value(params['Columns'])
|
||||||
|
buf = ''
|
||||||
|
ent0 = '\x00' * columns
|
||||||
|
for i in xrange(0, len(data), columns+1):
|
||||||
|
pred = data[i]
|
||||||
|
ent1 = data[i+1:i+1+columns]
|
||||||
|
if pred == '\x02':
|
||||||
|
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
||||||
|
buf += ent1
|
||||||
|
ent0 = ent1
|
||||||
|
data = buf
|
||||||
|
self.data = data
|
||||||
|
self.rawdata = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_data(self):
|
||||||
|
if self.data == None:
|
||||||
|
self.decode()
|
||||||
|
return self.data
|
||||||
|
|
||||||
|
def get_rawdata(self):
|
||||||
|
return self.rawdata
|
||||||
|
|
1030
pdfminer/psparser.py
1030
pdfminer/psparser.py
File diff suppressed because it is too large
Load Diff
|
@ -4,7 +4,7 @@
|
||||||
#
|
#
|
||||||
# by Yusuke Shinyama
|
# by Yusuke Shinyama
|
||||||
# * public domain *
|
# * public domain *
|
||||||
#
|
#
|
||||||
|
|
||||||
import sys, os
|
import sys, os
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
|
@ -13,24 +13,24 @@ from array import array
|
||||||
|
|
||||||
# calc hash value with a given key
|
# calc hash value with a given key
|
||||||
def cdbhash(s, n=5381L):
|
def cdbhash(s, n=5381L):
|
||||||
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
|
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
|
||||||
|
|
||||||
if pack('=i',1) == pack('>i',1):
|
if pack('=i',1) == pack('>i',1):
|
||||||
# big endian
|
# big endian
|
||||||
def decode(x):
|
def decode(x):
|
||||||
a = array('I', x)
|
a = array('I', x)
|
||||||
a.byteswap()
|
a.byteswap()
|
||||||
return a
|
return a
|
||||||
def encode(a):
|
def encode(a):
|
||||||
a.byteswap()
|
a.byteswap()
|
||||||
return a.tostring()
|
return a.tostring()
|
||||||
else:
|
else:
|
||||||
# little endian
|
# little endian
|
||||||
def decode(x):
|
def decode(x):
|
||||||
a = array('I', x)
|
a = array('I', x)
|
||||||
return a
|
return a
|
||||||
def encode(a):
|
def encode(a):
|
||||||
return a.tostring()
|
return a.tostring()
|
||||||
|
|
||||||
|
|
||||||
## CDB
|
## CDB
|
||||||
|
@ -38,234 +38,234 @@ else:
|
||||||
|
|
||||||
# cdbiter
|
# cdbiter
|
||||||
def cdbiter(fp, eod):
|
def cdbiter(fp, eod):
|
||||||
kloc = 2048
|
kloc = 2048
|
||||||
while kloc < eod:
|
while kloc < eod:
|
||||||
fp.seek(kloc)
|
fp.seek(kloc)
|
||||||
(klen, vlen) = unpack('<II', fp.read(8))
|
(klen, vlen) = unpack('<II', fp.read(8))
|
||||||
k = fp.read(klen)
|
k = fp.read(klen)
|
||||||
v = fp.read(vlen)
|
v = fp.read(vlen)
|
||||||
kloc += 8+klen+vlen
|
kloc += 8+klen+vlen
|
||||||
yield (k,v)
|
yield (k,v)
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# CDBReader
|
# CDBReader
|
||||||
class CDBReader(object):
|
class CDBReader(object):
|
||||||
|
|
||||||
def __init__(self, cdbname, docache=1):
|
|
||||||
self.name = cdbname
|
|
||||||
self._fp = file(cdbname, 'rb')
|
|
||||||
hash0 = decode(self._fp.read(2048))
|
|
||||||
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
|
|
||||||
self._hash1 = [ None ] * 256
|
|
||||||
self._eod = hash0[0]
|
|
||||||
self._docache = docache
|
|
||||||
self._cache = {}
|
|
||||||
self._keyiter = None
|
|
||||||
self._eachiter = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __init__(self, cdbname, docache=1):
|
||||||
return '<CDBReader: %r>' % self.name
|
self.name = cdbname
|
||||||
|
self._fp = file(cdbname, 'rb')
|
||||||
|
hash0 = decode(self._fp.read(2048))
|
||||||
|
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
|
||||||
|
self._hash1 = [ None ] * 256
|
||||||
|
self._eod = hash0[0]
|
||||||
|
self._docache = docache
|
||||||
|
self._cache = {}
|
||||||
|
self._keyiter = None
|
||||||
|
self._eachiter = None
|
||||||
|
return
|
||||||
|
|
||||||
def __getstate__(self):
|
def __repr__(self):
|
||||||
raise TypeError
|
return '<CDBReader: %r>' % self.name
|
||||||
|
|
||||||
def __setstate__(self, dict):
|
def __getstate__(self):
|
||||||
raise TypeError
|
raise TypeError
|
||||||
|
|
||||||
def __getitem__(self, k):
|
def __setstate__(self, dict):
|
||||||
k = str(k)
|
raise TypeError
|
||||||
if k in self._cache: return self._cache[k]
|
|
||||||
h = cdbhash(k)
|
|
||||||
h1 = h & 0xff
|
|
||||||
(pos_bucket, ncells) = self._hash0[h1]
|
|
||||||
if ncells == 0: raise KeyError(k)
|
|
||||||
hs = self._hash1[h1]
|
|
||||||
if hs == None:
|
|
||||||
self._fp.seek(pos_bucket)
|
|
||||||
hs = decode(self._fp.read(ncells * 8))
|
|
||||||
self._hash1[h1] = hs
|
|
||||||
i = ((h >> 8) % ncells) * 2
|
|
||||||
n = ncells*2
|
|
||||||
for _ in xrange(ncells):
|
|
||||||
p1 = hs[i+1]
|
|
||||||
if p1 == 0: raise KeyError(k)
|
|
||||||
if hs[i] == h:
|
|
||||||
self._fp.seek(p1)
|
|
||||||
(klen, vlen) = unpack('<II', self._fp.read(8))
|
|
||||||
k1 = self._fp.read(klen)
|
|
||||||
if k1 == k:
|
|
||||||
v1 = self._fp.read(vlen)
|
|
||||||
if self._docache:
|
|
||||||
self._cache[k] = v1
|
|
||||||
return v1
|
|
||||||
i = (i+2) % n
|
|
||||||
raise KeyError(k)
|
|
||||||
|
|
||||||
def get(self, k, failed=None):
|
def __getitem__(self, k):
|
||||||
try:
|
k = str(k)
|
||||||
return self.__getitem__(k)
|
if k in self._cache: return self._cache[k]
|
||||||
except KeyError:
|
h = cdbhash(k)
|
||||||
return failed
|
h1 = h & 0xff
|
||||||
|
(pos_bucket, ncells) = self._hash0[h1]
|
||||||
|
if ncells == 0: raise KeyError(k)
|
||||||
|
hs = self._hash1[h1]
|
||||||
|
if hs == None:
|
||||||
|
self._fp.seek(pos_bucket)
|
||||||
|
hs = decode(self._fp.read(ncells * 8))
|
||||||
|
self._hash1[h1] = hs
|
||||||
|
i = ((h >> 8) % ncells) * 2
|
||||||
|
n = ncells*2
|
||||||
|
for _ in xrange(ncells):
|
||||||
|
p1 = hs[i+1]
|
||||||
|
if p1 == 0: raise KeyError(k)
|
||||||
|
if hs[i] == h:
|
||||||
|
self._fp.seek(p1)
|
||||||
|
(klen, vlen) = unpack('<II', self._fp.read(8))
|
||||||
|
k1 = self._fp.read(klen)
|
||||||
|
if k1 == k:
|
||||||
|
v1 = self._fp.read(vlen)
|
||||||
|
if self._docache:
|
||||||
|
self._cache[k] = v1
|
||||||
|
return v1
|
||||||
|
i = (i+2) % n
|
||||||
|
raise KeyError(k)
|
||||||
|
|
||||||
def has_key(self, k):
|
def get(self, k, failed=None):
|
||||||
try:
|
try:
|
||||||
self.__getitem__(k)
|
return self.__getitem__(k)
|
||||||
return True
|
except KeyError:
|
||||||
except KeyError:
|
return failed
|
||||||
return False
|
|
||||||
|
|
||||||
def __contains__(self, k):
|
def has_key(self, k):
|
||||||
return self.has_key(k)
|
try:
|
||||||
|
self.__getitem__(k)
|
||||||
|
return True
|
||||||
|
except KeyError:
|
||||||
|
return False
|
||||||
|
|
||||||
def firstkey(self):
|
def __contains__(self, k):
|
||||||
self._keyiter = None
|
return self.has_key(k)
|
||||||
return self.nextkey()
|
|
||||||
|
|
||||||
def nextkey(self):
|
|
||||||
if not self._keyiter:
|
|
||||||
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
|
||||||
try:
|
|
||||||
return self._keyiter.next()
|
|
||||||
except StopIteration:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def each(self):
|
def firstkey(self):
|
||||||
if not self._eachiter:
|
self._keyiter = None
|
||||||
self._eachiter = cdbiter(self._fp, self._eod)
|
return self.nextkey()
|
||||||
try:
|
|
||||||
return self._eachiter.next()
|
def nextkey(self):
|
||||||
except StopIteration:
|
if not self._keyiter:
|
||||||
return None
|
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
||||||
|
try:
|
||||||
def iterkeys(self):
|
return self._keyiter.next()
|
||||||
return ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
except StopIteration:
|
||||||
def itervalues(self):
|
return None
|
||||||
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
|
|
||||||
def iteritems(self):
|
def each(self):
|
||||||
return cdbiter(self._fp, self._eod)
|
if not self._eachiter:
|
||||||
|
self._eachiter = cdbiter(self._fp, self._eod)
|
||||||
|
try:
|
||||||
|
return self._eachiter.next()
|
||||||
|
except StopIteration:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def iterkeys(self):
|
||||||
|
return ( k for (k,v) in cdbiter(self._fp, self._eod) )
|
||||||
|
def itervalues(self):
|
||||||
|
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
|
||||||
|
def iteritems(self):
|
||||||
|
return cdbiter(self._fp, self._eod)
|
||||||
|
|
||||||
|
|
||||||
# CDBMaker
|
# CDBMaker
|
||||||
class CDBMaker(object):
|
class CDBMaker(object):
|
||||||
|
|
||||||
def __init__(self, cdbname, tmpname):
|
def __init__(self, cdbname, tmpname):
|
||||||
self.fn = cdbname
|
self.fn = cdbname
|
||||||
self.fntmp = tmpname
|
self.fntmp = tmpname
|
||||||
self.numentries = 0
|
self.numentries = 0
|
||||||
self._fp = file(tmpname, 'wb')
|
self._fp = file(tmpname, 'wb')
|
||||||
self._pos = 2048 # sizeof((h,p))*256
|
self._pos = 2048 # sizeof((h,p))*256
|
||||||
self._bucket = [ array('I') for _ in xrange(256) ]
|
self._bucket = [ array('I') for _ in xrange(256) ]
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
|
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.numentries
|
return self.numentries
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
raise TypeError
|
raise TypeError
|
||||||
|
|
||||||
def __setstate__(self, dict):
|
def __setstate__(self, dict):
|
||||||
raise TypeError
|
raise TypeError
|
||||||
|
|
||||||
def add(self, k, v):
|
def add(self, k, v):
|
||||||
(k, v) = (str(k), str(v))
|
(k, v) = (str(k), str(v))
|
||||||
(klen, vlen) = (len(k), len(v))
|
(klen, vlen) = (len(k), len(v))
|
||||||
self._fp.seek(self._pos)
|
self._fp.seek(self._pos)
|
||||||
self._fp.write(pack('<II', klen, vlen))
|
self._fp.write(pack('<II', klen, vlen))
|
||||||
self._fp.write(k)
|
self._fp.write(k)
|
||||||
self._fp.write(v)
|
self._fp.write(v)
|
||||||
h = cdbhash(k)
|
h = cdbhash(k)
|
||||||
b = self._bucket[h % 256]
|
b = self._bucket[h % 256]
|
||||||
b.append(h)
|
b.append(h)
|
||||||
b.append(self._pos)
|
b.append(self._pos)
|
||||||
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
||||||
self._pos += 8+klen+vlen
|
self._pos += 8+klen+vlen
|
||||||
self.numentries += 1
|
self.numentries += 1
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def finish(self):
|
|
||||||
self._fp.seek(self._pos)
|
|
||||||
pos_hash = self._pos
|
|
||||||
# write hashes
|
|
||||||
for b1 in self._bucket:
|
|
||||||
if not b1: continue
|
|
||||||
blen = len(b1)
|
|
||||||
a = array('I', [0]*blen*2)
|
|
||||||
for j in xrange(0, blen, 2):
|
|
||||||
(h,p) = (b1[j],b1[j+1])
|
|
||||||
i = ((h >> 8) % blen)*2
|
|
||||||
while a[i+1]: # is cell[i] already occupied?
|
|
||||||
i = (i+2) % len(a)
|
|
||||||
a[i] = h
|
|
||||||
a[i+1] = p
|
|
||||||
self._fp.write(encode(a))
|
|
||||||
# write header
|
|
||||||
self._fp.seek(0)
|
|
||||||
a = array('I')
|
|
||||||
for b1 in self._bucket:
|
|
||||||
a.append(pos_hash)
|
|
||||||
a.append(len(b1))
|
|
||||||
pos_hash += len(b1)*8
|
|
||||||
self._fp.write(encode(a))
|
|
||||||
# close
|
|
||||||
self._fp.close()
|
|
||||||
os.rename(self.fntmp, self.fn)
|
|
||||||
return
|
|
||||||
|
|
||||||
# txt2cdb
|
def finish(self):
|
||||||
def txt2cdb(self, lines):
|
self._fp.seek(self._pos)
|
||||||
import re
|
pos_hash = self._pos
|
||||||
HEAD = re.compile(r'^\+(\d+),(\d+):')
|
# write hashes
|
||||||
for line in lines:
|
for b1 in self._bucket:
|
||||||
m = HEAD.match(line)
|
if not b1: continue
|
||||||
if not m: break
|
blen = len(b1)
|
||||||
(klen, vlen) = (int(m.group(1)), int(m.group(2)))
|
a = array('I', [0]*blen*2)
|
||||||
i = len(m.group(0))
|
for j in xrange(0, blen, 2):
|
||||||
k = line[i:i+klen]
|
(h,p) = (b1[j],b1[j+1])
|
||||||
i += klen
|
i = ((h >> 8) % blen)*2
|
||||||
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
while a[i+1]: # is cell[i] already occupied?
|
||||||
i += 2
|
i = (i+2) % len(a)
|
||||||
v = line[i:i+vlen]
|
a[i] = h
|
||||||
self.add(k, v)
|
a[i+1] = p
|
||||||
return self
|
self._fp.write(encode(a))
|
||||||
|
# write header
|
||||||
|
self._fp.seek(0)
|
||||||
|
a = array('I')
|
||||||
|
for b1 in self._bucket:
|
||||||
|
a.append(pos_hash)
|
||||||
|
a.append(len(b1))
|
||||||
|
pos_hash += len(b1)*8
|
||||||
|
self._fp.write(encode(a))
|
||||||
|
# close
|
||||||
|
self._fp.close()
|
||||||
|
os.rename(self.fntmp, self.fn)
|
||||||
|
return
|
||||||
|
|
||||||
|
# txt2cdb
|
||||||
|
def txt2cdb(self, lines):
|
||||||
|
import re
|
||||||
|
HEAD = re.compile(r'^\+(\d+),(\d+):')
|
||||||
|
for line in lines:
|
||||||
|
m = HEAD.match(line)
|
||||||
|
if not m: break
|
||||||
|
(klen, vlen) = (int(m.group(1)), int(m.group(2)))
|
||||||
|
i = len(m.group(0))
|
||||||
|
k = line[i:i+klen]
|
||||||
|
i += klen
|
||||||
|
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
||||||
|
i += 2
|
||||||
|
v = line[i:i+vlen]
|
||||||
|
self.add(k, v)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
# cdbdump
|
# cdbdump
|
||||||
def cdbdump(cdbname):
|
def cdbdump(cdbname):
|
||||||
fp = file(cdbname, 'rb')
|
fp = file(cdbname, 'rb')
|
||||||
(eor,) = unpack('<I', fp.read(4))
|
(eor,) = unpack('<I', fp.read(4))
|
||||||
return cdbiter(fp, eor)
|
return cdbiter(fp, eor)
|
||||||
|
|
||||||
|
|
||||||
# cdbmerge
|
# cdbmerge
|
||||||
def cdbmerge(iters):
|
def cdbmerge(iters):
|
||||||
q = []
|
q = []
|
||||||
for it in iters:
|
for it in iters:
|
||||||
try:
|
try:
|
||||||
q.append((it.next(),it))
|
q.append((it.next(),it))
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
pass
|
||||||
k0 = None
|
k0 = None
|
||||||
vs = None
|
vs = None
|
||||||
while q:
|
while q:
|
||||||
q.sort()
|
q.sort()
|
||||||
((k,v),it) = q.pop(0)
|
((k,v),it) = q.pop(0)
|
||||||
if k0 != k:
|
if k0 != k:
|
||||||
if vs: yield (k0,vs)
|
if vs: yield (k0,vs)
|
||||||
vs = []
|
vs = []
|
||||||
vs.append(v)
|
vs.append(v)
|
||||||
k0 = k
|
k0 = k
|
||||||
try:
|
try:
|
||||||
q.append((it.next(),it))
|
q.append((it.next(),it))
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
continue
|
continue
|
||||||
if vs: yield (k0,vs)
|
if vs: yield (k0,vs)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# aliases
|
# aliases
|
||||||
|
@ -278,132 +278,132 @@ init = CDBReader
|
||||||
|
|
||||||
# tcdbiter
|
# tcdbiter
|
||||||
def tcdbiter(fp, eor):
|
def tcdbiter(fp, eor):
|
||||||
locs = {}
|
locs = {}
|
||||||
fp.seek(eor)
|
fp.seek(eor)
|
||||||
while 1:
|
while 1:
|
||||||
x = fp.read(8)
|
x = fp.read(8)
|
||||||
if not x: break
|
if not x: break
|
||||||
(h, pos) = unpack('<II', x)
|
(h, pos) = unpack('<II', x)
|
||||||
if pos: locs[pos] = h
|
if pos: locs[pos] = h
|
||||||
pos = 2048
|
pos = 2048
|
||||||
fp.seek(pos)
|
fp.seek(pos)
|
||||||
key = ()
|
key = ()
|
||||||
parents = [0]
|
parents = [0]
|
||||||
while pos < eor:
|
while pos < eor:
|
||||||
(klen, vlen) = unpack('<II', fp.read(8))
|
(klen, vlen) = unpack('<II', fp.read(8))
|
||||||
k = fp.read(klen)
|
k = fp.read(klen)
|
||||||
v = fp.read(vlen)
|
v = fp.read(vlen)
|
||||||
h = locs[pos]
|
h = locs[pos]
|
||||||
for (i,p) in enumerate(parents):
|
for (i,p) in enumerate(parents):
|
||||||
if cdbhash(k, p+5381L) == h:
|
if cdbhash(k, p+5381L) == h:
|
||||||
parents = parents[:i+1]
|
parents = parents[:i+1]
|
||||||
key = key[:i]
|
key = key[:i]
|
||||||
break
|
break
|
||||||
key += (k,)
|
key += (k,)
|
||||||
yield (key, v)
|
yield (key, v)
|
||||||
parents.append(pos)
|
parents.append(pos)
|
||||||
pos += 8+klen+vlen
|
pos += 8+klen+vlen
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# TCDBMaker
|
# TCDBMaker
|
||||||
class TCDBMaker(CDBMaker):
|
class TCDBMaker(CDBMaker):
|
||||||
|
|
||||||
def __init__(self, cdbname, tmpname):
|
def __init__(self, cdbname, tmpname):
|
||||||
CDBMaker.__init__(self, cdbname, tmpname)
|
CDBMaker.__init__(self, cdbname, tmpname)
|
||||||
self._parent = 0
|
self._parent = 0
|
||||||
self._stack = [self._parent]
|
self._stack = [self._parent]
|
||||||
return
|
return
|
||||||
|
|
||||||
def put(self, depth, k, v):
|
def put(self, depth, k, v):
|
||||||
if depth == len(self._stack)+1:
|
if depth == len(self._stack)+1:
|
||||||
self._stack.append(self._parent)
|
self._stack.append(self._parent)
|
||||||
elif depth < len(self._stack):
|
elif depth < len(self._stack):
|
||||||
self._stack = self._stack[:depth]
|
self._stack = self._stack[:depth]
|
||||||
elif depth != len(self._stack):
|
elif depth != len(self._stack):
|
||||||
raise ValueError('invalid depth: %d' % depth)
|
raise ValueError('invalid depth: %d' % depth)
|
||||||
#
|
#
|
||||||
(k, v) = (str(k), str(v))
|
(k, v) = (str(k), str(v))
|
||||||
(klen, vlen) = (len(k), len(v))
|
(klen, vlen) = (len(k), len(v))
|
||||||
self._parent = self._pos
|
self._parent = self._pos
|
||||||
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
|
||||||
self._fp.seek(self._pos)
|
self._fp.seek(self._pos)
|
||||||
self._fp.write(pack('<II', klen, vlen))
|
self._fp.write(pack('<II', klen, vlen))
|
||||||
self._fp.write(k)
|
self._fp.write(k)
|
||||||
self._fp.write(v)
|
self._fp.write(v)
|
||||||
self._pos += 4+4+klen+vlen
|
self._pos += 4+4+klen+vlen
|
||||||
h = cdbhash(k, self._stack[-1]+5381L)
|
h = cdbhash(k, self._stack[-1]+5381L)
|
||||||
b = self._bucket[h % 256]
|
b = self._bucket[h % 256]
|
||||||
b.append(h)
|
b.append(h)
|
||||||
b.append(self._parent)
|
b.append(self._parent)
|
||||||
self.numentries += 1
|
self.numentries += 1
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def txt2tcdb(self, lines):
|
def txt2tcdb(self, lines):
|
||||||
import re
|
import re
|
||||||
HEAD = re.compile(r'^(\++)(\d+),(\d+):')
|
HEAD = re.compile(r'^(\++)(\d+),(\d+):')
|
||||||
for line in lines:
|
for line in lines:
|
||||||
m = HEAD.match(line)
|
m = HEAD.match(line)
|
||||||
if not m: break
|
if not m: break
|
||||||
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
|
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
|
||||||
i = len(m.group(0))
|
i = len(m.group(0))
|
||||||
k = line[i:i+klen]
|
k = line[i:i+klen]
|
||||||
i += klen
|
i += klen
|
||||||
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
|
||||||
i += 2
|
i += 2
|
||||||
v = line[i:i+vlen]
|
v = line[i:i+vlen]
|
||||||
self.put(depth, k, v)
|
self.put(depth, k, v)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
# TCDBReader
|
# TCDBReader
|
||||||
class TCDBReader(CDBReader):
|
class TCDBReader(CDBReader):
|
||||||
|
|
||||||
def lookup(self, seq, parent=0L):
|
def lookup(self, seq, parent=0L):
|
||||||
r = []
|
r = []
|
||||||
for k in seq:
|
for k in seq:
|
||||||
(v, parent) = self.lookup1(k, parent)
|
(v, parent) = self.lookup1(k, parent)
|
||||||
r.append(v)
|
r.append(v)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def lookup1(self, k, parent=0L):
|
def lookup1(self, k, parent=0L):
|
||||||
k = str(k)
|
k = str(k)
|
||||||
if self._docache and (parent,k) in self._cache:
|
if self._docache and (parent,k) in self._cache:
|
||||||
return self._cache[(parent,k)]
|
return self._cache[(parent,k)]
|
||||||
h = cdbhash(k, parent+5381L)
|
h = cdbhash(k, parent+5381L)
|
||||||
self._fp.seek((h % 256) << 3)
|
self._fp.seek((h % 256) << 3)
|
||||||
(pos_bucket, ncells) = unpack('<II', self._fp.read(8))
|
(pos_bucket, ncells) = unpack('<II', self._fp.read(8))
|
||||||
if ncells == 0: raise KeyError(k)
|
if ncells == 0: raise KeyError(k)
|
||||||
start = (h >> 8) % ncells
|
start = (h >> 8) % ncells
|
||||||
for i in xrange(ncells):
|
for i in xrange(ncells):
|
||||||
self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
|
self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
|
||||||
(h1, p1) = unpack('<II', self._fp.read(8))
|
(h1, p1) = unpack('<II', self._fp.read(8))
|
||||||
if p1 == 0: raise KeyError(k)
|
if p1 == 0: raise KeyError(k)
|
||||||
if h1 == h:
|
if h1 == h:
|
||||||
self._fp.seek(p1)
|
self._fp.seek(p1)
|
||||||
(klen, vlen) = unpack('<II', self._fp.read(8))
|
(klen, vlen) = unpack('<II', self._fp.read(8))
|
||||||
k1 = self._fp.read(klen)
|
k1 = self._fp.read(klen)
|
||||||
if k1 == k:
|
if k1 == k:
|
||||||
v1 = self._fp.read(vlen)
|
v1 = self._fp.read(vlen)
|
||||||
if self._docache:
|
if self._docache:
|
||||||
self._cache[(parent,k)] = (v1,p1)
|
self._cache[(parent,k)] = (v1,p1)
|
||||||
return (v1,p1)
|
return (v1,p1)
|
||||||
raise KeyError(k)
|
raise KeyError(k)
|
||||||
|
|
||||||
def iterkeys(self):
|
def iterkeys(self):
|
||||||
return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
|
return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
|
||||||
def itervalues(self):
|
def itervalues(self):
|
||||||
return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
|
return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
|
||||||
def iteritems(self):
|
def iteritems(self):
|
||||||
return tcdbiter(self._fp, self._eod)
|
return tcdbiter(self._fp, self._eod)
|
||||||
|
|
||||||
|
|
||||||
# tcdbdump
|
# tcdbdump
|
||||||
def tcdbdump(cdbname):
|
def tcdbdump(cdbname):
|
||||||
fp = file(cdbname, 'rb')
|
fp = file(cdbname, 'rb')
|
||||||
(eor,) = unpack('<I', fp.read(4))
|
(eor,) = unpack('<I', fp.read(4))
|
||||||
return tcdbiter(fp, eor)
|
return tcdbiter(fp, eor)
|
||||||
|
|
||||||
|
|
||||||
# aliases
|
# aliases
|
||||||
|
@ -414,64 +414,64 @@ tcdbmerge = cdbmerge
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt, fileinput
|
import getopt, fileinput
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
|
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
|
||||||
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
|
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
cmd = args.pop(0)
|
cmd = args.pop(0)
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(args, 'kv2')
|
(opts, args) = getopt.getopt(args, 'kv2')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
dbname = args.pop(0)
|
dbname = args.pop(0)
|
||||||
|
|
||||||
# cdb
|
# cdb
|
||||||
if cmd == 'cmake':
|
if cmd == 'cmake':
|
||||||
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
|
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
|
||||||
elif cmd == 'cget':
|
elif cmd == 'cget':
|
||||||
print repr(CDBReader(dbname).get(args[0]))
|
print repr(CDBReader(dbname).get(args[0]))
|
||||||
elif cmd == 'cdump':
|
elif cmd == 'cdump':
|
||||||
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
|
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-k': f = (lambda k,_: k)
|
if k == '-k': f = (lambda k,_: k)
|
||||||
elif k == '-v': f = (lambda _,v: v)
|
elif k == '-v': f = (lambda _,v: v)
|
||||||
elif k == '-2': f = (lambda k,v: k+'\t'+v)
|
elif k == '-2': f = (lambda k,v: k+'\t'+v)
|
||||||
for (k,v) in cdbdump(dbname):
|
for (k,v) in cdbdump(dbname):
|
||||||
print f(k,v)
|
print f(k,v)
|
||||||
print
|
print
|
||||||
elif cmd == 'cmerge':
|
elif cmd == 'cmerge':
|
||||||
dbs = [ cdbdump(fname) for fname in args ]
|
dbs = [ cdbdump(fname) for fname in args ]
|
||||||
m = CDBMaker(dbname, dbname+'.tmp')
|
m = CDBMaker(dbname, dbname+'.tmp')
|
||||||
for (k,vs) in tcdbmerge(dbs):
|
for (k,vs) in tcdbmerge(dbs):
|
||||||
m.add(k, ' '.join(vs))
|
m.add(k, ' '.join(vs))
|
||||||
m.finish()
|
m.finish()
|
||||||
# tcdb
|
# tcdb
|
||||||
elif cmd == 'tmake':
|
elif cmd == 'tmake':
|
||||||
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
|
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
|
||||||
elif cmd == 'tget':
|
elif cmd == 'tget':
|
||||||
print repr(TCDBReader(dbname).lookup(args))
|
print repr(TCDBReader(dbname).lookup(args))
|
||||||
elif cmd == 'tdump':
|
elif cmd == 'tdump':
|
||||||
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
|
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-k': f = (lambda k,_: '/'.join(k))
|
if k == '-k': f = (lambda k,_: '/'.join(k))
|
||||||
elif k == '-v': f = (lambda _,v: v)
|
elif k == '-v': f = (lambda _,v: v)
|
||||||
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
|
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
|
||||||
for (k,v) in tcdbdump(dbname):
|
for (k,v) in tcdbdump(dbname):
|
||||||
print f(k,v)
|
print f(k,v)
|
||||||
print
|
print
|
||||||
elif cmd == 'tmerge':
|
elif cmd == 'tmerge':
|
||||||
dbs = [ tcdbdump(fname) for fname in args ]
|
dbs = [ tcdbdump(fname) for fname in args ]
|
||||||
m = TCDBMaker(dbname, dbname+'.tmp')
|
m = TCDBMaker(dbname, dbname+'.tmp')
|
||||||
for (k,vs) in tcdbmerge(dbs):
|
for (k,vs) in tcdbmerge(dbs):
|
||||||
m.put(len(k), k[-1], ' '.join(vs))
|
m.put(len(k), k[-1], ' '.join(vs))
|
||||||
m.finish()
|
m.finish()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -691,88 +691,88 @@ rcon = [
|
||||||
]
|
]
|
||||||
|
|
||||||
if len(pack('L',0)) == 4:
|
if len(pack('L',0)) == 4:
|
||||||
# 32bit
|
# 32bit
|
||||||
def GETU32(x): return unpack('>L', x)[0]
|
def GETU32(x): return unpack('>L', x)[0]
|
||||||
def PUTU32(x): return pack('>L', x)
|
def PUTU32(x): return pack('>L', x)
|
||||||
else:
|
else:
|
||||||
# 64bit
|
# 64bit
|
||||||
def GETU32(x): return unpack('>I', x)[0]
|
def GETU32(x): return unpack('>I', x)[0]
|
||||||
def PUTU32(x): return pack('>I', x)
|
def PUTU32(x): return pack('>I', x)
|
||||||
|
|
||||||
# Expand the cipher key into the encryption key schedule.
|
# Expand the cipher key into the encryption key schedule.
|
||||||
#
|
#
|
||||||
# @return the number of rounds for the given cipher key size.
|
# @return the number of rounds for the given cipher key size.
|
||||||
def rijndaelSetupEncrypt(key, keybits):
|
def rijndaelSetupEncrypt(key, keybits):
|
||||||
i = p = 0
|
i = p = 0
|
||||||
rk = [0]*RKLENGTH(keybits)
|
rk = [0]*RKLENGTH(keybits)
|
||||||
rk[0] = GETU32(key[0:4])
|
rk[0] = GETU32(key[0:4])
|
||||||
rk[1] = GETU32(key[4:8])
|
rk[1] = GETU32(key[4:8])
|
||||||
rk[2] = GETU32(key[8:12])
|
rk[2] = GETU32(key[8:12])
|
||||||
rk[3] = GETU32(key[12:16])
|
rk[3] = GETU32(key[12:16])
|
||||||
if keybits == 128:
|
if keybits == 128:
|
||||||
while 1:
|
while 1:
|
||||||
temp = rk[p+3]
|
temp = rk[p+3]
|
||||||
rk[p+4] = (rk[p+0] ^
|
rk[p+4] = (rk[p+0] ^
|
||||||
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
||||||
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
||||||
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
||||||
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
||||||
rcon[i])
|
rcon[i])
|
||||||
rk[p+5] = rk[p+1] ^ rk[p+4]
|
rk[p+5] = rk[p+1] ^ rk[p+4]
|
||||||
rk[p+6] = rk[p+2] ^ rk[p+5]
|
rk[p+6] = rk[p+2] ^ rk[p+5]
|
||||||
rk[p+7] = rk[p+3] ^ rk[p+6]
|
rk[p+7] = rk[p+3] ^ rk[p+6]
|
||||||
i += 1
|
i += 1
|
||||||
if i == 10: return (rk, 10)
|
if i == 10: return (rk, 10)
|
||||||
p += 4
|
p += 4
|
||||||
|
|
||||||
rk[4] = GETU32(key[16:20])
|
rk[4] = GETU32(key[16:20])
|
||||||
rk[5] = GETU32(key[20:24])
|
rk[5] = GETU32(key[20:24])
|
||||||
if keybits == 192:
|
if keybits == 192:
|
||||||
while 1:
|
while 1:
|
||||||
temp = rk[p+5]
|
temp = rk[p+5]
|
||||||
rk[p+6] = (rk[p+0] ^
|
rk[p+6] = (rk[p+0] ^
|
||||||
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
||||||
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
||||||
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
||||||
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
||||||
rcon[i])
|
rcon[i])
|
||||||
rk[p+7] = rk[p+1] ^ rk[p+6]
|
rk[p+7] = rk[p+1] ^ rk[p+6]
|
||||||
rk[p+8] = rk[p+2] ^ rk[p+7]
|
rk[p+8] = rk[p+2] ^ rk[p+7]
|
||||||
rk[p+9] = rk[p+3] ^ rk[p+8]
|
rk[p+9] = rk[p+3] ^ rk[p+8]
|
||||||
i += 1
|
i += 1
|
||||||
if i == 8: return (rk, 12)
|
if i == 8: return (rk, 12)
|
||||||
rk[p+10] = rk[p+4] ^ rk[p+9]
|
rk[p+10] = rk[p+4] ^ rk[p+9]
|
||||||
rk[p+11] = rk[p+5] ^ rk[p+10]
|
rk[p+11] = rk[p+5] ^ rk[p+10]
|
||||||
p += 6
|
p += 6
|
||||||
|
|
||||||
rk[6] = GETU32(key[24:28])
|
rk[6] = GETU32(key[24:28])
|
||||||
rk[7] = GETU32(key[28:32])
|
rk[7] = GETU32(key[28:32])
|
||||||
if keybits == 256:
|
if keybits == 256:
|
||||||
while 1:
|
while 1:
|
||||||
temp = rk[p+7]
|
temp = rk[p+7]
|
||||||
rk[p+8] = (rk[p+0] ^
|
rk[p+8] = (rk[p+0] ^
|
||||||
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
|
||||||
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
(Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
|
||||||
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
(Te4[(temp ) & 0xff] & 0x0000ff00) ^
|
||||||
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
(Te4[(temp >> 24) ] & 0x000000ff) ^
|
||||||
rcon[i])
|
rcon[i])
|
||||||
rk[p+9] = rk[p+1] ^ rk[p+8]
|
rk[p+9] = rk[p+1] ^ rk[p+8]
|
||||||
rk[p+10] = rk[p+2] ^ rk[p+9]
|
rk[p+10] = rk[p+2] ^ rk[p+9]
|
||||||
rk[p+11] = rk[p+3] ^ rk[p+10]
|
rk[p+11] = rk[p+3] ^ rk[p+10]
|
||||||
i += 1
|
i += 1
|
||||||
if i == 7: return (rk, 14)
|
if i == 7: return (rk, 14)
|
||||||
temp = rk[p+11]
|
temp = rk[p+11]
|
||||||
rk[p+12] = (rk[p+4] ^
|
rk[p+12] = (rk[p+4] ^
|
||||||
(Te4[(temp >> 24) ] & 0xff000000) ^
|
(Te4[(temp >> 24) ] & 0xff000000) ^
|
||||||
(Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
|
(Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
(Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
|
(Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
(Te4[(temp ) & 0xff] & 0x000000ff))
|
(Te4[(temp ) & 0xff] & 0x000000ff))
|
||||||
rk[p+13] = rk[p+5] ^ rk[p+12]
|
rk[p+13] = rk[p+5] ^ rk[p+12]
|
||||||
rk[p+14] = rk[p+6] ^ rk[p+13]
|
rk[p+14] = rk[p+6] ^ rk[p+13]
|
||||||
rk[p+15] = rk[p+7] ^ rk[p+14]
|
rk[p+15] = rk[p+7] ^ rk[p+14]
|
||||||
p += 8
|
p += 8
|
||||||
|
|
||||||
raise ValueError(keybits)
|
raise ValueError(keybits)
|
||||||
|
|
||||||
|
|
||||||
# Expand the cipher key into the decryption key schedule.
|
# Expand the cipher key into the decryption key schedule.
|
||||||
|
@ -780,291 +780,291 @@ def rijndaelSetupEncrypt(key, keybits):
|
||||||
# @return the number of rounds for the given cipher key size.
|
# @return the number of rounds for the given cipher key size.
|
||||||
def rijndaelSetupDecrypt(key, keybits):
|
def rijndaelSetupDecrypt(key, keybits):
|
||||||
|
|
||||||
# expand the cipher key:
|
# expand the cipher key:
|
||||||
(rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
|
(rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
|
||||||
# invert the order of the round keys:
|
# invert the order of the round keys:
|
||||||
i = 0
|
i = 0
|
||||||
j = 4*nrounds
|
j = 4*nrounds
|
||||||
while i < j:
|
while i < j:
|
||||||
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp
|
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp
|
||||||
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp
|
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp
|
||||||
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp
|
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp
|
||||||
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp
|
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp
|
||||||
i += 4
|
i += 4
|
||||||
j -= 4
|
j -= 4
|
||||||
# apply the inverse MixColumn transform to all round keys but the first and the last:
|
# apply the inverse MixColumn transform to all round keys but the first and the last:
|
||||||
p = 0
|
p = 0
|
||||||
for i in xrange(1, nrounds):
|
for i in xrange(1, nrounds):
|
||||||
p += 4
|
p += 4
|
||||||
rk[p+0] = (
|
rk[p+0] = (
|
||||||
Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^
|
Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^
|
||||||
Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^
|
Td1[Te4[(rk[p+0] >> 16) & 0xff] & 0xff] ^
|
||||||
Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^
|
Td2[Te4[(rk[p+0] >> 8) & 0xff] & 0xff] ^
|
||||||
Td3[Te4[(rk[p+0] ) & 0xff] & 0xff])
|
Td3[Te4[(rk[p+0] ) & 0xff] & 0xff])
|
||||||
rk[p+1] = (
|
rk[p+1] = (
|
||||||
Td0[Te4[(rk[p+1] >> 24) ] & 0xff] ^
|
Td0[Te4[(rk[p+1] >> 24) ] & 0xff] ^
|
||||||
Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^
|
Td1[Te4[(rk[p+1] >> 16) & 0xff] & 0xff] ^
|
||||||
Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^
|
Td2[Te4[(rk[p+1] >> 8) & 0xff] & 0xff] ^
|
||||||
Td3[Te4[(rk[p+1] ) & 0xff] & 0xff])
|
Td3[Te4[(rk[p+1] ) & 0xff] & 0xff])
|
||||||
rk[p+2] = (
|
rk[p+2] = (
|
||||||
Td0[Te4[(rk[p+2] >> 24) ] & 0xff] ^
|
Td0[Te4[(rk[p+2] >> 24) ] & 0xff] ^
|
||||||
Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^
|
Td1[Te4[(rk[p+2] >> 16) & 0xff] & 0xff] ^
|
||||||
Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^
|
Td2[Te4[(rk[p+2] >> 8) & 0xff] & 0xff] ^
|
||||||
Td3[Te4[(rk[p+2] ) & 0xff] & 0xff])
|
Td3[Te4[(rk[p+2] ) & 0xff] & 0xff])
|
||||||
rk[p+3] = (
|
rk[p+3] = (
|
||||||
Td0[Te4[(rk[p+3] >> 24) ] & 0xff] ^
|
Td0[Te4[(rk[p+3] >> 24) ] & 0xff] ^
|
||||||
Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^
|
Td1[Te4[(rk[p+3] >> 16) & 0xff] & 0xff] ^
|
||||||
Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^
|
Td2[Te4[(rk[p+3] >> 8) & 0xff] & 0xff] ^
|
||||||
Td3[Te4[(rk[p+3] ) & 0xff] & 0xff])
|
Td3[Te4[(rk[p+3] ) & 0xff] & 0xff])
|
||||||
|
|
||||||
return (rk, nrounds)
|
return (rk, nrounds)
|
||||||
|
|
||||||
|
|
||||||
def rijndaelEncrypt(rk, nrounds, plaintext):
|
def rijndaelEncrypt(rk, nrounds, plaintext):
|
||||||
assert len(plaintext) == 16
|
assert len(plaintext) == 16
|
||||||
|
|
||||||
# map byte array block to cipher state
|
# map byte array block to cipher state
|
||||||
# and add initial round key:
|
# and add initial round key:
|
||||||
s0 = GETU32(plaintext[0:4]) ^ rk[0]
|
s0 = GETU32(plaintext[0:4]) ^ rk[0]
|
||||||
s1 = GETU32(plaintext[4:8]) ^ rk[1]
|
s1 = GETU32(plaintext[4:8]) ^ rk[1]
|
||||||
s2 = GETU32(plaintext[8:12]) ^ rk[2]
|
s2 = GETU32(plaintext[8:12]) ^ rk[2]
|
||||||
s3 = GETU32(plaintext[12:16]) ^ rk[3]
|
s3 = GETU32(plaintext[12:16]) ^ rk[3]
|
||||||
|
|
||||||
# nrounds - 1 full rounds:
|
# nrounds - 1 full rounds:
|
||||||
r = nrounds >> 1
|
r = nrounds >> 1
|
||||||
p = 0
|
p = 0
|
||||||
while 1:
|
while 1:
|
||||||
t0 = (
|
t0 = (
|
||||||
Te0[(s0 >> 24) ] ^
|
Te0[(s0 >> 24) ] ^
|
||||||
Te1[(s1 >> 16) & 0xff] ^
|
Te1[(s1 >> 16) & 0xff] ^
|
||||||
Te2[(s2 >> 8) & 0xff] ^
|
Te2[(s2 >> 8) & 0xff] ^
|
||||||
Te3[(s3 ) & 0xff] ^
|
Te3[(s3 ) & 0xff] ^
|
||||||
rk[p+4])
|
rk[p+4])
|
||||||
t1 = (
|
t1 = (
|
||||||
Te0[(s1 >> 24) ] ^
|
Te0[(s1 >> 24) ] ^
|
||||||
Te1[(s2 >> 16) & 0xff] ^
|
Te1[(s2 >> 16) & 0xff] ^
|
||||||
Te2[(s3 >> 8) & 0xff] ^
|
Te2[(s3 >> 8) & 0xff] ^
|
||||||
Te3[(s0 ) & 0xff] ^
|
Te3[(s0 ) & 0xff] ^
|
||||||
rk[p+5])
|
rk[p+5])
|
||||||
t2 = (
|
t2 = (
|
||||||
Te0[(s2 >> 24) ] ^
|
Te0[(s2 >> 24) ] ^
|
||||||
Te1[(s3 >> 16) & 0xff] ^
|
Te1[(s3 >> 16) & 0xff] ^
|
||||||
Te2[(s0 >> 8) & 0xff] ^
|
Te2[(s0 >> 8) & 0xff] ^
|
||||||
Te3[(s1 ) & 0xff] ^
|
Te3[(s1 ) & 0xff] ^
|
||||||
rk[p+6])
|
rk[p+6])
|
||||||
t3 = (
|
t3 = (
|
||||||
Te0[(s3 >> 24) ] ^
|
Te0[(s3 >> 24) ] ^
|
||||||
Te1[(s0 >> 16) & 0xff] ^
|
Te1[(s0 >> 16) & 0xff] ^
|
||||||
Te2[(s1 >> 8) & 0xff] ^
|
Te2[(s1 >> 8) & 0xff] ^
|
||||||
Te3[(s2 ) & 0xff] ^
|
Te3[(s2 ) & 0xff] ^
|
||||||
rk[p+7])
|
rk[p+7])
|
||||||
p += 8
|
p += 8
|
||||||
r -= 1
|
r -= 1
|
||||||
if r == 0: break
|
if r == 0: break
|
||||||
|
s0 = (
|
||||||
|
Te0[(t0 >> 24) ] ^
|
||||||
|
Te1[(t1 >> 16) & 0xff] ^
|
||||||
|
Te2[(t2 >> 8) & 0xff] ^
|
||||||
|
Te3[(t3 ) & 0xff] ^
|
||||||
|
rk[p+0])
|
||||||
|
s1 = (
|
||||||
|
Te0[(t1 >> 24) ] ^
|
||||||
|
Te1[(t2 >> 16) & 0xff] ^
|
||||||
|
Te2[(t3 >> 8) & 0xff] ^
|
||||||
|
Te3[(t0 ) & 0xff] ^
|
||||||
|
rk[p+1])
|
||||||
|
s2 = (
|
||||||
|
Te0[(t2 >> 24) ] ^
|
||||||
|
Te1[(t3 >> 16) & 0xff] ^
|
||||||
|
Te2[(t0 >> 8) & 0xff] ^
|
||||||
|
Te3[(t1 ) & 0xff] ^
|
||||||
|
rk[p+2])
|
||||||
|
s3 = (
|
||||||
|
Te0[(t3 >> 24) ] ^
|
||||||
|
Te1[(t0 >> 16) & 0xff] ^
|
||||||
|
Te2[(t1 >> 8) & 0xff] ^
|
||||||
|
Te3[(t2 ) & 0xff] ^
|
||||||
|
rk[p+3])
|
||||||
|
|
||||||
|
ciphertext = ''
|
||||||
|
|
||||||
|
# apply last round and
|
||||||
|
# map cipher state to byte array block:
|
||||||
s0 = (
|
s0 = (
|
||||||
Te0[(t0 >> 24) ] ^
|
(Te4[(t0 >> 24) ] & 0xff000000) ^
|
||||||
Te1[(t1 >> 16) & 0xff] ^
|
(Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Te2[(t2 >> 8) & 0xff] ^
|
(Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Te3[(t3 ) & 0xff] ^
|
(Te4[(t3 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+0])
|
rk[p+0])
|
||||||
|
ciphertext += PUTU32(s0)
|
||||||
s1 = (
|
s1 = (
|
||||||
Te0[(t1 >> 24) ] ^
|
(Te4[(t1 >> 24) ] & 0xff000000) ^
|
||||||
Te1[(t2 >> 16) & 0xff] ^
|
(Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Te2[(t3 >> 8) & 0xff] ^
|
(Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Te3[(t0 ) & 0xff] ^
|
(Te4[(t0 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+1])
|
rk[p+1])
|
||||||
|
ciphertext += PUTU32(s1)
|
||||||
s2 = (
|
s2 = (
|
||||||
Te0[(t2 >> 24) ] ^
|
(Te4[(t2 >> 24) ] & 0xff000000) ^
|
||||||
Te1[(t3 >> 16) & 0xff] ^
|
(Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Te2[(t0 >> 8) & 0xff] ^
|
(Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Te3[(t1 ) & 0xff] ^
|
(Te4[(t1 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+2])
|
rk[p+2])
|
||||||
|
ciphertext += PUTU32(s2)
|
||||||
s3 = (
|
s3 = (
|
||||||
Te0[(t3 >> 24) ] ^
|
(Te4[(t3 >> 24) ] & 0xff000000) ^
|
||||||
Te1[(t0 >> 16) & 0xff] ^
|
(Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Te2[(t1 >> 8) & 0xff] ^
|
(Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Te3[(t2 ) & 0xff] ^
|
(Te4[(t2 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+3])
|
rk[p+3])
|
||||||
|
ciphertext += PUTU32(s3)
|
||||||
|
|
||||||
ciphertext = ''
|
assert len(ciphertext) == 16
|
||||||
|
return ciphertext
|
||||||
# apply last round and
|
|
||||||
# map cipher state to byte array block:
|
|
||||||
s0 = (
|
|
||||||
(Te4[(t0 >> 24) ] & 0xff000000) ^
|
|
||||||
(Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Te4[(t3 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+0])
|
|
||||||
ciphertext += PUTU32(s0)
|
|
||||||
s1 = (
|
|
||||||
(Te4[(t1 >> 24) ] & 0xff000000) ^
|
|
||||||
(Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Te4[(t0 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+1])
|
|
||||||
ciphertext += PUTU32(s1)
|
|
||||||
s2 = (
|
|
||||||
(Te4[(t2 >> 24) ] & 0xff000000) ^
|
|
||||||
(Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Te4[(t1 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+2])
|
|
||||||
ciphertext += PUTU32(s2)
|
|
||||||
s3 = (
|
|
||||||
(Te4[(t3 >> 24) ] & 0xff000000) ^
|
|
||||||
(Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Te4[(t2 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+3])
|
|
||||||
ciphertext += PUTU32(s3)
|
|
||||||
|
|
||||||
assert len(ciphertext) == 16
|
|
||||||
return ciphertext
|
|
||||||
|
|
||||||
|
|
||||||
def rijndaelDecrypt(rk, nrounds, ciphertext):
|
def rijndaelDecrypt(rk, nrounds, ciphertext):
|
||||||
assert len(ciphertext) == 16
|
assert len(ciphertext) == 16
|
||||||
|
|
||||||
# map byte array block to cipher state
|
# map byte array block to cipher state
|
||||||
# and add initial round key:
|
# and add initial round key:
|
||||||
s0 = GETU32(ciphertext[0:4]) ^ rk[0]
|
s0 = GETU32(ciphertext[0:4]) ^ rk[0]
|
||||||
s1 = GETU32(ciphertext[4:8]) ^ rk[1]
|
s1 = GETU32(ciphertext[4:8]) ^ rk[1]
|
||||||
s2 = GETU32(ciphertext[8:12]) ^ rk[2]
|
s2 = GETU32(ciphertext[8:12]) ^ rk[2]
|
||||||
s3 = GETU32(ciphertext[12:16]) ^ rk[3]
|
s3 = GETU32(ciphertext[12:16]) ^ rk[3]
|
||||||
|
|
||||||
# nrounds - 1 full rounds:
|
# nrounds - 1 full rounds:
|
||||||
r = nrounds >> 1
|
r = nrounds >> 1
|
||||||
p = 0
|
p = 0
|
||||||
while 1:
|
while 1:
|
||||||
t0 = (
|
t0 = (
|
||||||
Td0[(s0 >> 24) ] ^
|
Td0[(s0 >> 24) ] ^
|
||||||
Td1[(s3 >> 16) & 0xff] ^
|
Td1[(s3 >> 16) & 0xff] ^
|
||||||
Td2[(s2 >> 8) & 0xff] ^
|
Td2[(s2 >> 8) & 0xff] ^
|
||||||
Td3[(s1 ) & 0xff] ^
|
Td3[(s1 ) & 0xff] ^
|
||||||
rk[p+4])
|
rk[p+4])
|
||||||
t1 = (
|
t1 = (
|
||||||
Td0[(s1 >> 24) ] ^
|
Td0[(s1 >> 24) ] ^
|
||||||
Td1[(s0 >> 16) & 0xff] ^
|
Td1[(s0 >> 16) & 0xff] ^
|
||||||
Td2[(s3 >> 8) & 0xff] ^
|
Td2[(s3 >> 8) & 0xff] ^
|
||||||
Td3[(s2 ) & 0xff] ^
|
Td3[(s2 ) & 0xff] ^
|
||||||
rk[p+5])
|
rk[p+5])
|
||||||
t2 = (
|
t2 = (
|
||||||
Td0[(s2 >> 24) ] ^
|
Td0[(s2 >> 24) ] ^
|
||||||
Td1[(s1 >> 16) & 0xff] ^
|
Td1[(s1 >> 16) & 0xff] ^
|
||||||
Td2[(s0 >> 8) & 0xff] ^
|
Td2[(s0 >> 8) & 0xff] ^
|
||||||
Td3[(s3 ) & 0xff] ^
|
Td3[(s3 ) & 0xff] ^
|
||||||
rk[p+6])
|
rk[p+6])
|
||||||
t3 = (
|
t3 = (
|
||||||
Td0[(s3 >> 24) ] ^
|
Td0[(s3 >> 24) ] ^
|
||||||
Td1[(s2 >> 16) & 0xff] ^
|
Td1[(s2 >> 16) & 0xff] ^
|
||||||
Td2[(s1 >> 8) & 0xff] ^
|
Td2[(s1 >> 8) & 0xff] ^
|
||||||
Td3[(s0 ) & 0xff] ^
|
Td3[(s0 ) & 0xff] ^
|
||||||
rk[p+7])
|
rk[p+7])
|
||||||
p += 8
|
p += 8
|
||||||
r -= 1
|
r -= 1
|
||||||
if r == 0: break
|
if r == 0: break
|
||||||
|
s0 = (
|
||||||
|
Td0[(t0 >> 24) ] ^
|
||||||
|
Td1[(t3 >> 16) & 0xff] ^
|
||||||
|
Td2[(t2 >> 8) & 0xff] ^
|
||||||
|
Td3[(t1 ) & 0xff] ^
|
||||||
|
rk[p+0])
|
||||||
|
s1 = (
|
||||||
|
Td0[(t1 >> 24) ] ^
|
||||||
|
Td1[(t0 >> 16) & 0xff] ^
|
||||||
|
Td2[(t3 >> 8) & 0xff] ^
|
||||||
|
Td3[(t2 ) & 0xff] ^
|
||||||
|
rk[p+1])
|
||||||
|
s2 = (
|
||||||
|
Td0[(t2 >> 24) ] ^
|
||||||
|
Td1[(t1 >> 16) & 0xff] ^
|
||||||
|
Td2[(t0 >> 8) & 0xff] ^
|
||||||
|
Td3[(t3 ) & 0xff] ^
|
||||||
|
rk[p+2])
|
||||||
|
s3 = (
|
||||||
|
Td0[(t3 >> 24) ] ^
|
||||||
|
Td1[(t2 >> 16) & 0xff] ^
|
||||||
|
Td2[(t1 >> 8) & 0xff] ^
|
||||||
|
Td3[(t0 ) & 0xff] ^
|
||||||
|
rk[p+3])
|
||||||
|
|
||||||
|
plaintext = ''
|
||||||
|
|
||||||
|
# apply last round and
|
||||||
|
# map cipher state to byte array block:
|
||||||
s0 = (
|
s0 = (
|
||||||
Td0[(t0 >> 24) ] ^
|
(Td4[(t0 >> 24) ] & 0xff000000) ^
|
||||||
Td1[(t3 >> 16) & 0xff] ^
|
(Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Td2[(t2 >> 8) & 0xff] ^
|
(Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Td3[(t1 ) & 0xff] ^
|
(Td4[(t1 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+0])
|
rk[p+0])
|
||||||
|
plaintext += PUTU32(s0)
|
||||||
s1 = (
|
s1 = (
|
||||||
Td0[(t1 >> 24) ] ^
|
(Td4[(t1 >> 24) ] & 0xff000000) ^
|
||||||
Td1[(t0 >> 16) & 0xff] ^
|
(Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Td2[(t3 >> 8) & 0xff] ^
|
(Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Td3[(t2 ) & 0xff] ^
|
(Td4[(t2 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+1])
|
rk[p+1])
|
||||||
|
plaintext += PUTU32(s1)
|
||||||
s2 = (
|
s2 = (
|
||||||
Td0[(t2 >> 24) ] ^
|
(Td4[(t2 >> 24) ] & 0xff000000) ^
|
||||||
Td1[(t1 >> 16) & 0xff] ^
|
(Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Td2[(t0 >> 8) & 0xff] ^
|
(Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Td3[(t3 ) & 0xff] ^
|
(Td4[(t3 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+2])
|
rk[p+2])
|
||||||
|
plaintext += PUTU32(s2)
|
||||||
s3 = (
|
s3 = (
|
||||||
Td0[(t3 >> 24) ] ^
|
(Td4[(t3 >> 24) ] & 0xff000000) ^
|
||||||
Td1[(t2 >> 16) & 0xff] ^
|
(Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
|
||||||
Td2[(t1 >> 8) & 0xff] ^
|
(Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
|
||||||
Td3[(t0 ) & 0xff] ^
|
(Td4[(t0 ) & 0xff] & 0x000000ff) ^
|
||||||
rk[p+3])
|
rk[p+3])
|
||||||
|
plaintext += PUTU32(s3)
|
||||||
|
|
||||||
plaintext = ''
|
assert len(plaintext) == 16
|
||||||
|
return plaintext
|
||||||
# apply last round and
|
|
||||||
# map cipher state to byte array block:
|
|
||||||
s0 = (
|
|
||||||
(Td4[(t0 >> 24) ] & 0xff000000) ^
|
|
||||||
(Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Td4[(t1 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+0])
|
|
||||||
plaintext += PUTU32(s0)
|
|
||||||
s1 = (
|
|
||||||
(Td4[(t1 >> 24) ] & 0xff000000) ^
|
|
||||||
(Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Td4[(t2 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+1])
|
|
||||||
plaintext += PUTU32(s1)
|
|
||||||
s2 = (
|
|
||||||
(Td4[(t2 >> 24) ] & 0xff000000) ^
|
|
||||||
(Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Td4[(t3 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+2])
|
|
||||||
plaintext += PUTU32(s2)
|
|
||||||
s3 = (
|
|
||||||
(Td4[(t3 >> 24) ] & 0xff000000) ^
|
|
||||||
(Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
|
|
||||||
(Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
|
|
||||||
(Td4[(t0 ) & 0xff] & 0x000000ff) ^
|
|
||||||
rk[p+3])
|
|
||||||
plaintext += PUTU32(s3)
|
|
||||||
|
|
||||||
assert len(plaintext) == 16
|
|
||||||
return plaintext
|
|
||||||
|
|
||||||
|
|
||||||
# decrypt(key, fin, fout, keybits=256)
|
# decrypt(key, fin, fout, keybits=256)
|
||||||
class RijndaelDecryptor(object):
|
class RijndaelDecryptor(object):
|
||||||
|
|
||||||
def __init__(self, key, keybits=256):
|
|
||||||
assert len(key) == KEYLENGTH(keybits)
|
|
||||||
(self.rk, self.nrounds) = rijndaelSetupDecrypt(key, keybits)
|
|
||||||
assert len(self.rk) == RKLENGTH(keybits)
|
|
||||||
assert self.nrounds == NROUNDS(keybits)
|
|
||||||
return
|
|
||||||
|
|
||||||
def decrypt(self, ciphertext):
|
def __init__(self, key, keybits=256):
|
||||||
assert len(ciphertext) == 16
|
assert len(key) == KEYLENGTH(keybits)
|
||||||
return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
|
(self.rk, self.nrounds) = rijndaelSetupDecrypt(key, keybits)
|
||||||
|
assert len(self.rk) == RKLENGTH(keybits)
|
||||||
|
assert self.nrounds == NROUNDS(keybits)
|
||||||
|
return
|
||||||
|
|
||||||
|
def decrypt(self, ciphertext):
|
||||||
|
assert len(ciphertext) == 16
|
||||||
|
return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
|
||||||
|
|
||||||
# encrypt(key, fin, fout, keybits=256)
|
# encrypt(key, fin, fout, keybits=256)
|
||||||
class RijndaelEncryptor(object):
|
class RijndaelEncryptor(object):
|
||||||
|
|
||||||
def __init__(self, key, keybits=256):
|
def __init__(self, key, keybits=256):
|
||||||
assert len(key) == KEYLENGTH(keybits)
|
assert len(key) == KEYLENGTH(keybits)
|
||||||
(self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits)
|
(self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits)
|
||||||
assert len(self.rk) == RKLENGTH(keybits)
|
assert len(self.rk) == RKLENGTH(keybits)
|
||||||
assert self.nrounds == NROUNDS(keybits)
|
assert self.nrounds == NROUNDS(keybits)
|
||||||
return
|
return
|
||||||
|
|
||||||
def encrypt(self, plaintext):
|
def encrypt(self, plaintext):
|
||||||
assert len(plaintext) == 16
|
assert len(plaintext) == 16
|
||||||
return rijndaelEncrypt(self.rk, self.nrounds, plaintext)
|
return rijndaelEncrypt(self.rk, self.nrounds, plaintext)
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
# test
|
# test
|
||||||
key = '00010203050607080A0B0C0D0F101112'.decode('hex')
|
key = '00010203050607080A0B0C0D0F101112'.decode('hex')
|
||||||
plaintext = '506812A45F08C889B97F5980038B8359'.decode('hex')
|
plaintext = '506812A45F08C889B97F5980038B8359'.decode('hex')
|
||||||
ciphertext = 'D8F532538289EF7D06B506A4FD5BE9C9'.decode('hex')
|
ciphertext = 'D8F532538289EF7D06B506A4FD5BE9C9'.decode('hex')
|
||||||
e = RijndaelEncryptor(key, 128)
|
e = RijndaelEncryptor(key, 128)
|
||||||
text = e.encrypt(plaintext)
|
text = e.encrypt(plaintext)
|
||||||
assert text == ciphertext
|
assert text == ciphertext
|
||||||
d = RijndaelDecryptor(key, 128)
|
d = RijndaelDecryptor(key, 128)
|
||||||
text = d.decrypt(ciphertext)
|
text = d.decrypt(ciphertext)
|
||||||
assert text == plaintext
|
assert text == plaintext
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -7,21 +7,21 @@ from struct import unpack
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||||
'''Multiplies two matrices.'''
|
'''Multiplies two matrices.'''
|
||||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
||||||
|
|
||||||
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
||||||
'''Applies a matrix to a point.'''
|
'''Applies a matrix to a point.'''
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
'''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
'''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
|
||||||
return (a*p+c*q, b*p+d*q)
|
return (a*p+c*q, b*p+d*q)
|
||||||
|
|
||||||
|
|
||||||
## Utility functions
|
## Utility functions
|
||||||
|
@ -29,62 +29,62 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
|
|
||||||
# pick
|
# pick
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(seq, func, maxobj=None):
|
||||||
'''Picks the object that has the highest value of func(obj).'''
|
'''Picks the object that has the highest value of func(obj).'''
|
||||||
maxscore = None
|
maxscore = None
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
score = func(obj)
|
score = func(obj)
|
||||||
if maxscore == None or maxscore < score:
|
if maxscore == None or maxscore < score:
|
||||||
(maxscore,maxobj) = (score,obj)
|
(maxscore,maxobj) = (score,obj)
|
||||||
return maxobj
|
return maxobj
|
||||||
|
|
||||||
# bsearch
|
# bsearch
|
||||||
def bsearch(objs, v0):
|
def bsearch(objs, v0):
|
||||||
'''Tries to find the closest value to v0.'''
|
'''Tries to find the closest value to v0.'''
|
||||||
i0 = 0
|
i0 = 0
|
||||||
i1 = len(objs)
|
i1 = len(objs)
|
||||||
while i0 < i1:
|
while i0 < i1:
|
||||||
i = (i0+i1)/2
|
i = (i0+i1)/2
|
||||||
(v, obj) = objs[i]
|
(v, obj) = objs[i]
|
||||||
if v0 == v:
|
if v0 == v:
|
||||||
(i0,i1) = (i,i+1)
|
(i0,i1) = (i,i+1)
|
||||||
while 0 < i0 and objs[i0-1][0] == v0:
|
while 0 < i0 and objs[i0-1][0] == v0:
|
||||||
i0 -= 1
|
i0 -= 1
|
||||||
while i1 < len(objs)-1 and objs[i1][0] == v0:
|
while i1 < len(objs)-1 and objs[i1][0] == v0:
|
||||||
i1 += 1
|
i1 += 1
|
||||||
break
|
break
|
||||||
elif v0 < v:
|
elif v0 < v:
|
||||||
i1 = i
|
i1 = i
|
||||||
else:
|
else:
|
||||||
i0 = i+1
|
i0 = i+1
|
||||||
return (i0,i1)
|
return (i0,i1)
|
||||||
|
|
||||||
# choplist
|
# choplist
|
||||||
def choplist(n, seq):
|
def choplist(n, seq):
|
||||||
'''Groups every n elements of the list.'''
|
'''Groups every n elements of the list.'''
|
||||||
r = []
|
r = []
|
||||||
for x in seq:
|
for x in seq:
|
||||||
r.append(x)
|
r.append(x)
|
||||||
if len(r) == n:
|
if len(r) == n:
|
||||||
yield tuple(r)
|
yield tuple(r)
|
||||||
r = []
|
r = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# nunpack
|
# nunpack
|
||||||
def nunpack(s, default=0):
|
def nunpack(s, default=0):
|
||||||
'''Unpacks up to 4 bytes big endian.'''
|
'''Unpacks up to 4 bytes big endian.'''
|
||||||
l = len(s)
|
l = len(s)
|
||||||
if not l:
|
if not l:
|
||||||
return default
|
return default
|
||||||
elif l == 1:
|
elif l == 1:
|
||||||
return ord(s)
|
return ord(s)
|
||||||
elif l == 2:
|
elif l == 2:
|
||||||
return unpack('>H', s)[0]
|
return unpack('>H', s)[0]
|
||||||
elif l == 3:
|
elif l == 3:
|
||||||
return unpack('>L', '\x00'+s)[0]
|
return unpack('>L', '\x00'+s)[0]
|
||||||
elif l == 4:
|
elif l == 4:
|
||||||
return unpack('>L', s)[0]
|
return unpack('>L', s)[0]
|
||||||
else:
|
else:
|
||||||
return TypeError('invalid length: %d' % l)
|
return TypeError('invalid length: %d' % l)
|
||||||
|
|
||||||
# decode_text
|
# decode_text
|
||||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||||
|
@ -122,14 +122,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||||
))
|
))
|
||||||
def decode_text(s):
|
def decode_text(s):
|
||||||
'''Decodes a PDFDocEncoding string to Unicode.'''
|
'''Decodes a PDFDocEncoding string to Unicode.'''
|
||||||
if s.startswith('\xfe\xff'):
|
if s.startswith('\xfe\xff'):
|
||||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||||
else:
|
else:
|
||||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||||
|
|
||||||
# enc
|
# enc
|
||||||
def enc(x, codec='ascii'):
|
def enc(x, codec='ascii'):
|
||||||
'''Encodes a string for SGML/XML/HTML'''
|
'''Encodes a string for SGML/XML/HTML'''
|
||||||
x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
|
x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
return x.encode(codec, 'xmlcharrefreplace')
|
||||||
|
|
30
setup.py
30
setup.py
|
@ -3,10 +3,10 @@ from distutils.core import setup
|
||||||
from pdfminer import __version__
|
from pdfminer import __version__
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='pdfminer',
|
name='pdfminer',
|
||||||
version=__version__,
|
version=__version__,
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
long_description='''PDFMiner is a suite of programs that help
|
long_description='''PDFMiner is a suite of programs that help
|
||||||
extracting and analyzing text data of PDF documents.
|
extracting and analyzing text data of PDF documents.
|
||||||
Unlike other PDF-related tools, it allows to obtain
|
Unlike other PDF-related tools, it allows to obtain
|
||||||
the exact location of texts in a page, as well as
|
the exact location of texts in a page, as well as
|
||||||
|
@ -14,23 +14,23 @@ other extra information such as font information or ruled lines.
|
||||||
It includes a PDF converter that can transform PDF files
|
It includes a PDF converter that can transform PDF files
|
||||||
into other text formats (such as HTML). It has an extensible
|
into other text formats (such as HTML). It has an extensible
|
||||||
PDF parser that can be used for other purposes instead of text analysis.''',
|
PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
author='Yusuke Shinyama',
|
author='Yusuke Shinyama',
|
||||||
author_email='yusuke at cs dot nyu dot edu',
|
author_email='yusuke at cs dot nyu dot edu',
|
||||||
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
||||||
packages=[
|
packages=[
|
||||||
'pdfminer'
|
'pdfminer'
|
||||||
],
|
],
|
||||||
scripts=[
|
scripts=[
|
||||||
'tools/pdf2txt.py',
|
'tools/pdf2txt.py',
|
||||||
'tools/dumppdf.py'
|
'tools/dumppdf.py'
|
||||||
],
|
],
|
||||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 4 - Beta',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'Intended Audience :: Developers',
|
'Intended Audience :: Developers',
|
||||||
'Intended Audience :: Science/Research',
|
'Intended Audience :: Science/Research',
|
||||||
'License :: OSI Approved :: MIT License',
|
'License :: OSI Approved :: MIT License',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,38 +5,38 @@ stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
fonts = {}
|
fonts = {}
|
||||||
for line in fileinput.input():
|
for line in fileinput.input():
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if not f: continue
|
if not f: continue
|
||||||
k = f[0]
|
k = f[0]
|
||||||
if k == 'FontName':
|
if k == 'FontName':
|
||||||
fontname = f[1]
|
fontname = f[1]
|
||||||
props = {'FontName': fontname, 'Flags': 0}
|
props = {'FontName': fontname, 'Flags': 0}
|
||||||
chars = {}
|
chars = {}
|
||||||
fonts[fontname] = (props, chars)
|
fonts[fontname] = (props, chars)
|
||||||
elif k == 'C':
|
elif k == 'C':
|
||||||
cid = int(f[1])
|
cid = int(f[1])
|
||||||
if 0 <= cid and cid <= 255:
|
if 0 <= cid and cid <= 255:
|
||||||
width = int(f[4])
|
width = int(f[4])
|
||||||
chars[cid] = width
|
chars[cid] = width
|
||||||
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
|
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
|
||||||
'Ascender', 'Descender'):
|
'Ascender', 'Descender'):
|
||||||
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
|
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
|
||||||
props[k] = float(f[1])
|
props[k] = float(f[1])
|
||||||
elif k in ('FontName', 'FamilyName', 'Weight'):
|
elif k in ('FontName', 'FamilyName', 'Weight'):
|
||||||
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
|
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
|
||||||
props[k] = f[1]
|
props[k] = f[1]
|
||||||
elif k == 'IsFixedPitch':
|
elif k == 'IsFixedPitch':
|
||||||
if f[1].lower() == 'true':
|
if f[1].lower() == 'true':
|
||||||
props['Flags'] = 64
|
props['Flags'] = 64
|
||||||
elif k == 'FontBBox':
|
elif k == 'FontBBox':
|
||||||
props[k] = tuple(map(float, f[1:5]))
|
props[k] = tuple(map(float, f[1:5]))
|
||||||
print '# -*- python -*-'
|
print '# -*- python -*-'
|
||||||
print 'FONT_METRICS = {'
|
print 'FONT_METRICS = {'
|
||||||
for (fontname,(props,chars)) in fonts.iteritems():
|
for (fontname,(props,chars)) in fonts.iteritems():
|
||||||
print ' %r: %r,' % (fontname, (props,chars))
|
print ' %r: %r,' % (fontname, (props,chars))
|
||||||
print '}'
|
print '}'
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
276
tools/dumppdf.py
276
tools/dumppdf.py
|
@ -13,173 +13,173 @@ from pdfminer.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolv
|
||||||
|
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
|
||||||
def esc(s):
|
def esc(s):
|
||||||
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
||||||
|
|
||||||
|
|
||||||
# dumpxml
|
# dumpxml
|
||||||
def dumpxml(out, obj, codec=None):
|
def dumpxml(out, obj, codec=None):
|
||||||
if isinstance(obj, dict):
|
if isinstance(obj, dict):
|
||||||
out.write('<dict size="%d">\n' % len(obj))
|
out.write('<dict size="%d">\n' % len(obj))
|
||||||
for (k,v) in obj.iteritems():
|
for (k,v) in obj.iteritems():
|
||||||
out.write('<key>%s</key>\n' % k)
|
out.write('<key>%s</key>\n' % k)
|
||||||
out.write('<value>')
|
out.write('<value>')
|
||||||
dumpxml(out, v)
|
dumpxml(out, v)
|
||||||
out.write('</value>\n')
|
out.write('</value>\n')
|
||||||
out.write('</dict>')
|
out.write('</dict>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, list):
|
if isinstance(obj, list):
|
||||||
out.write('<list size="%d">\n' % len(obj))
|
out.write('<list size="%d">\n' % len(obj))
|
||||||
for v in obj:
|
for v in obj:
|
||||||
dumpxml(out, v)
|
dumpxml(out, v)
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
out.write('</list>')
|
out.write('</list>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, str):
|
if isinstance(obj, str):
|
||||||
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
out.write('<stream>\n<props>\n')
|
out.write('<stream>\n<props>\n')
|
||||||
dumpxml(out, obj.dic)
|
dumpxml(out, obj.dic)
|
||||||
out.write('\n</props>\n')
|
out.write('\n</props>\n')
|
||||||
if codec == 'text':
|
if codec == 'text':
|
||||||
data = obj.get_data()
|
data = obj.get_data()
|
||||||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||||
out.write('</stream>')
|
out.write('</stream>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFObjRef):
|
if isinstance(obj, PDFObjRef):
|
||||||
out.write('<ref id="%d"/>' % obj.objid)
|
out.write('<ref id="%d"/>' % obj.objid)
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PSKeyword):
|
if isinstance(obj, PSKeyword):
|
||||||
out.write('<keyword>%s</keyword>' % obj.name)
|
out.write('<keyword>%s</keyword>' % obj.name)
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PSLiteral):
|
if isinstance(obj, PSLiteral):
|
||||||
out.write('<literal>%s</literal>' % obj.name)
|
out.write('<literal>%s</literal>' % obj.name)
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
out.write('<number>%s</number>' % obj)
|
out.write('<number>%s</number>' % obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
raise TypeError(obj)
|
raise TypeError(obj)
|
||||||
|
|
||||||
# dumptrailers
|
# dumptrailers
|
||||||
def dumptrailers(out, doc):
|
def dumptrailers(out, doc):
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
out.write('<trailer>\n')
|
out.write('<trailer>\n')
|
||||||
dumpxml(out, xref.trailer)
|
dumpxml(out, xref.trailer)
|
||||||
out.write('\n</trailer>\n\n')
|
out.write('\n</trailer>\n\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumpallobjs
|
# dumpallobjs
|
||||||
def dumpallobjs(out, doc, codec=None):
|
def dumpallobjs(out, doc, codec=None):
|
||||||
out.write('<pdf>')
|
out.write('<pdf>')
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
for objid in xref.objids():
|
for objid in xref.objids():
|
||||||
try:
|
try:
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
if obj == None: continue
|
if obj == None: continue
|
||||||
out.write('<object id="%d">\n' % objid)
|
out.write('<object id="%d">\n' % objid)
|
||||||
dumpxml(out, obj, codec=codec)
|
dumpxml(out, obj, codec=codec)
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except:
|
except:
|
||||||
raise
|
raise
|
||||||
dumptrailers(out, doc)
|
dumptrailers(out, doc)
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumpoutline
|
# dumpoutline
|
||||||
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||||
for (level,title,dest,a,se) in doc.get_outlines():
|
for (level,title,dest,a,se) in doc.get_outlines():
|
||||||
pageno = None
|
pageno = None
|
||||||
if dest:
|
if dest:
|
||||||
dest = resolve1( doc.lookup_name('Dests', dest) )
|
dest = resolve1( doc.lookup_name('Dests', dest) )
|
||||||
if isinstance(dest, dict):
|
if isinstance(dest, dict):
|
||||||
dest = dest['D']
|
dest = dest['D']
|
||||||
pageno = pages[dest[0].objid]
|
pageno = pages[dest[0].objid]
|
||||||
outfp.write(repr((level,title,dest,pageno))+'\n')
|
outfp.write(repr((level,title,dest,pageno))+'\n')
|
||||||
parser.close()
|
parser.close()
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if objids:
|
if objids:
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
if isinstance(obj, PDFStream) and codec == 'raw':
|
if isinstance(obj, PDFStream) and codec == 'raw':
|
||||||
outfp.write(obj.get_rawdata())
|
outfp.write(obj.get_rawdata())
|
||||||
elif isinstance(obj, PDFStream) and codec == 'binary':
|
elif isinstance(obj, PDFStream) and codec == 'binary':
|
||||||
outfp.write(obj.get_data())
|
outfp.write(obj.get_data())
|
||||||
else:
|
else:
|
||||||
dumpxml(outfp, obj, codec=codec)
|
dumpxml(outfp, obj, codec=codec)
|
||||||
if pagenos:
|
if pagenos:
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if pageno in pagenos:
|
if pageno in pagenos:
|
||||||
dumpxml(outfp, page.attrs)
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpallobjs(outfp, doc, codec=codec)
|
dumpallobjs(outfp, doc, codec=codec)
|
||||||
if (not objids) and (not pagenos) and (not dumpall):
|
if (not objids) and (not pagenos) and (not dumpall):
|
||||||
dumptrailers(outfp, doc)
|
dumptrailers(outfp, doc)
|
||||||
fp.close()
|
fp.close()
|
||||||
if codec not in ('raw','binary'):
|
if codec not in ('raw','binary'):
|
||||||
outfp.write('\n')
|
outfp.write('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
|
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
|
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
debug = 0
|
debug = 0
|
||||||
objids = []
|
objids = []
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
codec = None
|
codec = None
|
||||||
password = ''
|
password = ''
|
||||||
dumpall = False
|
dumpall = False
|
||||||
proc = dumppdf
|
proc = dumppdf
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-a': dumpall = True
|
elif k == '-a': dumpall = True
|
||||||
elif k == '-r': codec = 'raw'
|
elif k == '-r': codec = 'raw'
|
||||||
elif k == '-b': codec = 'binary'
|
elif k == '-b': codec = 'binary'
|
||||||
elif k == '-t': codec = 'text'
|
elif k == '-t': codec = 'text'
|
||||||
elif k == '-T': proc = dumpoutline
|
elif k == '-T': proc = dumpoutline
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
#
|
#
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
#
|
#
|
||||||
for fname in args:
|
for fname in args:
|
||||||
proc(outfp, fname, objids, pagenos, password=password,
|
proc(outfp, fname, objids, pagenos, password=password,
|
||||||
dumpall=dumpall, codec=codec)
|
dumpall=dumpall, codec=codec)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
# $ mkdir CGIDIR
|
# $ mkdir CGIDIR
|
||||||
# $ mkdir CGIDIR/var
|
# $ mkdir CGIDIR/var
|
||||||
# $ cp -a pdfminer/pdflib CGIDIR
|
# $ cp -a pdfminer/pdflib CGIDIR
|
||||||
# $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
|
# $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
|
||||||
#
|
#
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
@ -27,16 +27,16 @@ from pdfminer.cmap import CMapDB
|
||||||
|
|
||||||
# quote HTML metacharacters
|
# quote HTML metacharacters
|
||||||
def q(x):
|
def q(x):
|
||||||
return x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
|
return x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
|
||||||
|
|
||||||
# encode parameters as a URL
|
# encode parameters as a URL
|
||||||
Q = re.compile(r'[^a-zA-Z0-9_.-=]')
|
Q = re.compile(r'[^a-zA-Z0-9_.-=]')
|
||||||
def url(base, **kw):
|
def url(base, **kw):
|
||||||
r = []
|
r = []
|
||||||
for (k,v) in kw.iteritems():
|
for (k,v) in kw.iteritems():
|
||||||
v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
|
v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
|
||||||
r.append('%s=%s' % (k, v))
|
r.append('%s=%s' % (k, v))
|
||||||
return base+'&'.join(r)
|
return base+'&'.join(r)
|
||||||
|
|
||||||
|
|
||||||
## convert
|
## convert
|
||||||
|
@ -44,156 +44,156 @@ def url(base, **kw):
|
||||||
class FileSizeExceeded(ValueError): pass
|
class FileSizeExceeded(ValueError): pass
|
||||||
def convert(outfp, infp, path, codec='utf-8', maxpages=10,
|
def convert(outfp, infp, path, codec='utf-8', maxpages=10,
|
||||||
maxfilesize=5000000, pagenos=None, html=True):
|
maxfilesize=5000000, pagenos=None, html=True):
|
||||||
# save the input file.
|
# save the input file.
|
||||||
src = file(path, 'wb')
|
src = file(path, 'wb')
|
||||||
nbytes = 0
|
nbytes = 0
|
||||||
while 1:
|
while 1:
|
||||||
data = infp.read(4096)
|
data = infp.read(4096)
|
||||||
nbytes += len(data)
|
nbytes += len(data)
|
||||||
if maxfilesize and maxfilesize < nbytes:
|
if maxfilesize and maxfilesize < nbytes:
|
||||||
raise FileSizeExceeded(maxfilesize)
|
raise FileSizeExceeded(maxfilesize)
|
||||||
if not data: break
|
if not data: break
|
||||||
src.write(data)
|
src.write(data)
|
||||||
src.close()
|
src.close()
|
||||||
infp.close()
|
infp.close()
|
||||||
# perform conversion and
|
# perform conversion and
|
||||||
# send the results over the network.
|
# send the results over the network.
|
||||||
CMapDB.initialize()
|
CMapDB.initialize()
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
if html:
|
if html:
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
else:
|
else:
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
fp = file(path, 'rb')
|
fp = file(path, 'rb')
|
||||||
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
|
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDF2HTMLApp
|
## PDF2HTMLApp
|
||||||
##
|
##
|
||||||
class PDF2HTMLApp(object):
|
class PDF2HTMLApp(object):
|
||||||
|
|
||||||
APPURL = '/convert'
|
APPURL = '/convert'
|
||||||
TMPDIR = './var/'
|
TMPDIR = './var/'
|
||||||
LOGPATH = './var/log'
|
LOGPATH = './var/log'
|
||||||
MAXFILESIZE = 5000000
|
MAXFILESIZE = 5000000
|
||||||
MAXPAGES = 10
|
MAXPAGES = 10
|
||||||
|
|
||||||
def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
|
|
||||||
self.outfp = outfp
|
|
||||||
self.codec = codec
|
|
||||||
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
|
|
||||||
level=loglevel, filename=logpath, filemode='a')
|
|
||||||
self.remote_addr = os.environ.get('REMOTE_ADDR')
|
|
||||||
self.path_info = os.environ.get('PATH_INFO')
|
|
||||||
self.method = os.environ.get('REQUEST_METHOD', 'GET')
|
|
||||||
self.server = os.environ.get('SERVER_SOFTWARE', '')
|
|
||||||
self.content_type = 'text/html; charset=%s' % codec
|
|
||||||
self.cur_time = time.time()
|
|
||||||
self.form = cgi.FieldStorage()
|
|
||||||
return
|
|
||||||
|
|
||||||
def put(self, *args):
|
def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
|
||||||
for x in args:
|
self.outfp = outfp
|
||||||
if isinstance(x, str):
|
self.codec = codec
|
||||||
self.outfp.write(x)
|
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
|
||||||
elif isinstance(x, unicode):
|
level=loglevel, filename=logpath, filemode='a')
|
||||||
self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
|
self.remote_addr = os.environ.get('REMOTE_ADDR')
|
||||||
return
|
self.path_info = os.environ.get('PATH_INFO')
|
||||||
|
self.method = os.environ.get('REQUEST_METHOD', 'GET')
|
||||||
|
self.server = os.environ.get('SERVER_SOFTWARE', '')
|
||||||
|
self.content_type = 'text/html; charset=%s' % codec
|
||||||
|
self.cur_time = time.time()
|
||||||
|
self.form = cgi.FieldStorage()
|
||||||
|
return
|
||||||
|
|
||||||
def http_200(self):
|
def put(self, *args):
|
||||||
if self.server.startswith('cgi-httpd'):
|
for x in args:
|
||||||
# required for cgi-httpd
|
if isinstance(x, str):
|
||||||
self.outfp.write('HTTP/1.0 200 OK\r\n')
|
self.outfp.write(x)
|
||||||
self.outfp.write('Content-type: %s\r\n' % self.content_type)
|
elif isinstance(x, unicode):
|
||||||
self.outfp.write('Connection: close\r\n\r\n')
|
self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def http_404(self):
|
|
||||||
if self.server.startswith('cgi-httpd'):
|
|
||||||
# required for cgi-httpd
|
|
||||||
self.outfp.write('HTTP/1.0 404 Not Found\r\n')
|
|
||||||
self.outfp.write('Content-type: text/html\r\n')
|
|
||||||
self.outfp.write('Connection: close\r\n\r\n')
|
|
||||||
self.outfp.write('<html><body>page does not exist</body></body>\n')
|
|
||||||
return
|
|
||||||
|
|
||||||
def http_301(self, url):
|
|
||||||
if self.server.startswith('cgi-httpd'):
|
|
||||||
# required for cgi-httpd
|
|
||||||
self.outfp.write('HTTP/1.0 301 Moved\r\n')
|
|
||||||
self.outfp.write('Location: %s\r\n\r\n' % url)
|
|
||||||
return
|
|
||||||
|
|
||||||
def coverpage(self):
|
def http_200(self):
|
||||||
self.put(
|
if self.server.startswith('cgi-httpd'):
|
||||||
'<html><head><title>pdf2html demo</title></head><body>\n',
|
# required for cgi-httpd
|
||||||
'<h1>pdf2html demo</h1><hr>\n',
|
self.outfp.write('HTTP/1.0 200 OK\r\n')
|
||||||
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
|
self.outfp.write('Content-type: %s\r\n' % self.content_type)
|
||||||
'<p>Upload PDF File: <input name="f" type="file" value="">\n',
|
self.outfp.write('Connection: close\r\n\r\n')
|
||||||
' Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
|
return
|
||||||
'<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
|
|
||||||
'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
|
|
||||||
'<p><input type="submit" name="c" value="Convert to HTML">\n',
|
|
||||||
'<input type="submit" name="c" value="Convert to TEXT">\n',
|
|
||||||
'<input type="reset" value="Reset">\n',
|
|
||||||
'</form><hr>\n',
|
|
||||||
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
|
|
||||||
'</body></html>\n',
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
def run(self, argv):
|
def http_404(self):
|
||||||
if self.path_info == '/':
|
if self.server.startswith('cgi-httpd'):
|
||||||
self.http_200()
|
# required for cgi-httpd
|
||||||
self.coverpage()
|
self.outfp.write('HTTP/1.0 404 Not Found\r\n')
|
||||||
return
|
self.outfp.write('Content-type: text/html\r\n')
|
||||||
if self.path_info != self.APPURL:
|
self.outfp.write('Connection: close\r\n\r\n')
|
||||||
self.http_404()
|
self.outfp.write('<html><body>page does not exist</body></body>\n')
|
||||||
return
|
return
|
||||||
if not os.path.isdir(self.TMPDIR):
|
|
||||||
self.bummer('error')
|
def http_301(self, url):
|
||||||
return
|
if self.server.startswith('cgi-httpd'):
|
||||||
if 'f' not in self.form:
|
# required for cgi-httpd
|
||||||
self.http_301('/')
|
self.outfp.write('HTTP/1.0 301 Moved\r\n')
|
||||||
return
|
self.outfp.write('Location: %s\r\n\r\n' % url)
|
||||||
if 'c' not in self.form:
|
return
|
||||||
self.http_301('/')
|
|
||||||
return
|
def coverpage(self):
|
||||||
item = self.form['f']
|
self.put(
|
||||||
if not (item.file and item.filename):
|
'<html><head><title>pdf2html demo</title></head><body>\n',
|
||||||
self.http_301('/')
|
'<h1>pdf2html demo</h1><hr>\n',
|
||||||
return
|
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
|
||||||
cmd = self.form.getvalue('c')
|
'<p>Upload PDF File: <input name="f" type="file" value="">\n',
|
||||||
html = (cmd == 'Convert to HTML')
|
' Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
|
||||||
pagenos = []
|
'<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
|
||||||
if 'p' in self.form:
|
'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
|
||||||
for m in re.finditer(r'\d+', self.form.getvalue('p')):
|
'<p><input type="submit" name="c" value="Convert to HTML">\n',
|
||||||
|
'<input type="submit" name="c" value="Convert to TEXT">\n',
|
||||||
|
'<input type="reset" value="Reset">\n',
|
||||||
|
'</form><hr>\n',
|
||||||
|
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
|
||||||
|
'</body></html>\n',
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
def run(self, argv):
|
||||||
|
if self.path_info == '/':
|
||||||
|
self.http_200()
|
||||||
|
self.coverpage()
|
||||||
|
return
|
||||||
|
if self.path_info != self.APPURL:
|
||||||
|
self.http_404()
|
||||||
|
return
|
||||||
|
if not os.path.isdir(self.TMPDIR):
|
||||||
|
self.bummer('error')
|
||||||
|
return
|
||||||
|
if 'f' not in self.form:
|
||||||
|
self.http_301('/')
|
||||||
|
return
|
||||||
|
if 'c' not in self.form:
|
||||||
|
self.http_301('/')
|
||||||
|
return
|
||||||
|
item = self.form['f']
|
||||||
|
if not (item.file and item.filename):
|
||||||
|
self.http_301('/')
|
||||||
|
return
|
||||||
|
cmd = self.form.getvalue('c')
|
||||||
|
html = (cmd == 'Convert to HTML')
|
||||||
|
pagenos = []
|
||||||
|
if 'p' in self.form:
|
||||||
|
for m in re.finditer(r'\d+', self.form.getvalue('p')):
|
||||||
|
try:
|
||||||
|
pagenos.append(int(m.group(0)))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
|
||||||
|
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
||||||
|
tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
|
||||||
try:
|
try:
|
||||||
pagenos.append(int(m.group(0)))
|
try:
|
||||||
except ValueError:
|
if not html:
|
||||||
pass
|
self.content_type = 'text/plain; charset=%s' % self.codec
|
||||||
logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
|
self.http_200()
|
||||||
h = abs(hash((random.random(), self.remote_addr, item.filename)))
|
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
|
||||||
tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
|
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
||||||
try:
|
except Exception, e:
|
||||||
try:
|
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
||||||
if not html:
|
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
|
||||||
self.content_type = 'text/plain; charset=%s' % self.codec
|
finally:
|
||||||
self.http_200()
|
try:
|
||||||
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
|
os.remove(tmppath)
|
||||||
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
|
except:
|
||||||
except Exception, e:
|
pass
|
||||||
self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
|
return
|
||||||
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
os.remove(tmppath)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
|
|
158
tools/pdf2txt.py
158
tools/pdf2txt.py
|
@ -9,85 +9,85 @@ from pdfminer.layout import LAParams
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||||
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
# debug option
|
# debug option
|
||||||
debug = 0
|
debug = 0
|
||||||
# path option
|
# path option
|
||||||
cmapdir = find_cmap_path()
|
cmapdir = find_cmap_path()
|
||||||
# input option
|
# input option
|
||||||
password = ''
|
password = ''
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
# output option
|
# output option
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
showpageno = True
|
showpageno = True
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-m': maxpages = int(v)
|
elif k == '-m': maxpages = int(v)
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
elif k == '-D': laparams.direction = v
|
elif k == '-D': laparams.direction = v
|
||||||
elif k == '-M': laparams.char_margin = float(v)
|
elif k == '-M': laparams.char_margin = float(v)
|
||||||
elif k == '-L': laparams.line_margin = float(v)
|
elif k == '-L': laparams.line_margin = float(v)
|
||||||
elif k == '-W': laparams.word_margin = float(v)
|
elif k == '-W': laparams.word_margin = float(v)
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
PDFResourceManager.debug = debug
|
PDFResourceManager.debug = debug
|
||||||
PDFDocument.debug = debug
|
PDFDocument.debug = debug
|
||||||
PDFParser.debug = debug
|
PDFParser.debug = debug
|
||||||
PDFPageInterpreter.debug = debug
|
PDFPageInterpreter.debug = debug
|
||||||
PDFDevice.debug = debug
|
PDFDevice.debug = debug
|
||||||
#
|
#
|
||||||
CMapDB.initialize(cmapdir)
|
CMapDB.initialize(cmapdir)
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
if not outtype:
|
if not outtype:
|
||||||
outtype = 'text'
|
outtype = 'text'
|
||||||
|
if outfile:
|
||||||
|
if outfile.endswith('.htm') or outfile.endswith('.html'):
|
||||||
|
outtype = 'html'
|
||||||
|
elif outfile.endswith('.sgml'):
|
||||||
|
outtype = 'sgml'
|
||||||
|
elif outfile.endswith('.tag'):
|
||||||
|
outtype = 'tag'
|
||||||
if outfile:
|
if outfile:
|
||||||
if outfile.endswith('.htm') or outfile.endswith('.html'):
|
outfp = file(outfile, 'w')
|
||||||
outtype = 'html'
|
else:
|
||||||
elif outfile.endswith('.sgml'):
|
outfp = sys.stdout
|
||||||
outtype = 'sgml'
|
if outtype == 'text':
|
||||||
elif outfile.endswith('.tag'):
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
outtype = 'tag'
|
elif outtype == 'sgml':
|
||||||
if outfile:
|
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
outfp = file(outfile, 'w')
|
elif outtype == 'html':
|
||||||
else:
|
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
||||||
outfp = sys.stdout
|
elif outtype == 'tag':
|
||||||
if outtype == 'text':
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
else:
|
||||||
elif outtype == 'sgml':
|
return usage()
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
for fname in args:
|
||||||
elif outtype == 'html':
|
fp = file(fname, 'rb')
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
|
||||||
elif outtype == 'tag':
|
fp.close()
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device.close()
|
||||||
else:
|
return
|
||||||
return usage()
|
|
||||||
for fname in args:
|
|
||||||
fp = file(fname, 'rb')
|
|
||||||
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
|
|
||||||
fp.close()
|
|
||||||
device.close()
|
|
||||||
return
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -2,29 +2,29 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
def prof_main(argv):
|
def prof_main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
import hotshot, hotshot.stats
|
import hotshot, hotshot.stats
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s module.function [args ...]' % argv[0]
|
print 'usage: %s module.function [args ...]' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
if len(args) < 1: return usage()
|
if len(args) < 1: return usage()
|
||||||
name = args.pop(0)
|
name = args.pop(0)
|
||||||
prof = name+'.prof'
|
prof = name+'.prof'
|
||||||
i = name.rindex('.')
|
i = name.rindex('.')
|
||||||
(modname, funcname) = (name[:i], name[i+1:])
|
(modname, funcname) = (name[:i], name[i+1:])
|
||||||
module = __import__(modname, fromlist=1)
|
module = __import__(modname, fromlist=1)
|
||||||
func = getattr(module, funcname)
|
func = getattr(module, funcname)
|
||||||
if args:
|
if args:
|
||||||
args.insert(0, argv[0])
|
args.insert(0, argv[0])
|
||||||
prof = hotshot.Profile(prof)
|
prof = hotshot.Profile(prof)
|
||||||
prof.runcall(lambda : func(args))
|
prof.runcall(lambda : func(args))
|
||||||
prof.close()
|
prof.close()
|
||||||
else:
|
else:
|
||||||
stats = hotshot.stats.load(prof)
|
stats = hotshot.stats.load(prof)
|
||||||
stats.strip_dirs()
|
stats.strip_dirs()
|
||||||
stats.sort_stats('time', 'calls')
|
stats.sort_stats('time', 'calls')
|
||||||
stats.print_stats(1000)
|
stats.print_stats(1000)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(prof_main(sys.argv))
|
if __name__ == '__main__': sys.exit(prof_main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue