to 4-space indentation

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-10-24 04:41:59 +00:00
parent a09b71d89d
commit 7790808560
24 changed files with 4953 additions and 4953 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Oct 24 12:42:25 JST 2009 Last Modified: Sat Oct 24 13:40:19 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -352,7 +352,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/10/24: Charspace bug fixed. <li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
<li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik. <li> 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik.
<li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries. <li> 2009/09/12: Fixed rectangle handling. Able to extract image boundaries.
<li> 2009/08/30: Fixed page rotation handling. <li> 2009/08/30: Fixed page rotation handling.

View File

@ -8,37 +8,37 @@
## Arcfour ## Arcfour
## ##
class Arcfour(object): class Arcfour(object):
def __init__(self, key):
s = range(256)
j = 0
klen = len(key)
for i in xrange(256):
j = (j + s[i] + ord(key[i % klen])) % 256
(s[i], s[j]) = (s[j], s[i])
self.s = s
(self.i, self.j) = (0, 0)
return
def process(self, data): def __init__(self, key):
(i, j) = (self.i, self.j) s = range(256)
s = self.s j = 0
r = '' klen = len(key)
for c in data: for i in xrange(256):
i = (i+1) % 256 j = (j + s[i] + ord(key[i % klen])) % 256
j = (j+s[i]) % 256 (s[i], s[j]) = (s[j], s[i])
(s[i], s[j]) = (s[j], s[i]) self.s = s
k = s[(s[i]+s[j]) % 256] (self.i, self.j) = (0, 0)
r += chr(ord(c) ^ k) return
(self.i, self.j) = (i, j)
return r def process(self, data):
(i, j) = (self.i, self.j)
s = self.s
r = ''
for c in data:
i = (i+1) % 256
j = (j+s[i]) % 256
(s[i], s[j]) = (s[j], s[i])
k = s[(s[i]+s[j]) % 256]
r += chr(ord(c) ^ k)
(self.i, self.j) = (i, j)
return r
# test # test
if __name__ == '__main__': if __name__ == '__main__':
def doit(key, data): def doit(key, data):
cipher = Arcfour(key) cipher = Arcfour(key)
return ''.join( '%02X' % ord(c) for c in cipher.process(data) ) return ''.join( '%02X' % ord(c) for c in cipher.process(data) )
assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3' assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3'
assert doit("Wiki", "pedia") == '1021BF0420' assert doit("Wiki", "pedia") == '1021BF0420'
assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5' assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5'
print 'test succeeded' print 'test succeeded'

View File

@ -6,72 +6,72 @@
# ascii85decode(data) # ascii85decode(data)
def ascii85decode(data): def ascii85decode(data):
import struct import struct
n = b = 0 n = b = 0
out = '' out = ''
for c in data: for c in data:
if '!' <= c and c <= 'u': if '!' <= c and c <= 'u':
n += 1 n += 1
b = b*85+(ord(c)-33) b = b*85+(ord(c)-33)
if n == 5: if n == 5:
out += struct.pack('>L',b) out += struct.pack('>L',b)
n = b = 0 n = b = 0
elif c == 'z': elif c == 'z':
assert n == 0 assert n == 0
out += '\0\0\0\0' out += '\0\0\0\0'
elif c == '~': elif c == '~':
if n: if n:
for _ in range(5-n): for _ in range(5-n):
b = b*85+84 b = b*85+84
out += struct.pack('>L',b)[:n-1] out += struct.pack('>L',b)[:n-1]
break break
return out return out
# asciihexdecode(data) # asciihexdecode(data)
def asciihexdecode(data): def asciihexdecode(data):
""" """
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
ASCIIHexDecode filter produces one byte of binary data. All white-space ASCIIHexDecode filter produces one byte of binary data. All white-space
characters are ignored. A right angle bracket character (>) indicates characters are ignored. A right angle bracket character (>) indicates
EOD. Any other characters will cause an error. If the filter encounters EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit. will behave as if a 0 followed the last digit.
>>> asciihexdecode("61 62 2e6364 65") >>> asciihexdecode("61 62 2e6364 65")
'ab.cde' 'ab.cde'
>>> asciihexdecode("61 62 2e6364 657>") >>> asciihexdecode("61 62 2e6364 657>")
'ab.cdep' 'ab.cdep'
>>> asciihexdecode("7>") >>> asciihexdecode("7>")
'p' 'p'
""" """
import re import re
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE) hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
decode = (lambda hx: chr(int(hx, 16))) decode = (lambda hx: chr(int(hx, 16)))
out = map(decode, hex_re.findall(data)) out = map(decode, hex_re.findall(data))
m = trail_re.search(data) m = trail_re.search(data)
if m: if m:
out.append(decode("%c0" % m.group(1))) out.append(decode("%c0" % m.group(1)))
return ''.join(out) return ''.join(out)
# test # test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 # sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__': if __name__ == '__main__':
orig = r''' orig = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!, 9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~> >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
''' '''
data = \ data = \
'Man is distinguished, not only by his reason, but by this singular passion from '\ 'Man is distinguished, not only by his reason, but by this singular passion from '\
'other animals, which is a lust of the mind, that by a perseverance of delight in the '\ 'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\ 'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
'any carnal pleasure.' 'any carnal pleasure.'
assert ascii85decode(orig) == data assert ascii85decode(orig) == data
print 'ascii85decode test succeeded' print 'ascii85decode test succeeded'
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -10,9 +10,9 @@ from pdfminer.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \ PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser PSStackParser
try: try:
import cdb import cdb
except ImportError: except ImportError:
import pdfminer.pycdb as cdb import pdfminer.pycdb as cdb
class CMapError(Exception): pass class CMapError(Exception): pass
@ -21,449 +21,449 @@ class CMapError(Exception): pass
## find_cmap_path ## find_cmap_path
## ##
def find_cmap_path(): def find_cmap_path():
try: try:
return os.environ['CMAP_PATH'] return os.environ['CMAP_PATH']
except KeyError: except KeyError:
pass pass
basedir = os.path.dirname(__file__) basedir = os.path.dirname(__file__)
return os.path.join(basedir, 'CMap') return os.path.join(basedir, 'CMap')
STRIP_NAME = re.compile(r'[0-9]+') STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name): def name2unicode(name):
if name in charname2unicode: if name in charname2unicode:
return charname2unicode[name] return charname2unicode[name]
m = STRIP_NAME.search(name) m = STRIP_NAME.search(name)
if not m: raise KeyError(name) if not m: raise KeyError(name)
return int(m.group(0)) return int(m.group(0))
## CMap ## CMap
## ##
class CMap(object): class CMap(object):
debug = 0 debug = 0
def __init__(self):
self.code2cid = {}
self.cid2code = {}
self.attrs = {}
return
def __repr__(self): def __init__(self):
return '<CMap: %s>' % self.attrs.get('CMapName') self.code2cid = {}
self.cid2code = {}
self.attrs = {}
return
def update(self, code2cid=None, cid2code=None): def __repr__(self):
if code2cid: return '<CMap: %s>' % self.attrs.get('CMapName')
self.code2cid.update(code2cid)
if cid2code:
self.cid2code.update(cid2code)
return self
def copycmap(self, cmap):
self.code2cid.update(cmap.getall_code2cid())
self.cid2code.update(cmap.getall_cid2code())
return self
def register_code2cid(self, code, cid): def update(self, code2cid=None, cid2code=None):
if isinstance(code, str) and isinstance(cid, int): if code2cid:
self.code2cid[code] = cid self.code2cid.update(code2cid)
return self if cid2code:
self.cid2code.update(cid2code)
return self
def register_cid2code(self, cid, code): def copycmap(self, cmap):
if isinstance(cid, int): self.code2cid.update(cmap.getall_code2cid())
if isinstance(code, PSLiteral): self.cid2code.update(cmap.getall_cid2code())
self.cid2code[cid] = pack('>H', name2unicode(code.name)) return self
elif isinstance(code, str):
self.cid2code[cid] = code
return self
def decode(self, bytes): def register_code2cid(self, code, cid):
if self.debug: if isinstance(code, str) and isinstance(cid, int):
print >>stderr, 'decode: %r, %r' % (self, bytes) self.code2cid[code] = cid
x = '' return self
for c in bytes:
if x: def register_cid2code(self, cid, code):
if x+c in self.code2cid: if isinstance(cid, int):
yield self.code2cid[x+c] if isinstance(code, PSLiteral):
self.cid2code[cid] = pack('>H', name2unicode(code.name))
elif isinstance(code, str):
self.cid2code[cid] = code
return self
def decode(self, bytes):
if self.debug:
print >>stderr, 'decode: %r, %r' % (self, bytes)
x = '' x = ''
elif c in self.code2cid: for c in bytes:
yield self.code2cid[c] if x:
else: if x+c in self.code2cid:
x = c yield self.code2cid[x+c]
return x = ''
elif c in self.code2cid:
def is_vertical(self): yield self.code2cid[c]
return self.attrs.get('WMode', 0) else:
x = c
return
def tocid(self, code): def is_vertical(self):
return self.code2cid.get(code) return self.attrs.get('WMode', 0)
def tocode(self, cid):
return self.cid2code.get(cid) def tocid(self, code):
return self.code2cid.get(code)
def tocode(self, cid):
return self.cid2code.get(cid)
def getall_attrs(self):
return self.attrs.iteritems()
def getall_code2cid(self):
return self.code2cid.iteritems()
def getall_cid2code(self):
return self.cid2code.iteritems()
def getall_attrs(self):
return self.attrs.iteritems()
def getall_code2cid(self):
return self.code2cid.iteritems()
def getall_cid2code(self):
return self.cid2code.iteritems()
## CDBCMap ## CDBCMap
## ##
class CDBCMap(CMap): class CDBCMap(CMap):
def __init__(self, cdbname):
CMap.__init__(self)
self.cdbname = cdbname
self.db = cdb.init(cdbname)
return
def __repr__(self): def __init__(self, cdbname):
return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname) CMap.__init__(self)
self.cdbname = cdbname
self.db = cdb.init(cdbname)
return
def tocid(self, code): def __repr__(self):
k = 'c'+code return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
if not self.db.has_key(k):
return None
return unpack('>L', self.db[k])
def tocode(self, cid):
k = 'i'+pack('>L', cid)
if not self.db.has_key(k):
return None
return self.db[k]
def is_vertical(self):
return (self.db.has_key('/WMode') and
self.db['/WMode'] == '1')
def getall(self, c): def tocid(self, code):
while 1: k = 'c'+code
x = self.db.each() if not self.db.has_key(k):
if not x: break return None
(k,v) = x return unpack('>L', self.db[k])
if k.startswith(c): def tocode(self, cid):
yield (k[1:], unpack('>L', v)[0]) k = 'i'+pack('>L', cid)
return if not self.db.has_key(k):
return None
return self.db[k]
def getall_attrs(self): def is_vertical(self):
while 1: return (self.db.has_key('/WMode') and
x = self.db.each() self.db['/WMode'] == '1')
if not x: break
(k,v) = x
if k.startswith('/'):
yield (k[1:], eval(v)[0])
return
def getall_cid2code(self):
return self.getall('i')
def getall_code2cid(self):
return self.getall('c')
def decode(self, bytes): def getall(self, c):
if self.debug: while 1:
print >>stderr, 'decode: %r, %r' % (self, bytes) x = self.db.each()
x = '' if not x: break
for c in bytes: (k,v) = x
if x: if k.startswith(c):
if x+c in self.code2cid: yield (k[1:], unpack('>L', v)[0])
yield self.code2cid[x+c] return
elif self.db.has_key('c'+x+c):
(dest,) = unpack('>L', self.db['c'+x+c]) def getall_attrs(self):
self.code2cid[x+c] = dest while 1:
yield dest x = self.db.each()
if not x: break
(k,v) = x
if k.startswith('/'):
yield (k[1:], eval(v)[0])
return
def getall_cid2code(self):
return self.getall('i')
def getall_code2cid(self):
return self.getall('c')
def decode(self, bytes):
if self.debug:
print >>stderr, 'decode: %r, %r' % (self, bytes)
x = '' x = ''
elif c in self.code2cid: for c in bytes:
yield self.code2cid[c] if x:
elif self.db.has_key('c'+c): if x+c in self.code2cid:
(dest,) = unpack('>L', self.db['c'+c]) yield self.code2cid[x+c]
self.code2cid[c] = dest elif self.db.has_key('c'+x+c):
yield dest (dest,) = unpack('>L', self.db['c'+x+c])
else: self.code2cid[x+c] = dest
x = c yield dest
return x = ''
elif c in self.code2cid:
yield self.code2cid[c]
elif self.db.has_key('c'+c):
(dest,) = unpack('>L', self.db['c'+c])
self.code2cid[c] = dest
yield dest
else:
x = c
return
## CMapDB ## CMapDB
## ##
class CMapDB(object): class CMapDB(object):
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError): pass
CMAP_ALIAS = {
}
debug = 0
dirname = None
cdbdirname = None
cmapdb = {}
@classmethod CMAP_ALIAS = {
def initialize(klass, dirname=None, cdbdirname=None): }
if not dirname:
dirname = find_cmap_path()
klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname
return
@classmethod debug = 0
def get_cmap(klass, cmapname, strict=True): dirname = None
cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname) cdbdirname = None
if cmapname in klass.cmapdb: cmapdb = {}
cmap = klass.cmapdb[cmapname]
else: @classmethod
fname = os.path.join(klass.dirname, cmapname) def initialize(klass, dirname=None, cdbdirname=None):
cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb') if not dirname:
if os.path.exists(cdbname): dirname = find_cmap_path()
if 1 <= klass.debug: klass.dirname = dirname
print >>stderr, 'Opening: CDBCMap %r...' % cdbname klass.cdbdirname = cdbdirname or dirname
cmap = CDBCMap(cdbname) return
elif os.path.exists(fname):
if 1 <= klass.debug: @classmethod
print >>stderr, 'Reading: CMap %r...' % fname def get_cmap(klass, cmapname, strict=True):
cmap = CMap() cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
fp = file(fname, 'rb') if cmapname in klass.cmapdb:
CMapParser(cmap, fp).run() cmap = klass.cmapdb[cmapname]
fp.close() else:
elif not strict: fname = os.path.join(klass.dirname, cmapname)
cmap = CMap() # just create empty cmap cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
else: if os.path.exists(cdbname):
raise CMapDB.CMapNotFound(cmapname) if 1 <= klass.debug:
klass.cmapdb[cmapname] = cmap print >>stderr, 'Opening: CDBCMap %r...' % cdbname
return cmap cmap = CDBCMap(cdbname)
elif os.path.exists(fname):
if 1 <= klass.debug:
print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap()
fp = file(fname, 'rb')
CMapParser(cmap, fp).run()
fp.close()
elif not strict:
cmap = CMap() # just create empty cmap
else:
raise CMapDB.CMapNotFound(cmapname)
klass.cmapdb[cmapname] = cmap
return cmap
## CMapParser ## CMapParser
## ##
class CMapParser(PSStackParser): class CMapParser(PSStackParser):
def __init__(self, cmap, fp): def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp) PSStackParser.__init__(self, fp)
self.cmap = cmap self.cmap = cmap
self.in_cmap = False self.in_cmap = False
return return
def run(self): def run(self):
try: try:
self.nextobject() self.nextobject()
except PSEOF: except PSEOF:
pass pass
return return
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
name = token.name name = token.name
if name == 'begincmap': if name == 'begincmap':
self.in_cmap = True self.in_cmap = True
self.popall() self.popall()
return return
elif name == 'endcmap': elif name == 'endcmap':
self.in_cmap = False self.in_cmap = False
return return
if not self.in_cmap: return if not self.in_cmap: return
# #
if name == 'def': if name == 'def':
try: try:
((_,k),(_,v)) = self.pop(2) ((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[literal_name(k)] = v self.cmap.attrs[literal_name(k)] = v
except PSSyntaxError: except PSSyntaxError:
pass pass
return return
if name == 'usecmap':
try:
((_,cmapname),) = self.pop(1)
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
return
if name == 'begincodespacerange':
self.popall()
return
if name == 'endcodespacerange':
self.popall()
return
if name == 'begincidrange':
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i)
return
if name == 'begincidchar':
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid))
return
if name == 'beginbfrange':
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
if isinstance(code, list):
for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i])
else:
var = code[-4:]
base = nunpack(var)
prefix = code[:-4]
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x)
return
if name == 'beginbfchar':
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code)
return
if name == 'beginnotdefrange':
self.popall()
return
if name == 'endnotdefrange':
self.popall()
return
self.push((pos, token)) if name == 'usecmap':
return try:
((_,cmapname),) = self.pop(1)
self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
return
if name == 'begincodespacerange':
self.popall()
return
if name == 'endcodespacerange':
self.popall()
return
if name == 'begincidrange':
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+pack('>L',s1+i)[-vlen:]
self.cmap.register_code2cid(x, cid+i)
return
if name == 'begincidchar':
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.register_code2cid(code, nunpack(cid))
return
if name == 'beginbfrange':
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
if isinstance(code, list):
for i in xrange(e1-s1+1):
self.cmap.register_cid2code(s1+i, code[i])
else:
var = code[-4:]
base = nunpack(var)
prefix = code[:-4]
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+pack('>L',base+i)[-vlen:]
self.cmap.register_cid2code(s1+i, x)
return
if name == 'beginbfchar':
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.register_cid2code(nunpack(cid), code)
return
if name == 'beginnotdefrange':
self.popall()
return
if name == 'endnotdefrange':
self.popall()
return
self.push((pos, token))
return
## FontMetricsDB ## FontMetricsDB
## ##
class FontMetricsDB(object): class FontMetricsDB(object):
@classmethod @classmethod
def get_metrics(klass, fontname): def get_metrics(klass, fontname):
return FONT_METRICS[fontname] return FONT_METRICS[fontname]
## EncodingDB ## EncodingDB
## ##
class EncodingDB(object): class EncodingDB(object):
std2unicode = {} std2unicode = {}
mac2unicode = {} mac2unicode = {}
win2unicode = {} win2unicode = {}
pdf2unicode = {} pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING: for (name,std,mac,win,pdf) in ENCODING:
c = unichr(name2unicode(name)) c = unichr(name2unicode(name))
if std: std2unicode[std] = c if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c if pdf: pdf2unicode[pdf] = c
encodings = { encodings = {
'StandardEncoding': std2unicode, 'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode, 'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode, 'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode, 'PDFDocEncoding': pdf2unicode,
} }
@classmethod @classmethod
def get_encoding(klass, name, diff=None): def get_encoding(klass, name, diff=None):
cid2unicode = klass.encodings.get(name, klass.std2unicode) cid2unicode = klass.encodings.get(name, klass.std2unicode)
if diff: if diff:
cid2unicode = cid2unicode.copy() cid2unicode = cid2unicode.copy()
cid = 0 cid = 0
for x in diff: for x in diff:
if isinstance(x, int): if isinstance(x, int):
cid = x cid = x
elif isinstance(x, PSLiteral): elif isinstance(x, PSLiteral):
try: try:
cid2unicode[cid] = unichr(name2unicode(x.name)) cid2unicode[cid] = unichr(name2unicode(x.name))
except KeyError: except KeyError:
pass pass
cid += 1 cid += 1
return cid2unicode return cid2unicode
## CMap -> CMapCDB conversion ## CMap -> CMapCDB conversion
## ##
def dumpcdb(cmap, cdbfile, verbose=1): def dumpcdb(cmap, cdbfile, verbose=1):
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose: if verbose:
print >>stderr, 'Writing: %r...' % cdbfile print >>stderr, 'Writing: %r...' % cdbfile
for (k,v) in cmap.getall_attrs(): for (k,v) in cmap.getall_attrs():
m.add('/'+k, repr(v)) m.add('/'+k, repr(v))
for (code,cid) in cmap.getall_code2cid(): for (code,cid) in cmap.getall_code2cid():
m.add('c'+code, pack('>L',cid)) m.add('c'+code, pack('>L',cid))
for (cid,code) in cmap.getall_cid2code(): for (cid,code) in cmap.getall_cid2code():
m.add('i'+pack('>L',cid), code) m.add('i'+pack('>L',cid), code)
m.finish() m.finish()
return return
def convert_cmap(cmapdir, outputdir, force=False): def convert_cmap(cmapdir, outputdir, force=False):
CMapDB.initialize(cmapdir) CMapDB.initialize(cmapdir)
for fname in os.listdir(cmapdir): for fname in os.listdir(cmapdir):
if '.' in fname: continue if '.' in fname: continue
cmapname = os.path.basename(fname) cmapname = os.path.basename(fname)
cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb') cdbname = os.path.join(outputdir, cmapname+'.cmap.cdb')
if not force and os.path.exists(cdbname): if not force and os.path.exists(cdbname):
print >>stderr, 'Skipping: %r' % cmapname print >>stderr, 'Skipping: %r' % cmapname
continue continue
print >>stderr, 'Reading: %r...' % cmapname print >>stderr, 'Reading: %r...' % cmapname
cmap = CMapDB.get_cmap(cmapname) cmap = CMapDB.get_cmap(cmapname)
dumpcdb(cmap, cdbname) dumpcdb(cmap, cdbname)
return return
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0] print 'usage: %s [-D outputdir] [-f] cmap_dir' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'C:D:f') (opts, args) = getopt.getopt(argv[1:], 'C:D:f')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if args: if args:
cmapdir = args.pop(0) cmapdir = args.pop(0)
else: else:
cmapdir = find_cmap_path() cmapdir = find_cmap_path()
outputdir = cmapdir outputdir = cmapdir
force = False force = False
for (k, v) in opts: for (k, v) in opts:
if k == '-f': force = True if k == '-f': force = True
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': outputdir = v elif k == '-D': outputdir = v
if not os.path.isdir(cmapdir): if not os.path.isdir(cmapdir):
print >>stderr, 'directory does not exist: %r' % cmapdir print >>stderr, 'directory does not exist: %r' % cmapdir
return 111 return 111
if not os.path.isdir(outputdir): if not os.path.isdir(outputdir):
print >>stderr, 'directory does not exist: %r' % outputdir print >>stderr, 'directory does not exist: %r' % outputdir
return 111 return 111
return convert_cmap(cmapdir, outputdir, force=force) return convert_cmap(cmapdir, outputdir, force=force)
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -10,298 +10,298 @@ from pdfminer.utils import apply_matrix_pt, mult_matrix, enc
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8'): def __init__(self, rsrc, outfp, codec='utf-8'):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.pageno = 0 self.pageno = 0
self.tag = None self.tag = None
return return
def render_string(self, textstate, seq):
font = textstate.font
text = ''
for obj in seq:
if not isinstance(obj, str): continue
chars = font.decode(obj)
for cid in chars:
try:
char = font.to_unicode(cid)
text += char
except PDFUnicodeNotDefined:
pass
self.outfp.write(enc(text, self.codec))
return
def begin_page(self, page, ctm): def render_string(self, textstate, seq):
(x0, y0, x1, y1) = page.mediabox font = textstate.font
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) text = ''
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % for obj in seq:
(self.pageno, bbox, page.rotate)) if not isinstance(obj, str): continue
return chars = font.decode(obj)
for cid in chars:
def end_page(self, page): try:
self.outfp.write('</page>\n') char = font.to_unicode(cid)
self.pageno += 1 text += char
return except PDFUnicodeNotDefined:
pass
def begin_tag(self, tag, props=None): self.outfp.write(enc(text, self.codec))
s = '' return
if props:
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v) def begin_page(self, page, ctm):
in sorted(props.iteritems()) ) (x0, y0, x1, y1) = page.mediabox
self.outfp.write('<%s%s>' % (enc(tag.name), s)) bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
self.tag = tag self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
return (self.pageno, bbox, page.rotate))
return
def end_tag(self):
assert self.tag def end_page(self, page):
self.outfp.write('</%s>' % enc(self.tag.name)) self.outfp.write('</page>\n')
self.tag = None self.pageno += 1
return return
def do_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
self.begin_tag(tag, props) s = ''
self.tag = None if props:
return s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (enc(tag.name), s))
self.tag = tag
return
def end_tag(self):
assert self.tag
self.outfp.write('</%s>' % enc(self.tag.name))
self.tag = None
return
def do_tag(self, tag, props=None):
self.begin_tag(tag, props)
self.tag = None
return
## PDFPageAggregator ## PDFPageAggregator
## ##
class PDFPageAggregator(PDFTextDevice): class PDFPageAggregator(PDFTextDevice):
def __init__(self, rsrc, pageno=1, laparams=None): def __init__(self, rsrc, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrc) PDFTextDevice.__init__(self, rsrc)
self.laparams = laparams self.laparams = laparams
self.pageno = pageno self.pageno = pageno
self.stack = [] self.stack = []
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
(x0,y0,x1,y1) = page.mediabox (x0,y0,x1,y1) = page.mediabox
(x0,y0) = apply_matrix_pt(ctm, (x0,y0)) (x0,y0) = apply_matrix_pt(ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(ctm, (x1,y1)) (x1,y1) = apply_matrix_pt(ctm, (x1,y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1)) mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
self.cur_item = LTPage(self.pageno, mediabox) self.cur_item = LTPage(self.pageno, mediabox)
return return
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate()
if self.laparams:
self.cur_item.analyze_layout(self.laparams)
self.pageno += 1
return self.cur_item
def begin_figure(self, name, bbox, matrix): def end_page(self, _):
self.stack.append(self.cur_item) assert not self.stack
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) assert isinstance(self.cur_item, LTPage)
return self.cur_item.fixate()
if self.laparams:
def end_figure(self, _): self.cur_item.analyze_layout(self.laparams)
fig = self.cur_item self.pageno += 1
self.cur_item.fixate() return self.cur_item
self.cur_item = self.stack.pop()
self.cur_item.add(fig)
return
def paint_path(self, gstate, stroke, fill, evenodd, path): def begin_figure(self, name, bbox, matrix):
shape = ''.join(x[0] for x in path) self.stack.append(self.cur_item)
if shape == 'ml': # horizontal/vertical line self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
(_,x0,y0) = path[0] return
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) def end_figure(self, _):
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) fig = self.cur_item
if y0 == y1: self.cur_item.fixate()
# horizontal ruler self.cur_item = self.stack.pop()
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) self.cur_item.add(fig)
elif x0 == x1: return
# vertical ruler
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) def paint_path(self, gstate, stroke, fill, evenodd, path):
elif shape == 'mlllh': shape = ''.join(x[0] for x in path)
# rectangle if shape == 'ml': # horizontal/vertical line
(_,x0,y0) = path[0] (_,x0,y0) = path[0]
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(_,x2,y2) = path[2] (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(_,x3,y3) = path[3] (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) if y0 == y1:
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) # horizontal ruler
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3)) elif x0 == x1:
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or # vertical ruler
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) elif shape == 'mlllh':
return # rectangle
(_,x0,y0) = path[0]
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): (_,x1,y1) = path[1]
if not chars: return (0, 0) (_,x2,y2) = path[2]
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars) (_,x3,y3) = path[3]
self.cur_item.add(item) (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
return item.adv (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
if not chars: return (0, 0)
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
self.cur_item.add(item)
return item.adv
## PDFConverter ## PDFConverter
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
self.outfp = outfp
self.codec = codec
return
def write(self, text): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
self.outfp.write(enc(text, self.codec)) PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
return self.outfp = outfp
self.codec = codec
return
def write(self, text):
self.outfp.write(enc(text, self.codec))
return
## SGMLConverter ## SGMLConverter
## ##
class SGMLConverter(PDFConverter): class SGMLConverter(PDFConverter):
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate)) (item.id, item.get_bbox(), item.rotate))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine): elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox())) self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox())) self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textline>\n') self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem): elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' % self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(), (enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize)) item.get_bbox(), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text) self.outfp.write('<text>%s</text>\n' % item.text)
else: else:
assert 0, item assert 0, item
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
return return
## HTMLConverter ## HTMLConverter
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50): scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
self.outfp.write('<html><head>\n') self.outfp.write('<html><head>\n')
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
self.codec) self.codec)
self.outfp.write('</head><body>\n') self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad self.yoffset = self.pagepad
return return
def write_rect(self, color, width, x, y, w, h): def write_rect(self, color, width, x, y, w, h):
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; ' self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
return return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.yoffset += item.y1 self.yoffset += item.y1
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
if self.showpageno: if self.showpageno:
self.outfp.write('<div style="position:absolute; top:%dpx;">' % self.outfp.write('<div style="position:absolute; top:%dpx;">' %
((self.yoffset-item.y1)*self.scale)) ((self.yoffset-item.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextItem): elif isinstance(item, LTTextItem):
if item.vertical: if item.vertical:
wmode = 'tb-rl' wmode = 'tb-rl'
else: else:
wmode = 'lr-tb' wmode = 'lr-tb'
self.outfp.write('<span style="position:absolute; writing-mode:%s;' self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' % ' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale, (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale)) item.fontsize*self.scale))
self.write(item.text) self.write(item.text)
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
self.yoffset += self.pagepad self.yoffset += self.pagepad
return return
def close(self): def close(self):
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno))) ', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
self.outfp.write('</body></html>\n') self.outfp.write('</body></html>\n')
return return
## TextConverter ## TextConverter
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False): showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
return return
def write(self, text): def write(self, text):
self.outfp.write(text.encode(self.codec, 'ignore')) self.outfp.write(text.encode(self.codec, 'ignore'))
return return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTText):
self.write(item.text) self.write(item.text)
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
for child in item: for child in item:
render(child) render(child)
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write('\n') self.write('\n')
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.showpageno: if self.showpageno:
self.write('Page %d\n' % page.id) self.write('Page %d\n' % page.id)
render(page) render(page)
self.write('\f') self.write('\f')
return return

View File

@ -8,9 +8,9 @@
### BEGIN Verbatim copy of the license part ### BEGIN Verbatim copy of the license part
# #
# Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe # Adobe Core 35 AFM Files with 229 Glyph Entries - ReadMe
# #
# This file and the 35 PostScript(R) AFM files it accompanies may be # This file and the 35 PostScript(R) AFM files it accompanies may be
# used, copied, and distributed for any purpose and without charge, # used, copied, and distributed for any purpose and without charge,
# with or without modification, provided that all copyright notices # with or without modification, provided that all copyright notices

View File

@ -7,23 +7,23 @@ INF = sys.maxint
## LAParams ## LAParams
## ##
class LAParams(object): class LAParams(object):
def __init__(self,
direction=None,
line_overlap=0.5,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
self.line_overlap = line_overlap
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
return
def __repr__(self): def __init__(self,
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' % direction=None,
(self.direction, self.char_margin, self.line_margin, self.word_margin)) line_overlap=0.5,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
self.line_overlap = line_overlap
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
return
def __repr__(self):
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
(self.direction, self.char_margin, self.line_margin, self.word_margin))
## Plane ## Plane
@ -35,354 +35,354 @@ class LAParams(object):
## ##
class Plane(object): class Plane(object):
def __init__(self, objs): def __init__(self, objs):
self.xobjs = [] self.xobjs = []
self.yobjs = [] self.yobjs = []
for obj in objs: for obj in objs:
self.place(obj) self.place(obj)
self.xobjs.sort() self.xobjs.sort()
self.yobjs.sort() self.yobjs.sort()
return return
# place(obj): place an object in a certain area. # place(obj): place an object in a certain area.
def place(self, obj): def place(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
self.xobjs.append((obj.x0, obj)) self.xobjs.append((obj.x0, obj))
self.xobjs.append((obj.x1, obj)) self.xobjs.append((obj.x1, obj))
self.yobjs.append((obj.y0, obj)) self.yobjs.append((obj.y0, obj))
self.yobjs.append((obj.y1, obj)) self.yobjs.append((obj.y1, obj))
return return
# find(): finds objects that are in a certain area. # find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)): def find(self, (x0,y0,x1,y1)):
(i0,_) = bsearch(self.xobjs, x0) (i0,_) = bsearch(self.xobjs, x0)
(_,i1) = bsearch(self.xobjs, x1) (_,i1) = bsearch(self.xobjs, x1)
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] ) xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
(i0,_) = bsearch(self.yobjs, y0) (i0,_) = bsearch(self.yobjs, y0)
(_,i1) = bsearch(self.yobjs, y1) (_,i1) = bsearch(self.yobjs, y1)
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] ) yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
objs = xobjs.intersection(yobjs) objs = xobjs.intersection(yobjs)
return objs return objs
## ClusterSet ## ClusterSet
## ##
class ClusterSet(object): class ClusterSet(object):
def __init__(self, klass): def __init__(self, klass):
self.clusters = {} self.clusters = {}
self.klass = klass self.klass = klass
self.i = 0 self.i = 0
return return
# add(objs): groups text objects if necessary. # add(objs): groups text objects if necessary.
def add(self, objs): def add(self, objs):
group = self.klass(self.i, objs) group = self.klass(self.i, objs)
self.i += 1 self.i += 1
for obj in objs: for obj in objs:
if obj in self.clusters: if obj in self.clusters:
group.merge(self.clusters[obj]) group.merge(self.clusters[obj])
for obj in group: for obj in group:
self.clusters[obj] = group self.clusters[obj] = group
return return
# finish(): returns all the LTTextBoxes in a page. # finish(): returns all the LTTextBoxes in a page.
def finish(self): def finish(self):
r = set(self.clusters.itervalues()) r = set(self.clusters.itervalues())
for group in r: for group in r:
group.fixate() group.fixate()
return list(r) return list(r)
@classmethod @classmethod
def build(klass, objs, hratio, vratio, objtype, func=None): def build(klass, objs, hratio, vratio, objtype, func=None):
plane = Plane(objs) plane = Plane(objs)
cset = ClusterSet(objtype) cset = ClusterSet(objtype)
for obj in objs: for obj in objs:
margin = obj.get_margin() margin = obj.get_margin()
hmargin = hratio * margin hmargin = hratio * margin
vmargin = vratio * margin vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
assert obj in neighbors, obj assert obj in neighbors, obj
if func: if func:
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ] neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
cset.add(neighbors) cset.add(neighbors)
return cset.finish() return cset.finish()
## LayoutItem ## LayoutItem
## ##
class LayoutItem(object): class LayoutItem(object):
def __init__(self, bbox): def __init__(self, bbox):
self.set_bbox(bbox) self.set_bbox(bbox)
return return
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0) if x1 < x0: (x0,x1) = (x1,x0)
if y1 < y0: (y0,y1) = (y1,y0) if y1 < y0: (y0,y1) = (y1,y0)
self.x0 = x0 self.x0 = x0
self.y0 = y0 self.y0 = y0
self.x1 = x1 self.x1 = x1
self.y1 = y1 self.y1 = y1
self.width = x1-x0 self.width = x1-x0
self.height = y1-y0 self.height = y1-y0
return return
def __repr__(self): def __repr__(self):
return ('<item bbox=%s>' % (self.get_bbox())) return ('<item bbox=%s>' % (self.get_bbox()))
def hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return 0
else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def voverlap(self, obj): def hoverlap(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0: if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return 0 return 0
else: else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def get_bbox(self): def voverlap(self, obj):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
def get_margin(self): return 0
return 0 else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def get_weight(self): def get_bbox(self):
return 0 return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def get_direction(self): def get_margin(self):
return None return 0
def get_weight(self):
return 0
def get_direction(self):
return None
## LayoutContainer ## LayoutContainer
## ##
class LayoutContainer(LayoutItem): class LayoutContainer(LayoutItem):
def __init__(self, id, bbox, objs=None):
LayoutItem.__init__(self, bbox)
self.id = id
if objs:
self.objs = set(objs)
else:
self.objs = set()
self.weight = None
return
def __repr__(self): def __init__(self, id, bbox, objs=None):
return ('<group %s>' % (self.get_bbox())) LayoutItem.__init__(self, bbox)
self.id = id
if objs:
self.objs = set(objs)
else:
self.objs = set()
self.weight = None
return
def __iter__(self): def __repr__(self):
return iter(self.objs) return ('<group %s>' % (self.get_bbox()))
def __len__(self): def __iter__(self):
return len(self.objs) return iter(self.objs)
def add(self, obj):
self.objs.add(obj)
return
def merge(self, group): def __len__(self):
self.objs.update(iter(group)) return len(self.objs)
return
# fixate(): determines its boundery and writing direction. def add(self, obj):
def fixate(self): self.objs.add(obj)
if not self.width and self.objs: return
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
bx0 = min(bx0, obj.x0)
by0 = min(by0, obj.y0)
bx1 = max(bx1, obj.x1)
by1 = max(by1, obj.y1)
self.set_bbox((bx0, by0, bx1, by1))
self.weight = sum( obj.get_weight() for obj in self.objs )
return
def get_weight(self): def merge(self, group):
return self.weight self.objs.update(iter(group))
return
def get_direction(self):
return None # fixate(): determines its boundery and writing direction.
def fixate(self):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
bx0 = min(bx0, obj.x0)
by0 = min(by0, obj.y0)
bx1 = max(bx1, obj.x1)
by1 = max(by1, obj.y1)
self.set_bbox((bx0, by0, bx1, by1))
self.weight = sum( obj.get_weight() for obj in self.objs )
return
def get_weight(self):
return self.weight
def get_direction(self):
return None
## LTLine ## LTLine
## ##
class LTLine(LayoutItem): class LTLine(LayoutItem):
def __init__(self, linewidth, direction, bbox): def __init__(self, linewidth, direction, bbox):
LayoutItem.__init__(self, bbox) LayoutItem.__init__(self, bbox)
self.linewidth = linewidth self.linewidth = linewidth
self.direction = direction self.direction = direction
return return
## LTRect ## LTRect
## ##
class LTRect(LayoutItem): class LTRect(LayoutItem):
def __init__(self, linewidth, bbox): def __init__(self, linewidth, bbox):
LayoutItem.__init__(self, bbox) LayoutItem.__init__(self, bbox)
self.linewidth = linewidth self.linewidth = linewidth
return return
## LTText ## LTText
## ##
class LTText(object): class LTText(object):
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
return return
def __repr__(self): def __repr__(self):
return '<text %r>' % self.text return '<text %r>' % self.text
def get_weight(self): def get_weight(self):
return len(self.text) return len(self.text)
def is_upright(self): def is_upright(self):
return True return True
## LTAnon ## LTAnon
## ##
class LTAnon(LTText): class LTAnon(LTText):
def get_weight(self): def get_weight(self):
return 0 return 0
## LTTextItem ## LTTextItem
## ##
class LTTextItem(LayoutItem, LTText): class LTTextItem(LayoutItem, LTText):
debug = 1 debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
self.matrix = matrix
self.font = font
self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
#size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix
if not self.vertical:
# horizontal text
self.adv = (adv, 0)
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.adv = (0, adv)
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, bbox)
return
def __repr__(self): def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
if self.debug: assert chars
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % self.matrix = matrix
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, self.font = font
self.font, self.fontsize, self.get_bbox(), self.vertical = font.is_vertical()
'(%.1f, %.1f)' % self.adv, self.text = ''.join( char for (char,_) in chars )
self.text)) adv = sum( font.char_width(cid) for (_,cid) in chars )
else: adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
return '<text %r>' % self.text #size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix
if not self.vertical:
# horizontal text
self.adv = (adv, 0)
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.adv = (0, adv)
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, bbox)
return
def get_margin(self): def __repr__(self):
return abs(self.fontsize) if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, self.get_bbox(),
'(%.1f, %.1f)' % self.adv,
self.text))
else:
return '<text %r>' % self.text
def is_vertical(self): def get_margin(self):
return self.vertical return abs(self.fontsize)
def is_upright(self): def is_vertical(self):
(a,b,c,d,e,f) = self.matrix return self.vertical
return 0 < a*d and b*c <= 0
def is_upright(self):
(a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0
## LTFigure ## LTFigure
## ##
class LTFigure(LayoutContainer): class LTFigure(LayoutContainer):
def __init__(self, id, bbox, matrix):
(x,y,w,h) = bbox
x0 = y0 = INF
x1 = y1 = -INF
for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
(p,q) = apply_matrix_pt(matrix, (p,q))
x0 = min(x0, p)
x1 = max(x1, p)
y0 = min(y0, q)
y1 = max(y1, q)
bbox = (x0,y0,x1,y1)
self.matrix = matrix
LayoutContainer.__init__(self, id, bbox)
return
def __repr__(self): def __init__(self, id, bbox, matrix):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix)) (x,y,w,h) = bbox
x0 = y0 = INF
x1 = y1 = -INF
for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
(p,q) = apply_matrix_pt(matrix, (p,q))
x0 = min(x0, p)
x1 = max(x1, p)
y0 = min(y0, q)
y1 = max(y1, q)
bbox = (x0,y0,x1,y1)
self.matrix = matrix
LayoutContainer.__init__(self, id, bbox)
return
def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextLine ## LTTextLine
## ##
class LTTextLine(LayoutContainer): class LTTextLine(LayoutContainer):
def __init__(self, id, objs, direction, word_margin): def __init__(self, id, objs, direction, word_margin):
LayoutContainer.__init__(self, id, (0,0,0,0), objs) LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = direction self.direction = direction
self.word_margin = word_margin self.word_margin = word_margin
return return
def __repr__(self): def __repr__(self):
return ('<line %s(%s)>' % (self.get_bbox(), self.direction)) return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
def get_margin(self): def get_margin(self):
return min(self.width, self.height) return min(self.width, self.height)
def get_direction(self): def get_direction(self):
return self.direction return self.direction
def get_text(self): def get_text(self):
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) ) return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
def fixate(self): def fixate(self):
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
objs = [] objs = []
if self.direction == 'V': if self.direction == 'V':
y0 = -INF y0 = -INF
for obj in sorted(self.objs, key=lambda obj: -obj.y1): for obj in sorted(self.objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTTextItem) and self.word_margin: if isinstance(obj, LTTextItem) and self.word_margin:
margin = self.word_margin * obj.get_margin() margin = self.word_margin * obj.get_margin()
if obj.y1+margin < y0: if obj.y1+margin < y0:
objs.append(LTAnon(' ')) objs.append(LTAnon(' '))
objs.append(obj) objs.append(obj)
y0 = obj.y0 y0 = obj.y0
else: else:
x1 = INF x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0): for obj in sorted(self.objs, key=lambda obj: obj.x0):
if isinstance(obj, LTTextItem) and self.word_margin: if isinstance(obj, LTTextItem) and self.word_margin:
margin = self.word_margin * obj.get_margin() margin = self.word_margin * obj.get_margin()
if x1 < obj.x0-margin: if x1 < obj.x0-margin:
objs.append(LTAnon(' ')) objs.append(LTAnon(' '))
objs.append(obj) objs.append(obj)
x1 = obj.x1 x1 = obj.x1
objs.append(LTAnon('\n')) objs.append(LTAnon('\n'))
self.objs = objs self.objs = objs
return return
## LTTextBox ## LTTextBox
@ -392,109 +392,109 @@ class LTTextLine(LayoutContainer):
## ##
class LTTextBox(LayoutContainer): class LTTextBox(LayoutContainer):
def __init__(self, id, objs, direction): def __init__(self, id, objs, direction):
LayoutContainer.__init__(self, id, (0,0,0,0), objs) LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = direction self.direction = direction
return return
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20])) return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
def fixate(self):
LayoutContainer.fixate(self)
if self.direction == 'V':
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return
def get_direction(self): def fixate(self):
return self.direction LayoutContainer.fixate(self)
if self.direction == 'V':
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return
def get_direction(self):
return self.direction
def tsort(objs, f): def tsort(objs, f):
gi = dict( (obj,[]) for obj in objs ) gi = dict( (obj,[]) for obj in objs )
go = dict( (obj,[]) for obj in objs ) go = dict( (obj,[]) for obj in objs )
for obj1 in objs: for obj1 in objs:
for obj2 in objs: for obj2 in objs:
if obj1 is obj2: continue if obj1 is obj2: continue
if f(obj1, obj2): # obj1 -> obj2 if f(obj1, obj2): # obj1 -> obj2
go[obj1].append(obj2) go[obj1].append(obj2)
gi[obj2].append(obj1) gi[obj2].append(obj1)
r = objs[:] r = objs[:]
s = [] s = []
while r: while r:
for obj in r: for obj in r:
if not go[obj] or gi[obj]: continue if not go[obj] or gi[obj]: continue
for c in go[obj]: for c in go[obj]:
gi[c].remove(obj) gi[c].remove(obj)
del gi[obj] del gi[obj]
del go[obj] del go[obj]
r.remove(obj) r.remove(obj)
s.append(obj) s.append(obj)
break break
else: else:
obj = r.pop() obj = r.pop()
del gi[obj] del gi[obj]
del go[obj] del go[obj]
s.append(obj) s.append(obj)
return s return s
## LTPage ## LTPage
## ##
class LTPage(LayoutContainer): class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0):
LayoutContainer.__init__(self, id, bbox)
self.rotate = rotate
return
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
def analyze_layout(self, laparams): def __init__(self, id, bbox, rotate=0):
textobjs = [] LayoutContainer.__init__(self, id, bbox)
otherobjs = [] self.rotate = rotate
for obj in self.objs: return
if isinstance(obj, LTText) and obj.is_upright():
textobjs.append(obj) def __repr__(self):
else: return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
otherobjs.append(obj)
if laparams.direction == 'V': def analyze_layout(self, laparams):
def vline(obj1, obj2): textobjs = []
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2) otherobjs = []
def vorder(obj1, obj2): for obj in self.objs:
if obj1.voverlap(obj2): if isinstance(obj, LTText) and obj.is_upright():
return obj2.x1 < obj1.x0 textobjs.append(obj)
elif obj1.hoverlap(obj2): else:
return obj2.y1 < obj1.y0 otherobjs.append(obj)
if laparams.direction == 'V':
def vline(obj1, obj2):
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
def vorder(obj1, obj2):
if obj1.voverlap(obj2):
return obj2.x1 < obj1.x0
elif obj1.hoverlap(obj2):
return obj2.y1 < obj1.y0
else:
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
vline)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
boxes = tsort(boxes, vorder)
else: else:
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0 def hline(obj1, obj2):
lines = ClusterSet.build(textobjs, 0, laparams.char_margin, return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)), def horder(obj1, obj2):
vline) if obj1.hoverlap(obj2):
boxes = ClusterSet.build(lines, laparams.line_margin, 0, return obj2.y1 < obj1.y0
(lambda id,objs: LTTextBox(id, objs, 'V'))) elif obj1.voverlap(obj2):
boxes = tsort(boxes, vorder) return obj1.x1 < obj2.x0
else: else:
def hline(obj1, obj2): return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2) lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
def horder(obj1, obj2): (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
if obj1.hoverlap(obj2): hline)
return obj2.y1 < obj1.y0 boxes = ClusterSet.build(lines, 0, laparams.line_margin,
elif obj1.voverlap(obj2): (lambda id,objs: LTTextBox(id, objs, 'H')))
return obj1.x1 < obj2.x0 boxes = tsort(boxes, horder)
else: self.objs = otherobjs + boxes
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0 return
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return

View File

@ -7,93 +7,93 @@ stderr = sys.stderr
## ##
class LZWDecoder(object): class LZWDecoder(object):
debug = 0 debug = 0
def __init__(self, fp):
self.fp = fp
self.buff = 0
self.bpos = 8
self.nbits = 9
self.table = None
self.prevbuf = None
return
def readbits(self, bits): def __init__(self, fp):
v = 0 self.fp = fp
while 1: self.buff = 0
# the number of remaining bits we can get from the current buffer. self.bpos = 8
r = 8-self.bpos self.nbits = 9
if bits <= r: self.table = None
# |-----8-bits-----| self.prevbuf = None
# |-bpos-|-bits-| | return
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1)) def readbits(self, bits):
self.bpos += bits v = 0
break while 1:
else: # the number of remaining bits we can get from the current buffer.
# |-----8-bits-----| r = 8-self.bpos
# |-bpos-|---bits----... if bits <= r:
# | |----r----| # |-----8-bits-----|
v = (v<<r) | (self.buff & ((1<<r)-1)) # |-bpos-|-bits-| |
bits -= r # | |----r----|
x = self.fp.read(1) v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
if not x: raise EOFError self.bpos += bits
self.buff = ord(x) break
self.bpos = 0 else:
return v # |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
pass
elif not self.prevbuf:
x = self.prevbuf = self.table[code]
else:
if code < len(self.table):
x = self.table[code]
self.table.append(self.prevbuf+x[0])
else:
self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code]
l = len(self.table)
if l == 511:
self.nbits = 10
elif l == 1023:
self.nbits = 11
elif l == 2047:
self.nbits = 12
self.prevbuf = x
return x
def run(self):
while 1:
try:
code = self.readbits(self.nbits)
except EOFError:
break
x = self.feed(code)
yield x
if self.debug:
print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
(self.nbits, code, x, self.table[258:]))
return
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
pass
elif not self.prevbuf:
x = self.prevbuf = self.table[code]
else:
if code < len(self.table):
x = self.table[code]
self.table.append(self.prevbuf+x[0])
else:
self.table.append(self.prevbuf+self.prevbuf[0])
x = self.table[code]
l = len(self.table)
if l == 511:
self.nbits = 10
elif l == 1023:
self.nbits = 11
elif l == 2047:
self.nbits = 12
self.prevbuf = x
return x
def run(self):
while 1:
try:
code = self.readbits(self.nbits)
except EOFError:
break
x = self.feed(code)
yield x
if self.debug:
print >>stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
(self.nbits, code, x, self.table[258:]))
return
def main(argv): def main(argv):
import StringIO import StringIO
data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
fp = StringIO.StringIO(data) fp = StringIO.StringIO(data)
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
LZWDecoder.debug = 1 LZWDecoder.debug = 1
output = ''.join(LZWDecoder(fp).run()) output = ''.join(LZWDecoder(fp).run())
print (data, expected, output) print (data, expected, output)
print output == expected print output == expected
return 0 return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -10,14 +10,14 @@ LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class PDFColorSpace(object): class PDFColorSpace(object):
def __init__(self, name, ncomponents): def __init__(self, name, ncomponents):
self.name = name self.name = name
self.ncomponents = ncomponents self.ncomponents = ncomponents
return return
def __repr__(self): def __repr__(self):
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents) return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE = dict( PREDEFINED_COLORSPACE = dict(

View File

@ -9,116 +9,116 @@ from pdfminer.pdffont import PDFUnicodeNotDefined
## ##
class PDFDevice(object): class PDFDevice(object):
debug = 0 debug = 0
def __init__(self, rsrc):
self.rsrc = rsrc
self.ctm = None
return
def __repr__(self):
return '<PDFDevice>'
def close(self): def __init__(self, rsrc):
return self.rsrc = rsrc
self.ctm = None
return
def set_ctm(self, ctm): def __repr__(self):
self.ctm = ctm return '<PDFDevice>'
return
def begin_tag(self, tag, props=None): def close(self):
return return
def end_tag(self):
return
def do_tag(self, tag, props=None):
return
def begin_page(self, page, ctm): def set_ctm(self, ctm):
return self.ctm = ctm
def end_page(self, page): return
return
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, stroke, fill, evenodd, path): def begin_tag(self, tag, props=None):
return return
def render_image(self, stream, size): def end_tag(self):
return return
def render_string(self, textstate, seq): def do_tag(self, tag, props=None):
return return
def begin_page(self, page, ctm):
return
def end_page(self, page):
return
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_image(self, stream, size):
return
def render_string(self, textstate, seq):
return
## PDFTextDevice ## PDFTextDevice
## ##
class PDFTextDevice(PDFDevice): class PDFTextDevice(PDFDevice):
def handle_undefined_char(self, cidcoding, cid): def handle_undefined_char(self, cidcoding, cid):
if self.debug: if self.debug:
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?' return '?'
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0) return (0, 0)
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm) matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font font = textstate.font
fontsize = textstate.fontsize fontsize = textstate.fontsize
scaling = textstate.scaling * .01 scaling = textstate.scaling * .01
charspace = textstate.charspace * scaling charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling wordspace = textstate.wordspace * scaling
dxscale = .001 * fontsize * scaling dxscale = .001 * fontsize * scaling
chars = []
needspace = False
(x,y) = textstate.linematrix
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
d = -obj*dxscale
if font.is_vertical():
y += d
else:
x += d
chars = [] chars = []
needspace = False needspace = False
else: (x,y) = textstate.linematrix
for cid in font.decode(obj): for obj in seq:
try: if isinstance(obj, int) or isinstance(obj, float):
char = font.to_unicode(cid) (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
except PDFUnicodeNotDefined, e: fontsize, charspace, scaling, chars)
(cidcoding, cid) = e.args x += dx
char = self.handle_undefined_char(cidcoding, cid) y += dy
chars.append((char, cid)) d = -obj*dxscale
if cid == 32 and textstate.wordspace and not font.is_multibyte(): if font.is_vertical():
y += d
else:
x += d
chars = []
needspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and textstate.wordspace and not font.is_multibyte():
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy
if font.is_vertical():
y += wordspace
else:
x += wordspace
chars = []
if chars:
if needspace: if needspace:
if font.is_vertical(): if font.is_vertical():
y += charspace y += charspace
else: else:
x += charspace x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars) fontsize, charspace, scaling, chars)
needspace = True
x += dx x += dx
y += dy y += dy
if font.is_vertical(): textstate.linematrix = (x,y)
y += wordspace return
else:
x += wordspace
chars = []
if chars:
if needspace:
if font.is_vertical():
y += charspace
else:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
textstate.linematrix = (x,y)
return

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -25,218 +25,218 @@ class PDFNotImplementedError(PSException): pass
## PDFObjRef ## PDFObjRef
## ##
class PDFObjRef(PDFObject): class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _):
if objid == 0:
if STRICT:
raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc
self.objid = objid
#self.genno = genno # Never used.
return
def __repr__(self): def __init__(self, doc, objid, _):
return '<PDFObjRef:%d>' % (self.objid) if objid == 0:
if STRICT:
raise PDFValueError('PDF object id cannot be 0.')
self.doc = doc
self.objid = objid
#self.genno = genno # Never used.
return
def resolve(self): def __repr__(self):
return self.doc.getobj(self.objid) return '<PDFObjRef:%d>' % (self.objid)
def resolve(self):
return self.doc.getobj(self.objid)
# resolve # resolve
def resolve1(x): def resolve1(x):
''' '''
Resolve an object. If this is an array or dictionary, Resolve an object. If this is an array or dictionary,
it may still contains some indirect objects inside. it may still contains some indirect objects inside.
''' '''
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
return x return x
def resolve_all(x): def resolve_all(x):
''' '''
Recursively resolve X and all the internals. Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
This procedure might be slow. This procedure might be slow.
''' '''
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve() x = x.resolve()
if isinstance(x, list): if isinstance(x, list):
x = [ resolve_all(v) for v in x ] x = [ resolve_all(v) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = resolve_all(v) x[k] = resolve_all(v)
return x return x
def decipher_all(decipher, objid, genno, x): def decipher_all(decipher, objid, genno, x):
''' '''
Recursively decipher X. Recursively decipher X.
''' '''
if isinstance(x, str): if isinstance(x, str):
return decipher(objid, genno, x) return decipher(objid, genno, x)
if isinstance(x, list): if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ] x = [ decipher_all(decipher, objid, genno, v) for v in x ]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k,v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v) x[k] = decipher_all(decipher, objid, genno, v)
return x return x
# Type cheking # Type cheking
def int_value(x): def int_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, int): if not isinstance(x, int):
if STRICT: if STRICT:
raise PDFTypeError('Integer required: %r' % x) raise PDFTypeError('Integer required: %r' % x)
return 0 return 0
return x return x
def float_value(x): def float_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, float): if not isinstance(x, float):
if STRICT: if STRICT:
raise PDFTypeError('Float required: %r' % x) raise PDFTypeError('Float required: %r' % x)
return 0.0 return 0.0
return x return x
def num_value(x): def num_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)): if not (isinstance(x, int) or isinstance(x, float)):
if STRICT: if STRICT:
raise PDFTypeError('Int or Float required: %r' % x) raise PDFTypeError('Int or Float required: %r' % x)
return 0 return 0
return x return x
def str_value(x): def str_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, str): if not isinstance(x, str):
if STRICT: if STRICT:
raise PDFTypeError('String required: %r' % x) raise PDFTypeError('String required: %r' % x)
return '' return ''
return x return x
def list_value(x): def list_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)): if not (isinstance(x, list) or isinstance(x, tuple)):
if STRICT: if STRICT:
raise PDFTypeError('List required: %r' % x) raise PDFTypeError('List required: %r' % x)
return [] return []
return x return x
def dict_value(x): def dict_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
if STRICT: if STRICT:
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError('Dict required: %r' % x)
return {} return {}
return x return x
def stream_value(x): def stream_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
if STRICT: if STRICT:
raise PDFTypeError('PDFStream required: %r' % x) raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '') return PDFStream({}, '')
return x return x
## PDFStream type ## PDFStream type
## ##
class PDFStream(PDFObject): class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None):
self.dic = dic
self.rawdata = rawdata
self.decipher = decipher
self.data = None
self.objid = None
self.genno = None
return
def set_objid(self, objid, genno): def __init__(self, dic, rawdata, decipher=None):
self.objid = objid self.dic = dic
self.genno = genno self.rawdata = rawdata
return self.decipher = decipher
self.data = None
def __repr__(self): self.objid = None
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic) self.genno = None
return
def decomp(self,data): def set_objid(self, objid, genno):
import zlib self.objid = objid
buf = data self.genno = genno
# some FlateDecode streams have garbage (newlines, etc) appended to the return
# end. remove chars from the end to try and decompress the buffer
while 8 <= len(buf):
try:
# will get errors if the document is encrypted.
dco = zlib.decompressobj()
return dco.decompress(buf)
except zlib.error:
buf = buf[:-1]
raise Exception, "zlib.error while decompressing data"
def decode(self): def __repr__(self):
assert self.data == None and self.rawdata != None return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
return
filters = self.dic['Filter']
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = self.decomp(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85
data = ascii85.asciihexdecode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1
ent0 = ent1
data = buf
self.data = data
self.rawdata = None
return
def get_data(self): def decomp(self,data):
if self.data == None: import zlib
self.decode() buf = data
return self.data # some FlateDecode streams have garbage (newlines, etc) appended to the
# end. remove chars from the end to try and decompress the buffer
while 8 <= len(buf):
try:
# will get errors if the document is encrypted.
dco = zlib.decompressobj()
return dco.decompress(buf)
except zlib.error:
buf = buf[:-1]
raise Exception, "zlib.error while decompressing data"
def get_rawdata(self): def decode(self):
return self.rawdata assert self.data == None and self.rawdata != None
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
return
filters = self.dic['Filter']
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = self.decomp(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85
data = ascii85.asciihexdecode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1
ent0 = ent1
data = buf
self.data = data
self.rawdata = None
return
def get_data(self):
if self.data == None:
self.decode()
return self.data
def get_rawdata(self):
return self.rawdata

File diff suppressed because it is too large Load Diff

View File

@ -4,7 +4,7 @@
# #
# by Yusuke Shinyama # by Yusuke Shinyama
# * public domain * # * public domain *
# #
import sys, os import sys, os
from struct import pack, unpack from struct import pack, unpack
@ -13,24 +13,24 @@ from array import array
# calc hash value with a given key # calc hash value with a given key
def cdbhash(s, n=5381L): def cdbhash(s, n=5381L):
return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n) return reduce(lambda h,c: ((h*33) ^ ord(c)) & 0xffffffffL, s, n)
if pack('=i',1) == pack('>i',1): if pack('=i',1) == pack('>i',1):
# big endian # big endian
def decode(x): def decode(x):
a = array('I', x) a = array('I', x)
a.byteswap() a.byteswap()
return a return a
def encode(a): def encode(a):
a.byteswap() a.byteswap()
return a.tostring() return a.tostring()
else: else:
# little endian # little endian
def decode(x): def decode(x):
a = array('I', x) a = array('I', x)
return a return a
def encode(a): def encode(a):
return a.tostring() return a.tostring()
## CDB ## CDB
@ -38,234 +38,234 @@ else:
# cdbiter # cdbiter
def cdbiter(fp, eod): def cdbiter(fp, eod):
kloc = 2048 kloc = 2048
while kloc < eod: while kloc < eod:
fp.seek(kloc) fp.seek(kloc)
(klen, vlen) = unpack('<II', fp.read(8)) (klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen) k = fp.read(klen)
v = fp.read(vlen) v = fp.read(vlen)
kloc += 8+klen+vlen kloc += 8+klen+vlen
yield (k,v) yield (k,v)
fp.close() fp.close()
return return
# CDBReader # CDBReader
class CDBReader(object): class CDBReader(object):
def __init__(self, cdbname, docache=1):
self.name = cdbname
self._fp = file(cdbname, 'rb')
hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256
self._eod = hash0[0]
self._docache = docache
self._cache = {}
self._keyiter = None
self._eachiter = None
return
def __repr__(self): def __init__(self, cdbname, docache=1):
return '<CDBReader: %r>' % self.name self.name = cdbname
self._fp = file(cdbname, 'rb')
hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256
self._eod = hash0[0]
self._docache = docache
self._cache = {}
self._keyiter = None
self._eachiter = None
return
def __getstate__(self): def __repr__(self):
raise TypeError return '<CDBReader: %r>' % self.name
def __setstate__(self, dict): def __getstate__(self):
raise TypeError raise TypeError
def __getitem__(self, k): def __setstate__(self, dict):
k = str(k) raise TypeError
if k in self._cache: return self._cache[k]
h = cdbhash(k)
h1 = h & 0xff
(pos_bucket, ncells) = self._hash0[h1]
if ncells == 0: raise KeyError(k)
hs = self._hash1[h1]
if hs == None:
self._fp.seek(pos_bucket)
hs = decode(self._fp.read(ncells * 8))
self._hash1[h1] = hs
i = ((h >> 8) % ncells) * 2
n = ncells*2
for _ in xrange(ncells):
p1 = hs[i+1]
if p1 == 0: raise KeyError(k)
if hs[i] == h:
self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen)
if k1 == k:
v1 = self._fp.read(vlen)
if self._docache:
self._cache[k] = v1
return v1
i = (i+2) % n
raise KeyError(k)
def get(self, k, failed=None): def __getitem__(self, k):
try: k = str(k)
return self.__getitem__(k) if k in self._cache: return self._cache[k]
except KeyError: h = cdbhash(k)
return failed h1 = h & 0xff
(pos_bucket, ncells) = self._hash0[h1]
if ncells == 0: raise KeyError(k)
hs = self._hash1[h1]
if hs == None:
self._fp.seek(pos_bucket)
hs = decode(self._fp.read(ncells * 8))
self._hash1[h1] = hs
i = ((h >> 8) % ncells) * 2
n = ncells*2
for _ in xrange(ncells):
p1 = hs[i+1]
if p1 == 0: raise KeyError(k)
if hs[i] == h:
self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen)
if k1 == k:
v1 = self._fp.read(vlen)
if self._docache:
self._cache[k] = v1
return v1
i = (i+2) % n
raise KeyError(k)
def has_key(self, k): def get(self, k, failed=None):
try: try:
self.__getitem__(k) return self.__getitem__(k)
return True except KeyError:
except KeyError: return failed
return False
def __contains__(self, k): def has_key(self, k):
return self.has_key(k) try:
self.__getitem__(k)
return True
except KeyError:
return False
def firstkey(self): def __contains__(self, k):
self._keyiter = None return self.has_key(k)
return self.nextkey()
def nextkey(self):
if not self._keyiter:
self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
try:
return self._keyiter.next()
except StopIteration:
return None
def each(self): def firstkey(self):
if not self._eachiter: self._keyiter = None
self._eachiter = cdbiter(self._fp, self._eod) return self.nextkey()
try:
return self._eachiter.next() def nextkey(self):
except StopIteration: if not self._keyiter:
return None self._keyiter = ( k for (k,v) in cdbiter(self._fp, self._eod) )
try:
def iterkeys(self): return self._keyiter.next()
return ( k for (k,v) in cdbiter(self._fp, self._eod) ) except StopIteration:
def itervalues(self): return None
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
def iteritems(self): def each(self):
return cdbiter(self._fp, self._eod) if not self._eachiter:
self._eachiter = cdbiter(self._fp, self._eod)
try:
return self._eachiter.next()
except StopIteration:
return None
def iterkeys(self):
return ( k for (k,v) in cdbiter(self._fp, self._eod) )
def itervalues(self):
return ( v for (k,v) in cdbiter(self._fp, self._eod) )
def iteritems(self):
return cdbiter(self._fp, self._eod)
# CDBMaker # CDBMaker
class CDBMaker(object): class CDBMaker(object):
def __init__(self, cdbname, tmpname): def __init__(self, cdbname, tmpname):
self.fn = cdbname self.fn = cdbname
self.fntmp = tmpname self.fntmp = tmpname
self.numentries = 0 self.numentries = 0
self._fp = file(tmpname, 'wb') self._fp = file(tmpname, 'wb')
self._pos = 2048 # sizeof((h,p))*256 self._pos = 2048 # sizeof((h,p))*256
self._bucket = [ array('I') for _ in xrange(256) ] self._bucket = [ array('I') for _ in xrange(256) ]
return return
def __repr__(self): def __repr__(self):
return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries) return '<CDBMaker: %r, %r, %d ents>' % (self.fn, self.fntmp, self.numentries)
def __len__(self): def __len__(self):
return self.numentries return self.numentries
def __getstate__(self): def __getstate__(self):
raise TypeError raise TypeError
def __setstate__(self, dict): def __setstate__(self, dict):
raise TypeError raise TypeError
def add(self, k, v): def add(self, k, v):
(k, v) = (str(k), str(v)) (k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v)) (klen, vlen) = (len(k), len(v))
self._fp.seek(self._pos) self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen)) self._fp.write(pack('<II', klen, vlen))
self._fp.write(k) self._fp.write(k)
self._fp.write(v) self._fp.write(v)
h = cdbhash(k) h = cdbhash(k)
b = self._bucket[h % 256] b = self._bucket[h % 256]
b.append(h) b.append(h)
b.append(self._pos) b.append(self._pos)
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data) # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._pos += 8+klen+vlen self._pos += 8+klen+vlen
self.numentries += 1 self.numentries += 1
return self return self
def finish(self):
self._fp.seek(self._pos)
pos_hash = self._pos
# write hashes
for b1 in self._bucket:
if not b1: continue
blen = len(b1)
a = array('I', [0]*blen*2)
for j in xrange(0, blen, 2):
(h,p) = (b1[j],b1[j+1])
i = ((h >> 8) % blen)*2
while a[i+1]: # is cell[i] already occupied?
i = (i+2) % len(a)
a[i] = h
a[i+1] = p
self._fp.write(encode(a))
# write header
self._fp.seek(0)
a = array('I')
for b1 in self._bucket:
a.append(pos_hash)
a.append(len(b1))
pos_hash += len(b1)*8
self._fp.write(encode(a))
# close
self._fp.close()
os.rename(self.fntmp, self.fn)
return
# txt2cdb def finish(self):
def txt2cdb(self, lines): self._fp.seek(self._pos)
import re pos_hash = self._pos
HEAD = re.compile(r'^\+(\d+),(\d+):') # write hashes
for line in lines: for b1 in self._bucket:
m = HEAD.match(line) if not b1: continue
if not m: break blen = len(b1)
(klen, vlen) = (int(m.group(1)), int(m.group(2))) a = array('I', [0]*blen*2)
i = len(m.group(0)) for j in xrange(0, blen, 2):
k = line[i:i+klen] (h,p) = (b1[j],b1[j+1])
i += klen i = ((h >> 8) % blen)*2
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) while a[i+1]: # is cell[i] already occupied?
i += 2 i = (i+2) % len(a)
v = line[i:i+vlen] a[i] = h
self.add(k, v) a[i+1] = p
return self self._fp.write(encode(a))
# write header
self._fp.seek(0)
a = array('I')
for b1 in self._bucket:
a.append(pos_hash)
a.append(len(b1))
pos_hash += len(b1)*8
self._fp.write(encode(a))
# close
self._fp.close()
os.rename(self.fntmp, self.fn)
return
# txt2cdb
def txt2cdb(self, lines):
import re
HEAD = re.compile(r'^\+(\d+),(\d+):')
for line in lines:
m = HEAD.match(line)
if not m: break
(klen, vlen) = (int(m.group(1)), int(m.group(2)))
i = len(m.group(0))
k = line[i:i+klen]
i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2
v = line[i:i+vlen]
self.add(k, v)
return self
# cdbdump # cdbdump
def cdbdump(cdbname): def cdbdump(cdbname):
fp = file(cdbname, 'rb') fp = file(cdbname, 'rb')
(eor,) = unpack('<I', fp.read(4)) (eor,) = unpack('<I', fp.read(4))
return cdbiter(fp, eor) return cdbiter(fp, eor)
# cdbmerge # cdbmerge
def cdbmerge(iters): def cdbmerge(iters):
q = [] q = []
for it in iters: for it in iters:
try: try:
q.append((it.next(),it)) q.append((it.next(),it))
except StopIteration: except StopIteration:
pass pass
k0 = None k0 = None
vs = None vs = None
while q: while q:
q.sort() q.sort()
((k,v),it) = q.pop(0) ((k,v),it) = q.pop(0)
if k0 != k: if k0 != k:
if vs: yield (k0,vs) if vs: yield (k0,vs)
vs = [] vs = []
vs.append(v) vs.append(v)
k0 = k k0 = k
try: try:
q.append((it.next(),it)) q.append((it.next(),it))
except StopIteration: except StopIteration:
continue continue
if vs: yield (k0,vs) if vs: yield (k0,vs)
return return
# aliases # aliases
@ -278,132 +278,132 @@ init = CDBReader
# tcdbiter # tcdbiter
def tcdbiter(fp, eor): def tcdbiter(fp, eor):
locs = {} locs = {}
fp.seek(eor) fp.seek(eor)
while 1: while 1:
x = fp.read(8) x = fp.read(8)
if not x: break if not x: break
(h, pos) = unpack('<II', x) (h, pos) = unpack('<II', x)
if pos: locs[pos] = h if pos: locs[pos] = h
pos = 2048 pos = 2048
fp.seek(pos) fp.seek(pos)
key = () key = ()
parents = [0] parents = [0]
while pos < eor: while pos < eor:
(klen, vlen) = unpack('<II', fp.read(8)) (klen, vlen) = unpack('<II', fp.read(8))
k = fp.read(klen) k = fp.read(klen)
v = fp.read(vlen) v = fp.read(vlen)
h = locs[pos] h = locs[pos]
for (i,p) in enumerate(parents): for (i,p) in enumerate(parents):
if cdbhash(k, p+5381L) == h: if cdbhash(k, p+5381L) == h:
parents = parents[:i+1] parents = parents[:i+1]
key = key[:i] key = key[:i]
break break
key += (k,) key += (k,)
yield (key, v) yield (key, v)
parents.append(pos) parents.append(pos)
pos += 8+klen+vlen pos += 8+klen+vlen
fp.close() fp.close()
return return
# TCDBMaker # TCDBMaker
class TCDBMaker(CDBMaker): class TCDBMaker(CDBMaker):
def __init__(self, cdbname, tmpname): def __init__(self, cdbname, tmpname):
CDBMaker.__init__(self, cdbname, tmpname) CDBMaker.__init__(self, cdbname, tmpname)
self._parent = 0 self._parent = 0
self._stack = [self._parent] self._stack = [self._parent]
return return
def put(self, depth, k, v): def put(self, depth, k, v):
if depth == len(self._stack)+1: if depth == len(self._stack)+1:
self._stack.append(self._parent) self._stack.append(self._parent)
elif depth < len(self._stack): elif depth < len(self._stack):
self._stack = self._stack[:depth] self._stack = self._stack[:depth]
elif depth != len(self._stack): elif depth != len(self._stack):
raise ValueError('invalid depth: %d' % depth) raise ValueError('invalid depth: %d' % depth)
# #
(k, v) = (str(k), str(v)) (k, v) = (str(k), str(v))
(klen, vlen) = (len(k), len(v)) (klen, vlen) = (len(k), len(v))
self._parent = self._pos self._parent = self._pos
# sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data) # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
self._fp.seek(self._pos) self._fp.seek(self._pos)
self._fp.write(pack('<II', klen, vlen)) self._fp.write(pack('<II', klen, vlen))
self._fp.write(k) self._fp.write(k)
self._fp.write(v) self._fp.write(v)
self._pos += 4+4+klen+vlen self._pos += 4+4+klen+vlen
h = cdbhash(k, self._stack[-1]+5381L) h = cdbhash(k, self._stack[-1]+5381L)
b = self._bucket[h % 256] b = self._bucket[h % 256]
b.append(h) b.append(h)
b.append(self._parent) b.append(self._parent)
self.numentries += 1 self.numentries += 1
return self return self
def txt2tcdb(self, lines): def txt2tcdb(self, lines):
import re import re
HEAD = re.compile(r'^(\++)(\d+),(\d+):') HEAD = re.compile(r'^(\++)(\d+),(\d+):')
for line in lines: for line in lines:
m = HEAD.match(line) m = HEAD.match(line)
if not m: break if not m: break
(depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3))) (depth, klen, vlen) = (len(m.group(1)), int(m.group(2)), int(m.group(3)))
i = len(m.group(0)) i = len(m.group(0))
k = line[i:i+klen] k = line[i:i+klen]
i += klen i += klen
if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line) if line[i:i+2] != '->': raise ValueError('invalid separator: %r' % line)
i += 2 i += 2
v = line[i:i+vlen] v = line[i:i+vlen]
self.put(depth, k, v) self.put(depth, k, v)
return self return self
# TCDBReader # TCDBReader
class TCDBReader(CDBReader): class TCDBReader(CDBReader):
def lookup(self, seq, parent=0L): def lookup(self, seq, parent=0L):
r = [] r = []
for k in seq: for k in seq:
(v, parent) = self.lookup1(k, parent) (v, parent) = self.lookup1(k, parent)
r.append(v) r.append(v)
return r return r
def lookup1(self, k, parent=0L): def lookup1(self, k, parent=0L):
k = str(k) k = str(k)
if self._docache and (parent,k) in self._cache: if self._docache and (parent,k) in self._cache:
return self._cache[(parent,k)] return self._cache[(parent,k)]
h = cdbhash(k, parent+5381L) h = cdbhash(k, parent+5381L)
self._fp.seek((h % 256) << 3) self._fp.seek((h % 256) << 3)
(pos_bucket, ncells) = unpack('<II', self._fp.read(8)) (pos_bucket, ncells) = unpack('<II', self._fp.read(8))
if ncells == 0: raise KeyError(k) if ncells == 0: raise KeyError(k)
start = (h >> 8) % ncells start = (h >> 8) % ncells
for i in xrange(ncells): for i in xrange(ncells):
self._fp.seek(pos_bucket + ((start+i) % ncells << 3)) self._fp.seek(pos_bucket + ((start+i) % ncells << 3))
(h1, p1) = unpack('<II', self._fp.read(8)) (h1, p1) = unpack('<II', self._fp.read(8))
if p1 == 0: raise KeyError(k) if p1 == 0: raise KeyError(k)
if h1 == h: if h1 == h:
self._fp.seek(p1) self._fp.seek(p1)
(klen, vlen) = unpack('<II', self._fp.read(8)) (klen, vlen) = unpack('<II', self._fp.read(8))
k1 = self._fp.read(klen) k1 = self._fp.read(klen)
if k1 == k: if k1 == k:
v1 = self._fp.read(vlen) v1 = self._fp.read(vlen)
if self._docache: if self._docache:
self._cache[(parent,k)] = (v1,p1) self._cache[(parent,k)] = (v1,p1)
return (v1,p1) return (v1,p1)
raise KeyError(k) raise KeyError(k)
def iterkeys(self): def iterkeys(self):
return ( k for (k,v) in tcdbiter(self._fp, self._eod) ) return ( k for (k,v) in tcdbiter(self._fp, self._eod) )
def itervalues(self): def itervalues(self):
return ( v for (k,v) in tcdbiter(self._fp, self._eod) ) return ( v for (k,v) in tcdbiter(self._fp, self._eod) )
def iteritems(self): def iteritems(self):
return tcdbiter(self._fp, self._eod) return tcdbiter(self._fp, self._eod)
# tcdbdump # tcdbdump
def tcdbdump(cdbname):
    '''Opens the file cdbname and returns tcdbiter() over its records.

    The first four bytes of the file (a little-endian unsigned integer)
    are passed to tcdbiter() as its end offset.
    '''
    stream = open(cdbname, 'rb')
    end = unpack('<I', stream.read(4))[0]
    return tcdbiter(stream, end)
# aliases # aliases
@ -414,64 +414,64 @@ tcdbmerge = cdbmerge
# main # main
def main(argv): def main(argv):
import getopt, fileinput import getopt, fileinput
def usage(): def usage():
print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0] print 'usage: %s {cmake,cget,cdump,cmerge} [options] cdbname [args ...]' % argv[0]
print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0] print 'usage: %s {tmake,tget,tdump,tmerge} [options] tcdbname [args ...]' % argv[0]
return 100 return 100
args = argv[1:] args = argv[1:]
if not args: return usage() if not args: return usage()
cmd = args.pop(0) cmd = args.pop(0)
try: try:
(opts, args) = getopt.getopt(args, 'kv2') (opts, args) = getopt.getopt(args, 'kv2')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
dbname = args.pop(0) dbname = args.pop(0)
# cdb # cdb
if cmd == 'cmake': if cmd == 'cmake':
CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish() CDBMaker(dbname, dbname+'.tmp').txt2cdb(fileinput.input(args)).finish()
elif cmd == 'cget': elif cmd == 'cget':
print repr(CDBReader(dbname).get(args[0])) print repr(CDBReader(dbname).get(args[0]))
elif cmd == 'cdump': elif cmd == 'cdump':
f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v)) f = (lambda k,v: '+%d,%d:%s->%s' % (len(k), len(v), k, v))
for (k, v) in opts: for (k, v) in opts:
if k == '-k': f = (lambda k,_: k) if k == '-k': f = (lambda k,_: k)
elif k == '-v': f = (lambda _,v: v) elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: k+'\t'+v) elif k == '-2': f = (lambda k,v: k+'\t'+v)
for (k,v) in cdbdump(dbname): for (k,v) in cdbdump(dbname):
print f(k,v) print f(k,v)
print print
elif cmd == 'cmerge': elif cmd == 'cmerge':
dbs = [ cdbdump(fname) for fname in args ] dbs = [ cdbdump(fname) for fname in args ]
m = CDBMaker(dbname, dbname+'.tmp') m = CDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs): for (k,vs) in tcdbmerge(dbs):
m.add(k, ' '.join(vs)) m.add(k, ' '.join(vs))
m.finish() m.finish()
# tcdb # tcdb
elif cmd == 'tmake': elif cmd == 'tmake':
TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish() TCDBMaker(dbname, dbname+'.tmp').txt2tcdb(fileinput.input(args)).finish()
elif cmd == 'tget': elif cmd == 'tget':
print repr(TCDBReader(dbname).lookup(args)) print repr(TCDBReader(dbname).lookup(args))
elif cmd == 'tdump': elif cmd == 'tdump':
f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v)) f = (lambda k,v: '%s%d,%d:%s->%s' % ('+'*len(k), len(k[-1]), len(v), k[-1], v))
for (k, v) in opts: for (k, v) in opts:
if k == '-k': f = (lambda k,_: '/'.join(k)) if k == '-k': f = (lambda k,_: '/'.join(k))
elif k == '-v': f = (lambda _,v: v) elif k == '-v': f = (lambda _,v: v)
elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v) elif k == '-2': f = (lambda k,v: '/'.join(k)+'\t'+v)
for (k,v) in tcdbdump(dbname): for (k,v) in tcdbdump(dbname):
print f(k,v) print f(k,v)
print print
elif cmd == 'tmerge': elif cmd == 'tmerge':
dbs = [ tcdbdump(fname) for fname in args ] dbs = [ tcdbdump(fname) for fname in args ]
m = TCDBMaker(dbname, dbname+'.tmp') m = TCDBMaker(dbname, dbname+'.tmp')
for (k,vs) in tcdbmerge(dbs): for (k,vs) in tcdbmerge(dbs):
m.put(len(k), k[-1], ' '.join(vs)) m.put(len(k), k[-1], ' '.join(vs))
m.finish() m.finish()
else: else:
return usage() return usage()
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -691,88 +691,88 @@ rcon = [
] ]
# GETU32(x) reads a 4-byte string as one big-endian unsigned integer and
# PUTU32(x) is the inverse. Which struct format code is used depends on
# the size of a native 'L' on this platform.
if len(pack('L',0)) == 4:
    # 32bit
    def GETU32(x): return unpack('>L', x)[0]
    def PUTU32(x): return pack('>L', x)
else:
    # 64bit
    def GETU32(x): return unpack('>I', x)[0]
    def PUTU32(x): return pack('>I', x)
# Expand the cipher key into the encryption key schedule. # Expand the cipher key into the encryption key schedule.
# #
# @return the number of rounds for the given cipher key size. # @return the number of rounds for the given cipher key size.
def rijndaelSetupEncrypt(key, keybits):
    '''Expands the cipher key into the encryption key schedule.

    keybits must be 128, 192 or 256, otherwise ValueError is raised.
    Returns (rk, nrounds): the list of round-key words and the number of
    rounds (10, 12 or 14).
    '''
    def rot_sub(w):
        # Te4 applied to the bytes of w rotated left by one byte.
        return ((Te4[(w >> 16) & 0xff] & 0xff000000) ^
                (Te4[(w >>  8) & 0xff] & 0x00ff0000) ^
                (Te4[(w      ) & 0xff] & 0x0000ff00) ^
                (Te4[(w >> 24)       ] & 0x000000ff))

    def sub(w):
        # Te4 applied to the bytes of w in place (no rotation).
        return ((Te4[(w >> 24)       ] & 0xff000000) ^
                (Te4[(w >> 16) & 0xff] & 0x00ff0000) ^
                (Te4[(w >>  8) & 0xff] & 0x0000ff00) ^
                (Te4[(w      ) & 0xff] & 0x000000ff))

    rk = [0]*RKLENGTH(keybits)
    for n in (0, 1, 2, 3):
        rk[n] = GETU32(key[4*n:4*n+4])
    if keybits == 128:
        for i in xrange(10):
            p = 4*i
            rk[p+4] = rk[p+0] ^ rot_sub(rk[p+3]) ^ rcon[i]
            rk[p+5] = rk[p+1] ^ rk[p+4]
            rk[p+6] = rk[p+2] ^ rk[p+5]
            rk[p+7] = rk[p+3] ^ rk[p+6]
        return (rk, 10)

    rk[4] = GETU32(key[16:20])
    rk[5] = GETU32(key[20:24])
    if keybits == 192:
        for i in xrange(8):
            p = 6*i
            rk[p+6] = rk[p+0] ^ rot_sub(rk[p+5]) ^ rcon[i]
            rk[p+7] = rk[p+1] ^ rk[p+6]
            rk[p+8] = rk[p+2] ^ rk[p+7]
            rk[p+9] = rk[p+3] ^ rk[p+8]
            # The last group stops after four words.
            if i == 7: break
            rk[p+10] = rk[p+4] ^ rk[p+9]
            rk[p+11] = rk[p+5] ^ rk[p+10]
        return (rk, 12)

    rk[6] = GETU32(key[24:28])
    rk[7] = GETU32(key[28:32])
    if keybits == 256:
        for i in xrange(7):
            p = 8*i
            rk[p+8] = rk[p+0] ^ rot_sub(rk[p+7]) ^ rcon[i]
            rk[p+9] = rk[p+1] ^ rk[p+8]
            rk[p+10] = rk[p+2] ^ rk[p+9]
            rk[p+11] = rk[p+3] ^ rk[p+10]
            # The last group stops after four words.
            if i == 6: break
            rk[p+12] = rk[p+4] ^ sub(rk[p+11])
            rk[p+13] = rk[p+5] ^ rk[p+12]
            rk[p+14] = rk[p+6] ^ rk[p+13]
            rk[p+15] = rk[p+7] ^ rk[p+14]
        return (rk, 14)

    raise ValueError(keybits)
# Expand the cipher key into the decryption key schedule. # Expand the cipher key into the decryption key schedule.
@ -780,291 +780,291 @@ def rijndaelSetupEncrypt(key, keybits):
# @return the number of rounds for the given cipher key size. # @return the number of rounds for the given cipher key size.
def rijndaelSetupDecrypt(key, keybits):
    '''Expands the cipher key into the decryption key schedule.

    Returns (rk, nrounds) like rijndaelSetupEncrypt(), with the round keys
    rearranged and transformed for use by rijndaelDecrypt().
    '''
    # expand the cipher key:
    (rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
    # invert the order of the round keys (four words per round key):
    (lo, hi) = (0, 4*nrounds)
    while lo < hi:
        for n in (0, 1, 2, 3):
            (rk[lo+n], rk[hi+n]) = (rk[hi+n], rk[lo+n])
        lo += 4
        hi -= 4
    # apply the inverse MixColumn transform to all round keys
    # but the first and the last:
    for p in xrange(4, 4*nrounds):
        w = rk[p]
        rk[p] = (
            Td0[Te4[(w >> 24)       ] & 0xff] ^
            Td1[Te4[(w >> 16) & 0xff] & 0xff] ^
            Td2[Te4[(w >>  8) & 0xff] & 0xff] ^
            Td3[Te4[(w      ) & 0xff] & 0xff])
    return (rk, nrounds)
def rijndaelEncrypt(rk, nrounds, plaintext):
    '''Encrypts one 16-byte block.

    rk and nrounds are the values returned by rijndaelSetupEncrypt().
    Returns the 16-byte ciphertext string.
    '''
    assert len(plaintext) == 16

    def full_round(w, base):
        # Output word n combines one byte from each of the words
        # n, n+1, n+2, n+3 (cyclically) with the round key at rk[base+n].
        return [ Te0[(w[n]       >> 24)       ] ^
                 Te1[(w[(n+1)&3] >> 16) & 0xff] ^
                 Te2[(w[(n+2)&3] >>  8) & 0xff] ^
                 Te3[(w[(n+3)&3]      ) & 0xff] ^
                 rk[base+n]
                 for n in (0, 1, 2, 3) ]

    # map byte array block to cipher state
    # and add initial round key:
    s = [ GETU32(plaintext[4*n:4*n+4]) ^ rk[n] for n in (0, 1, 2, 3) ]
    # nrounds - 1 full rounds, two per loop iteration:
    r = nrounds >> 1
    p = 0
    while 1:
        t = full_round(s, p+4)
        p += 8
        r -= 1
        if r == 0: break
        s = full_round(t, p)
    # apply last round and
    # map cipher state to byte array block:
    ciphertext = ''
    for n in (0, 1, 2, 3):
        ciphertext += PUTU32(
            (Te4[(t[n]       >> 24)       ] & 0xff000000) ^
            (Te4[(t[(n+1)&3] >> 16) & 0xff] & 0x00ff0000) ^
            (Te4[(t[(n+2)&3] >>  8) & 0xff] & 0x0000ff00) ^
            (Te4[(t[(n+3)&3]      ) & 0xff] & 0x000000ff) ^
            rk[p+n])
    assert len(ciphertext) == 16
    return ciphertext
def rijndaelDecrypt(rk, nrounds, ciphertext):
    '''Decrypts one 16-byte block.

    rk and nrounds are the values returned by rijndaelSetupDecrypt().
    Returns the 16-byte plaintext string.
    '''
    assert len(ciphertext) == 16

    def full_round(w, base):
        # Output word n combines one byte from each of the words
        # n, n-1, n-2, n-3 (cyclically) with the round key at rk[base+n].
        return [ Td0[(w[n]       >> 24)       ] ^
                 Td1[(w[(n+3)&3] >> 16) & 0xff] ^
                 Td2[(w[(n+2)&3] >>  8) & 0xff] ^
                 Td3[(w[(n+1)&3]      ) & 0xff] ^
                 rk[base+n]
                 for n in (0, 1, 2, 3) ]

    # map byte array block to cipher state
    # and add initial round key:
    s = [ GETU32(ciphertext[4*n:4*n+4]) ^ rk[n] for n in (0, 1, 2, 3) ]
    # nrounds - 1 full rounds, two per loop iteration:
    r = nrounds >> 1
    p = 0
    while 1:
        t = full_round(s, p+4)
        p += 8
        r -= 1
        if r == 0: break
        s = full_round(t, p)
    # apply last round and
    # map cipher state to byte array block:
    plaintext = ''
    for n in (0, 1, 2, 3):
        plaintext += PUTU32(
            (Td4[(t[n]       >> 24)       ] & 0xff000000) ^
            (Td4[(t[(n+3)&3] >> 16) & 0xff] & 0x00ff0000) ^
            (Td4[(t[(n+2)&3] >>  8) & 0xff] & 0x0000ff00) ^
            (Td4[(t[(n+1)&3]      ) & 0xff] & 0x000000ff) ^
            rk[p+n])
    assert len(plaintext) == 16
    return plaintext
# decrypt(key, fin, fout, keybits=256) # decrypt(key, fin, fout, keybits=256)
class RijndaelDecryptor(object):
    '''Holds a decryption key schedule and decrypts 16-byte blocks with it.'''

    def __init__(self, key, keybits=256):
        assert len(key) == KEYLENGTH(keybits)
        schedule = rijndaelSetupDecrypt(key, keybits)
        self.rk = schedule[0]
        self.nrounds = schedule[1]
        # Sanity checks on the schedule just built.
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)
        return

    def decrypt(self, ciphertext):
        '''Returns the plaintext for one 16-byte ciphertext block.'''
        assert len(ciphertext) == 16
        return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
# encrypt(key, fin, fout, keybits=256) # encrypt(key, fin, fout, keybits=256)
class RijndaelEncryptor(object):
    '''Holds an encryption key schedule and encrypts 16-byte blocks with it.'''

    def __init__(self, key, keybits=256):
        assert len(key) == KEYLENGTH(keybits)
        schedule = rijndaelSetupEncrypt(key, keybits)
        self.rk = schedule[0]
        self.nrounds = schedule[1]
        # Sanity checks on the schedule just built.
        assert len(self.rk) == RKLENGTH(keybits)
        assert self.nrounds == NROUNDS(keybits)
        return

    def encrypt(self, plaintext):
        '''Returns the ciphertext for one 16-byte plaintext block.'''
        assert len(plaintext) == 16
        return rijndaelEncrypt(self.rk, self.nrounds, plaintext)
def main(argv):
    '''Self-test: encrypts and decrypts one fixed 128-bit test vector.

    Raises AssertionError if either direction gives the wrong block;
    returns 0 otherwise. argv is not used.
    '''
    (key, plaintext, ciphertext) = [
        h.decode('hex') for h in (
            '00010203050607080A0B0C0D0F101112',
            '506812A45F08C889B97F5980038B8359',
            'D8F532538289EF7D06B506A4FD5BE9C9',
        ) ]
    encrypted = RijndaelEncryptor(key, 128).encrypt(plaintext)
    assert encrypted == ciphertext
    decrypted = RijndaelDecryptor(key, 128).decrypt(ciphertext)
    assert decrypted == plaintext
    return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -7,21 +7,21 @@ from struct import unpack
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Both arguments are 6-element sequences (a,b,c,d,e,f). The result is the
    single matrix that has the same effect as applying m1 first and then m0,
    where applying a matrix maps (x,y) to (a*x+c*y+e, b*x+d*y+f).

    The sequences are unpacked here rather than in the signature, because
    tuple parameters are not accepted by Python 3 (PEP 3113). Callers that
    pass the two matrices positionally are unaffected.
    '''
    (a1,b1,c1,d1,e1,f1) = m1
    (a0,b0,c0,d0,e0,f0) = m0
    return (a0*a1+c0*b1,    b0*a1+d0*b1,
            a0*c1+c0*d1,    b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Translates a matrix by (x,y) measured in the matrix's own coordinates.

    m is a 6-element sequence (a,b,c,d,e,f) and v is a pair (x,y). Only the
    offset part changes: the result is (a, b, c, d, x*a+y*c+e, x*b+y*d+f).

    The arguments are unpacked here rather than in the signature, because
    tuple parameters are not accepted by Python 3 (PEP 3113). Positional
    callers are unaffected.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt(m, v):
    '''Applies a matrix to a point.

    m is a 6-element sequence (a,b,c,d,e,f) and v is a pair (x,y).
    Returns (a*x+c*y+e, b*x+d*y+f).

    The arguments are unpacked here rather than in the signature, because
    tuple parameters are not accepted by Python 3 (PEP 3113). Positional
    callers are unaffected.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    m is a 6-element sequence (a,b,c,d,e,f) and v is a pair (p,q). The
    offset part (e,f) of the matrix is ignored, so the result is
    (a*p+c*q, b*p+d*q).

    The arguments are unpacked here rather than in the signature, because
    tuple parameters are not accepted by Python 3 (PEP 3113). Positional
    callers are unaffected.
    '''
    (a,b,c,d,e,f) = m
    (p,q) = v
    return (a*p+c*q, b*p+d*q)
## Utility functions ## Utility functions
@ -29,62 +29,62 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
# pick # pick
def pick(seq, func, maxobj=None):
    '''Picks the object that has the highest value of func(obj).

    seq is iterated once. When several objects share the highest score the
    first one wins. maxobj is returned unchanged if seq is empty.
    '''
    maxscore = None
    for obj in seq:
        score = func(obj)
        # "is None" rather than "== None": the identity test cannot be
        # affected by a score type that defines its own comparison.
        if maxscore is None or maxscore < score:
            (maxscore,maxobj) = (score,obj)
    return maxobj
# bsearch # bsearch
def bsearch(objs, v0):
    '''Tries to find the closest value to v0.

    objs is a list of (value, obj) pairs; the binary search assumes it is
    sorted by value in ascending order. Returns a half-open index range
    (i0,i1): objs[i0:i1] are all the entries whose value equals v0. When
    no entry matches, i0 == i1 is the position where v0 would be inserted.
    '''
    i0 = 0
    i1 = len(objs)
    while i0 < i1:
        # Floor division keeps the midpoint an integer on every Python version.
        i = (i0+i1)//2
        v = objs[i][0]
        if v0 == v:
            # Found one match: widen the range over all equal neighbours.
            (i0,i1) = (i,i+1)
            while 0 < i0 and objs[i0-1][0] == v0:
                i0 -= 1
            # i1 is exclusive, so it may grow up to len(objs); stopping at
            # len(objs)-1 would drop a matching last element.
            while i1 < len(objs) and objs[i1][0] == v0:
                i1 += 1
            break
        elif v0 < v:
            i1 = i
        else:
            i0 = i+1
    return (i0,i1)
# choplist # choplist
def choplist(n, seq):
    '''Groups every n elements of the list.

    Generator: yields one n-tuple per complete group, in order. Elements
    left over at the end that do not fill a whole group are dropped.
    '''
    group = ()
    for item in seq:
        group += (item,)
        if len(group) == n:
            yield group
            group = ()
    return
# nunpack # nunpack
def nunpack(s, default=0):
    '''Unpacks up to 4 bytes big endian.

    s is a byte string of length 0 to 4. Returns its value read as a
    big-endian unsigned integer, or default when s is empty.

    Raises TypeError if s is longer than 4 bytes.
    '''
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return unpack('>H', s)[0]
    elif l == 3:
        # There is no 3-byte format code: pad to 4 bytes with a leading zero.
        return unpack('>L', '\x00'+s)[0]
    elif l == 4:
        return unpack('>L', s)[0]
    else:
        # This must be raised: returning the exception object would hand it
        # to the caller as if it were the decoded number.
        raise TypeError('invalid length: %d' % l)
# decode_text # decode_text
PDFDocEncoding = ''.join( unichr(x) for x in ( PDFDocEncoding = ''.join( unichr(x) for x in (
@ -122,14 +122,14 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
)) ))
def decode_text(s):
    '''Decodes a PDFDocEncoding string to Unicode.

    A string that begins with the two bytes FE FF is decoded as UTF-16BE
    instead (undecodable input is ignored); anything else is mapped byte by
    byte through the PDFDocEncoding table.
    '''
    if not s.startswith('\xfe\xff'):
        return ''.join([ PDFDocEncoding[ord(c)] for c in s ])
    return unicode(s[2:], 'utf-16be', 'ignore')
# enc # enc
def enc(x, codec='ascii'):
    '''Encodes a string for SGML/XML/HTML

    The characters & > < " are replaced by their named entities, then the
    text is encoded with the given codec; characters the codec cannot
    represent become numeric character references.
    '''
    # "&" must be handled first, so the "&" of the entities that are
    # inserted afterwards is not escaped a second time.
    for (char, entity) in (('&','&amp;'), ('>','&gt;'), ('<','&lt;'), ('"','&quot;')):
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')

View File

@ -3,10 +3,10 @@ from distutils.core import setup
from pdfminer import __version__ from pdfminer import __version__
setup( setup(
name='pdfminer', name='pdfminer',
version=__version__, version=__version__,
description='PDF parser and analyzer', description='PDF parser and analyzer',
long_description='''PDFMiner is a suite of programs that help long_description='''PDFMiner is a suite of programs that help
extracting and analyzing text data of PDF documents. extracting and analyzing text data of PDF documents.
Unlike other PDF-related tools, it allows to obtain Unlike other PDF-related tools, it allows to obtain
the exact location of texts in a page, as well as the exact location of texts in a page, as well as
@ -14,23 +14,23 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''', PDF parser that can be used for other purposes instead of text analysis.''',
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama', author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu', author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html', url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[ packages=[
'pdfminer' 'pdfminer'
], ],
scripts=[ scripts=[
'tools/pdf2txt.py', 'tools/pdf2txt.py',
'tools/dumppdf.py' 'tools/dumppdf.py'
], ],
keywords=['pdf parser', 'pdf converter', 'text mining'], keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'Environment :: Console', 'Environment :: Console',
'Intended Audience :: Developers', 'Intended Audience :: Developers',
'Intended Audience :: Science/Research', 'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
], ],
) )

View File

@ -5,38 +5,38 @@ stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
def main(argv): def main(argv):
fonts = {} fonts = {}
for line in fileinput.input(): for line in fileinput.input():
f = line.strip().split(' ') f = line.strip().split(' ')
if not f: continue if not f: continue
k = f[0] k = f[0]
if k == 'FontName': if k == 'FontName':
fontname = f[1] fontname = f[1]
props = {'FontName': fontname, 'Flags': 0} props = {'FontName': fontname, 'Flags': 0}
chars = {} chars = {}
fonts[fontname] = (props, chars) fonts[fontname] = (props, chars)
elif k == 'C': elif k == 'C':
cid = int(f[1]) cid = int(f[1])
if 0 <= cid and cid <= 255: if 0 <= cid and cid <= 255:
width = int(f[4]) width = int(f[4])
chars[cid] = width chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle', elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
'Ascender', 'Descender'): 'Ascender', 'Descender'):
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k) k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
props[k] = float(f[1]) props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'): elif k in ('FontName', 'FamilyName', 'Weight'):
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k) k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
props[k] = f[1] props[k] = f[1]
elif k == 'IsFixedPitch': elif k == 'IsFixedPitch':
if f[1].lower() == 'true': if f[1].lower() == 'true':
props['Flags'] = 64 props['Flags'] = 64
elif k == 'FontBBox': elif k == 'FontBBox':
props[k] = tuple(map(float, f[1:5])) props[k] = tuple(map(float, f[1:5]))
print '# -*- python -*-' print '# -*- python -*-'
print 'FONT_METRICS = {' print 'FONT_METRICS = {'
for (fontname,(props,chars)) in fonts.iteritems(): for (fontname,(props,chars)) in fonts.iteritems():
print ' %r: %r,' % (fontname, (props,chars)) print ' %r: %r,' % (fontname, (props,chars))
print '}' print '}'
return 0 return 0
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -13,173 +13,173 @@ from pdfminer.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolv
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
def esc(s): def esc(s):
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s) return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
# dumpxml # dumpxml
def dumpxml(out, obj, codec=None): def dumpxml(out, obj, codec=None):
if isinstance(obj, dict): if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj)) out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems(): for (k,v) in obj.iteritems():
out.write('<key>%s</key>\n' % k) out.write('<key>%s</key>\n' % k)
out.write('<value>') out.write('<value>')
dumpxml(out, v) dumpxml(out, v)
out.write('</value>\n') out.write('</value>\n')
out.write('</dict>') out.write('</dict>')
return return
if isinstance(obj, list): if isinstance(obj, list):
out.write('<list size="%d">\n' % len(obj)) out.write('<list size="%d">\n' % len(obj))
for v in obj: for v in obj:
dumpxml(out, v) dumpxml(out, v)
out.write('\n') out.write('\n')
out.write('</list>') out.write('</list>')
return return
if isinstance(obj, str): if isinstance(obj, str):
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj))) out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
return return
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
out.write('<stream>\n<props>\n') out.write('<stream>\n<props>\n')
dumpxml(out, obj.dic) dumpxml(out, obj.dic)
out.write('\n</props>\n') out.write('\n</props>\n')
if codec == 'text': if codec == 'text':
data = obj.get_data() data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data))) out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
out.write('</stream>') out.write('</stream>')
return return
if isinstance(obj, PDFObjRef): if isinstance(obj, PDFObjRef):
out.write('<ref id="%d"/>' % obj.objid) out.write('<ref id="%d"/>' % obj.objid)
return return
if isinstance(obj, PSKeyword): if isinstance(obj, PSKeyword):
out.write('<keyword>%s</keyword>' % obj.name) out.write('<keyword>%s</keyword>' % obj.name)
return return
if isinstance(obj, PSLiteral): if isinstance(obj, PSLiteral):
out.write('<literal>%s</literal>' % obj.name) out.write('<literal>%s</literal>' % obj.name)
return return
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
out.write('<number>%s</number>' % obj) out.write('<number>%s</number>' % obj)
return return
raise TypeError(obj) raise TypeError(obj)
# dumptrailers # dumptrailers
def dumptrailers(out, doc): def dumptrailers(out, doc):
for xref in doc.xrefs: for xref in doc.xrefs:
out.write('<trailer>\n') out.write('<trailer>\n')
dumpxml(out, xref.trailer) dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n') out.write('\n</trailer>\n\n')
return return
# dumpallobjs # dumpallobjs
def dumpallobjs(out, doc, codec=None): def dumpallobjs(out, doc, codec=None):
out.write('<pdf>') out.write('<pdf>')
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xref.objids(): for objid in xref.objids():
try: try:
obj = doc.getobj(objid) obj = doc.getobj(objid)
if obj == None: continue if obj == None: continue
out.write('<object id="%d">\n' % objid) out.write('<object id="%d">\n' % objid)
dumpxml(out, obj, codec=codec) dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except: except:
raise raise
dumptrailers(out, doc) dumptrailers(out, doc)
out.write('</pdf>') out.write('</pdf>')
return return
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='', def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) ) pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines(): for (level,title,dest,a,se) in doc.get_outlines():
pageno = None pageno = None
if dest: if dest:
dest = resolve1( doc.lookup_name('Dests', dest) ) dest = resolve1( doc.lookup_name('Dests', dest) )
if isinstance(dest, dict): if isinstance(dest, dict):
dest = dest['D'] dest = dest['D']
pageno = pages[dest[0].objid] pageno = pages[dest[0].objid]
outfp.write(repr((level,title,dest,pageno))+'\n') outfp.write(repr((level,title,dest,pageno))+'\n')
parser.close() parser.close()
fp.close() fp.close()
return return
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None): dumpall=False, codec=None):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
if objids: if objids:
for objid in objids: for objid in objids:
obj = doc.getobj(objid) obj = doc.getobj(objid)
if isinstance(obj, PDFStream) and codec == 'raw': if isinstance(obj, PDFStream) and codec == 'raw':
outfp.write(obj.get_rawdata()) outfp.write(obj.get_rawdata())
elif isinstance(obj, PDFStream) and codec == 'binary': elif isinstance(obj, PDFStream) and codec == 'binary':
outfp.write(obj.get_data()) outfp.write(obj.get_data())
else: else:
dumpxml(outfp, obj, codec=codec) dumpxml(outfp, obj, codec=codec)
if pagenos: if pagenos:
for (pageno,page) in enumerate(doc.get_pages()): for (pageno,page) in enumerate(doc.get_pages()):
if pageno in pagenos: if pageno in pagenos:
dumpxml(outfp, page.attrs) dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpallobjs(outfp, doc, codec=codec) dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pagenos) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc)
fp.close() fp.close()
if codec not in ('raw','binary'): if codec not in ('raw','binary'):
outfp.write('\n') outfp.write('\n')
return return
# main # main
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0] print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:') (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
debug = 0 debug = 0
objids = [] objids = []
pagenos = set() pagenos = set()
codec = None codec = None
password = '' password = ''
dumpall = False dumpall = False
proc = dumppdf proc = dumppdf
outfp = sys.stdout outfp = sys.stdout
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-a': dumpall = True elif k == '-a': dumpall = True
elif k == '-r': codec = 'raw' elif k == '-r': codec = 'raw'
elif k == '-b': codec = 'binary' elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text' elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline elif k == '-T': proc = dumpoutline
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
# #
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
# #
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec) dumpall=dumpall, codec=codec)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -12,7 +12,7 @@
# $ mkdir CGIDIR # $ mkdir CGIDIR
# $ mkdir CGIDIR/var # $ mkdir CGIDIR/var
# $ cp -a pdfminer/pdflib CGIDIR # $ cp -a pdfminer/pdflib CGIDIR
# $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi # $ PYTHONPATH=CGIDIR pdfminer/tools/pdf2html.cgi
# #
import sys import sys
@ -27,16 +27,16 @@ from pdfminer.cmap import CMapDB
# quote HTML metacharacters # quote HTML metacharacters
def q(x): def q(x):
return x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') return x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
# encode parameters as a URL # encode parameters as a URL
Q = re.compile(r'[^a-zA-Z0-9_.-=]') Q = re.compile(r'[^a-zA-Z0-9_.-=]')
def url(base, **kw): def url(base, **kw):
r = [] r = []
for (k,v) in kw.iteritems(): for (k,v) in kw.iteritems():
v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0]) v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
r.append('%s=%s' % (k, v)) r.append('%s=%s' % (k, v))
return base+'&'.join(r) return base+'&'.join(r)
## convert ## convert
@ -44,156 +44,156 @@ def url(base, **kw):
class FileSizeExceeded(ValueError): pass class FileSizeExceeded(ValueError): pass
def convert(outfp, infp, path, codec='utf-8', maxpages=10, def convert(outfp, infp, path, codec='utf-8', maxpages=10,
maxfilesize=5000000, pagenos=None, html=True): maxfilesize=5000000, pagenos=None, html=True):
# save the input file. # save the input file.
src = file(path, 'wb') src = file(path, 'wb')
nbytes = 0 nbytes = 0
while 1: while 1:
data = infp.read(4096) data = infp.read(4096)
nbytes += len(data) nbytes += len(data)
if maxfilesize and maxfilesize < nbytes: if maxfilesize and maxfilesize < nbytes:
raise FileSizeExceeded(maxfilesize) raise FileSizeExceeded(maxfilesize)
if not data: break if not data: break
src.write(data) src.write(data)
src.close() src.close()
infp.close() infp.close()
# perform conversion and # perform conversion and
# send the results over the network. # send the results over the network.
CMapDB.initialize() CMapDB.initialize()
rsrc = PDFResourceManager() rsrc = PDFResourceManager()
laparams = LAParams() laparams = LAParams()
if html: if html:
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams) device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
else: else:
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
fp = file(path, 'rb') fp = file(path, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages) process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
fp.close() fp.close()
return return
## PDF2HTMLApp ## PDF2HTMLApp
## ##
class PDF2HTMLApp(object): class PDF2HTMLApp(object):
APPURL = '/convert' APPURL = '/convert'
TMPDIR = './var/' TMPDIR = './var/'
LOGPATH = './var/log' LOGPATH = './var/log'
MAXFILESIZE = 5000000 MAXFILESIZE = 5000000
MAXPAGES = 10 MAXPAGES = 10
def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
self.outfp = outfp
self.codec = codec
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
level=loglevel, filename=logpath, filemode='a')
self.remote_addr = os.environ.get('REMOTE_ADDR')
self.path_info = os.environ.get('PATH_INFO')
self.method = os.environ.get('REQUEST_METHOD', 'GET')
self.server = os.environ.get('SERVER_SOFTWARE', '')
self.content_type = 'text/html; charset=%s' % codec
self.cur_time = time.time()
self.form = cgi.FieldStorage()
return
def put(self, *args): def __init__(self, outfp, logpath=LOGPATH, loglevel=logging.DEBUG, codec='utf-8'):
for x in args: self.outfp = outfp
if isinstance(x, str): self.codec = codec
self.outfp.write(x) logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
elif isinstance(x, unicode): level=loglevel, filename=logpath, filemode='a')
self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace')) self.remote_addr = os.environ.get('REMOTE_ADDR')
return self.path_info = os.environ.get('PATH_INFO')
self.method = os.environ.get('REQUEST_METHOD', 'GET')
self.server = os.environ.get('SERVER_SOFTWARE', '')
self.content_type = 'text/html; charset=%s' % codec
self.cur_time = time.time()
self.form = cgi.FieldStorage()
return
def http_200(self): def put(self, *args):
if self.server.startswith('cgi-httpd'): for x in args:
# required for cgi-httpd if isinstance(x, str):
self.outfp.write('HTTP/1.0 200 OK\r\n') self.outfp.write(x)
self.outfp.write('Content-type: %s\r\n' % self.content_type) elif isinstance(x, unicode):
self.outfp.write('Connection: close\r\n\r\n') self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
return return
def http_404(self):
if self.server.startswith('cgi-httpd'):
# required for cgi-httpd
self.outfp.write('HTTP/1.0 404 Not Found\r\n')
self.outfp.write('Content-type: text/html\r\n')
self.outfp.write('Connection: close\r\n\r\n')
self.outfp.write('<html><body>page does not exist</body></body>\n')
return
def http_301(self, url):
if self.server.startswith('cgi-httpd'):
# required for cgi-httpd
self.outfp.write('HTTP/1.0 301 Moved\r\n')
self.outfp.write('Location: %s\r\n\r\n' % url)
return
def coverpage(self): def http_200(self):
self.put( if self.server.startswith('cgi-httpd'):
'<html><head><title>pdf2html demo</title></head><body>\n', # required for cgi-httpd
'<h1>pdf2html demo</h1><hr>\n', self.outfp.write('HTTP/1.0 200 OK\r\n')
'<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL), self.outfp.write('Content-type: %s\r\n' % self.content_type)
'<p>Upload PDF File: <input name="f" type="file" value="">\n', self.outfp.write('Connection: close\r\n\r\n')
'&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n', return
'<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
'<p><input type="submit" name="c" value="Convert to HTML">\n',
'<input type="submit" name="c" value="Convert to TEXT">\n',
'<input type="reset" value="Reset">\n',
'</form><hr>\n',
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
'</body></html>\n',
)
return
def run(self, argv): def http_404(self):
if self.path_info == '/': if self.server.startswith('cgi-httpd'):
self.http_200() # required for cgi-httpd
self.coverpage() self.outfp.write('HTTP/1.0 404 Not Found\r\n')
return self.outfp.write('Content-type: text/html\r\n')
if self.path_info != self.APPURL: self.outfp.write('Connection: close\r\n\r\n')
self.http_404() self.outfp.write('<html><body>page does not exist</body></body>\n')
return return
if not os.path.isdir(self.TMPDIR):
self.bummer('error') def http_301(self, url):
return if self.server.startswith('cgi-httpd'):
if 'f' not in self.form: # required for cgi-httpd
self.http_301('/') self.outfp.write('HTTP/1.0 301 Moved\r\n')
return self.outfp.write('Location: %s\r\n\r\n' % url)
if 'c' not in self.form: return
self.http_301('/')
return def coverpage(self):
item = self.form['f'] self.put(
if not (item.file and item.filename): '<html><head><title>pdf2html demo</title></head><body>\n',
self.http_301('/') '<h1>pdf2html demo</h1><hr>\n',
return '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.APPURL),
cmd = self.form.getvalue('c') '<p>Upload PDF File: <input name="f" type="file" value="">\n',
html = (cmd == 'Convert to HTML') '&nbsp; Page numbers (comma-separated): <input name="p" type="text" size="10" value="">\n',
pagenos = [] '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
if 'p' in self.form: 'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
for m in re.finditer(r'\d+', self.form.getvalue('p')): '<p><input type="submit" name="c" value="Convert to HTML">\n',
'<input type="submit" name="c" value="Convert to TEXT">\n',
'<input type="reset" value="Reset">\n',
'</form><hr>\n',
'<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>\n',
'</body></html>\n',
)
return
def run(self, argv):
if self.path_info == '/':
self.http_200()
self.coverpage()
return
if self.path_info != self.APPURL:
self.http_404()
return
if not os.path.isdir(self.TMPDIR):
self.bummer('error')
return
if 'f' not in self.form:
self.http_301('/')
return
if 'c' not in self.form:
self.http_301('/')
return
item = self.form['f']
if not (item.file and item.filename):
self.http_301('/')
return
cmd = self.form.getvalue('c')
html = (cmd == 'Convert to HTML')
pagenos = []
if 'p' in self.form:
for m in re.finditer(r'\d+', self.form.getvalue('p')):
try:
pagenos.append(int(m.group(0)))
except ValueError:
pass
logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos))
h = abs(hash((random.random(), self.remote_addr, item.filename)))
tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h))
try: try:
pagenos.append(int(m.group(0))) try:
except ValueError: if not html:
pass self.content_type = 'text/plain; charset=%s' % self.codec
logging.info('process: host=%s, name=%r, pagenos=%r' % (self.remote_addr, item.filename, pagenos)) self.http_200()
h = abs(hash((random.random(), self.remote_addr, item.filename))) convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec,
tmppath = os.path.join(self.TMPDIR, '%08x%08x.pdf' % (self.cur_time, h)) maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
try: except Exception, e:
try: self.put('<p>Sorry, an error has occured: %s' % q(repr(e)))
if not html: logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
self.content_type = 'text/plain; charset=%s' % self.codec finally:
self.http_200() try:
convert(sys.stdout, item.file, tmppath, pagenos=pagenos, codec=self.codec, os.remove(tmppath)
maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html) except:
except Exception, e: pass
self.put('<p>Sorry, an error has occured: %s' % q(repr(e))) return
logging.error('error: %r: path=%r: %s' % (e, tmppath, traceback.format_exc()))
finally:
try:
os.remove(tmppath)
except:
pass
return
# main # main

View File

@ -9,85 +9,85 @@ from pdfminer.layout import LAParams
# main # main
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0]) '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
# debug option # debug option
debug = 0 debug = 0
# path option # path option
cmapdir = find_cmap_path() cmapdir = find_cmap_path()
# input option # input option
password = '' password = ''
pagenos = set() pagenos = set()
maxpages = 0 maxpages = 0
# output option # output option
outfile = None outfile = None
outtype = None outtype = None
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
scale = 1 scale = 1
showpageno = True showpageno = True
laparams = LAParams() laparams = LAParams()
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-D': laparams.direction = v elif k == '-D': laparams.direction = v
elif k == '-M': laparams.char_margin = float(v) elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v) elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v) elif k == '-W': laparams.word_margin = float(v)
# #
CMapDB.debug = debug CMapDB.debug = debug
PDFResourceManager.debug = debug PDFResourceManager.debug = debug
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
CMapDB.initialize(cmapdir) CMapDB.initialize(cmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager()
if not outtype: if not outtype:
outtype = 'text' outtype = 'text'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.sgml'):
outtype = 'sgml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile: if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'): outfp = file(outfile, 'w')
outtype = 'html' else:
elif outfile.endswith('.sgml'): outfp = sys.stdout
outtype = 'sgml' if outtype == 'text':
elif outfile.endswith('.tag'): device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
outtype = 'tag' elif outtype == 'sgml':
if outfile: device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
outfp = file(outfile, 'w') elif outtype == 'html':
else: device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
outfp = sys.stdout elif outtype == 'tag':
if outtype == 'text': device = TagExtractor(rsrc, outfp, codec=codec)
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) else:
elif outtype == 'sgml': return usage()
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams) for fname in args:
elif outtype == 'html': fp = file(fname, 'rb')
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams) process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
elif outtype == 'tag': fp.close()
device = TagExtractor(rsrc, outfp, codec=codec) device.close()
else: return
return usage()
for fname in args:
fp = file(fname, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
fp.close()
device.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -2,29 +2,29 @@
import sys import sys
def prof_main(argv): def prof_main(argv):
import getopt import getopt
import hotshot, hotshot.stats import hotshot, hotshot.stats
def usage(): def usage():
print 'usage: %s module.function [args ...]' % argv[0] print 'usage: %s module.function [args ...]' % argv[0]
return 100 return 100
args = argv[1:] args = argv[1:]
if len(args) < 1: return usage() if len(args) < 1: return usage()
name = args.pop(0) name = args.pop(0)
prof = name+'.prof' prof = name+'.prof'
i = name.rindex('.') i = name.rindex('.')
(modname, funcname) = (name[:i], name[i+1:]) (modname, funcname) = (name[:i], name[i+1:])
module = __import__(modname, fromlist=1) module = __import__(modname, fromlist=1)
func = getattr(module, funcname) func = getattr(module, funcname)
if args: if args:
args.insert(0, argv[0]) args.insert(0, argv[0])
prof = hotshot.Profile(prof) prof = hotshot.Profile(prof)
prof.runcall(lambda : func(args)) prof.runcall(lambda : func(args))
prof.close() prof.close()
else: else:
stats = hotshot.stats.load(prof) stats = hotshot.stats.load(prof)
stats.strip_dirs() stats.strip_dirs()
stats.sort_stats('time', 'calls') stats.sort_stats('time', 'calls')
stats.print_stats(1000) stats.print_stats(1000)
return return
if __name__ == '__main__': sys.exit(prof_main(sys.argv)) if __name__ == '__main__': sys.exit(prof_main(sys.argv))