PEP8: Whitespace changes to match pep8
parent
c1da8b835c
commit
2caa5edc25
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__version__ = '20131022'
|
__version__ = '20131022'
|
||||||
|
|
||||||
if __name__ == '__main__': print __version__
|
if __name__ == '__main__':
|
||||||
|
print __version__
|
||||||
|
|
|
@ -6,6 +6,7 @@ This code is in the public domain.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
## Arcfour
|
## Arcfour
|
||||||
##
|
##
|
||||||
class Arcfour(object):
|
class Arcfour(object):
|
||||||
|
|
|
@ -9,6 +9,7 @@ This code is in the public domain.
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
|
|
||||||
|
|
||||||
# ascii85decode(data)
|
# ascii85decode(data)
|
||||||
def ascii85decode(data):
|
def ascii85decode(data):
|
||||||
"""
|
"""
|
||||||
|
@ -35,7 +36,7 @@ def ascii85decode(data):
|
||||||
n += 1
|
n += 1
|
||||||
b = b*85+(ord(c)-33)
|
b = b*85+(ord(c)-33)
|
||||||
if n == 5:
|
if n == 5:
|
||||||
out += struct.pack('>L',b)
|
out += struct.pack('>L', b)
|
||||||
n = b = 0
|
n = b = 0
|
||||||
elif c == 'z':
|
elif c == 'z':
|
||||||
assert n == 0
|
assert n == 0
|
||||||
|
@ -44,13 +45,15 @@ def ascii85decode(data):
|
||||||
if n:
|
if n:
|
||||||
for _ in range(5-n):
|
for _ in range(5-n):
|
||||||
b = b*85+84
|
b = b*85+84
|
||||||
out += struct.pack('>L',b)[:n-1]
|
out += struct.pack('>L', b)[:n-1]
|
||||||
break
|
break
|
||||||
return out
|
return out
|
||||||
|
|
||||||
# asciihexdecode(data)
|
# asciihexdecode(data)
|
||||||
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
||||||
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def asciihexdecode(data):
|
def asciihexdecode(data):
|
||||||
"""
|
"""
|
||||||
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
||||||
|
|
|
@ -25,10 +25,11 @@ class BitParser(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def add(klass, root, v, bits):
|
def add(klass, root, v, bits):
|
||||||
p = root
|
p = root
|
||||||
|
b = None
|
||||||
for i in xrange(len(bits)):
|
for i in xrange(len(bits)):
|
||||||
if 0 < i:
|
if 0 < i:
|
||||||
if p[b] is None:
|
if p[b] is None:
|
||||||
p[b] = [None,None]
|
p[b] = [None, None]
|
||||||
p = p[b]
|
p = p[b]
|
||||||
if bits[i] == '1':
|
if bits[i] == '1':
|
||||||
b = 1
|
b = 1
|
||||||
|
@ -40,7 +41,7 @@ class BitParser(object):
|
||||||
def feedbytes(self, data):
|
def feedbytes(self, data):
|
||||||
for c in data:
|
for c in data:
|
||||||
b = ord(c)
|
b = ord(c)
|
||||||
for m in (128,64,32,16,8,4,2,1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
self._parse_bit(b & m)
|
self._parse_bit(b & m)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -61,7 +62,7 @@ class BitParser(object):
|
||||||
##
|
##
|
||||||
class CCITTG4Parser(BitParser):
|
class CCITTG4Parser(BitParser):
|
||||||
|
|
||||||
MODE = [None,None]
|
MODE = [None, None]
|
||||||
BitParser.add(MODE, 0, '1')
|
BitParser.add(MODE, 0, '1')
|
||||||
BitParser.add(MODE, +1, '011')
|
BitParser.add(MODE, +1, '011')
|
||||||
BitParser.add(MODE, -1, '010')
|
BitParser.add(MODE, -1, '010')
|
||||||
|
@ -81,7 +82,7 @@ class CCITTG4Parser(BitParser):
|
||||||
BitParser.add(MODE, 'x7', '0000001110')
|
BitParser.add(MODE, 'x7', '0000001110')
|
||||||
BitParser.add(MODE, 'e', '000000000001000000000001')
|
BitParser.add(MODE, 'e', '000000000001000000000001')
|
||||||
|
|
||||||
WHITE = [None,None]
|
WHITE = [None, None]
|
||||||
BitParser.add(WHITE, 0 , '00110101')
|
BitParser.add(WHITE, 0 , '00110101')
|
||||||
BitParser.add(WHITE, 1 , '000111')
|
BitParser.add(WHITE, 1 , '000111')
|
||||||
BitParser.add(WHITE, 2 , '0111')
|
BitParser.add(WHITE, 2 , '0111')
|
||||||
|
@ -187,7 +188,7 @@ class CCITTG4Parser(BitParser):
|
||||||
BitParser.add(WHITE, 2496, '000000011110')
|
BitParser.add(WHITE, 2496, '000000011110')
|
||||||
BitParser.add(WHITE, 2560, '000000011111')
|
BitParser.add(WHITE, 2560, '000000011111')
|
||||||
|
|
||||||
BLACK = [None,None]
|
BLACK = [None, None]
|
||||||
BitParser.add(BLACK, 0 , '0000110111')
|
BitParser.add(BLACK, 0 , '0000110111')
|
||||||
BitParser.add(BLACK, 1 , '010')
|
BitParser.add(BLACK, 1 , '010')
|
||||||
BitParser.add(BLACK, 2 , '11')
|
BitParser.add(BLACK, 2 , '11')
|
||||||
|
@ -293,25 +294,30 @@ class CCITTG4Parser(BitParser):
|
||||||
BitParser.add(BLACK, 2496, '000000011110')
|
BitParser.add(BLACK, 2496, '000000011110')
|
||||||
BitParser.add(BLACK, 2560, '000000011111')
|
BitParser.add(BLACK, 2560, '000000011111')
|
||||||
|
|
||||||
UNCOMPRESSED = [None,None]
|
UNCOMPRESSED = [None, None]
|
||||||
BitParser.add(UNCOMPRESSED, '1' , '1')
|
BitParser.add(UNCOMPRESSED, '1', '1')
|
||||||
BitParser.add(UNCOMPRESSED, '01' , '01')
|
BitParser.add(UNCOMPRESSED, '01', '01')
|
||||||
BitParser.add(UNCOMPRESSED, '001' , '001')
|
BitParser.add(UNCOMPRESSED, '001', '001')
|
||||||
BitParser.add(UNCOMPRESSED, '0001' , '0001')
|
BitParser.add(UNCOMPRESSED, '0001', '0001')
|
||||||
BitParser.add(UNCOMPRESSED, '00001' , '00001')
|
BitParser.add(UNCOMPRESSED, '00001', '00001')
|
||||||
BitParser.add(UNCOMPRESSED, '00000' , '000001')
|
BitParser.add(UNCOMPRESSED, '00000', '000001')
|
||||||
BitParser.add(UNCOMPRESSED, 'T00' , '00000011')
|
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
|
||||||
BitParser.add(UNCOMPRESSED, 'T10' , '00000010')
|
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
|
||||||
BitParser.add(UNCOMPRESSED, 'T000' , '000000011')
|
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
|
||||||
BitParser.add(UNCOMPRESSED, 'T100' , '000000010')
|
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
|
||||||
BitParser.add(UNCOMPRESSED, 'T0000' , '0000000011')
|
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
|
||||||
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010')
|
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
|
||||||
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011')
|
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
|
||||||
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010')
|
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
|
||||||
|
|
||||||
class EOFB(Exception): pass
|
class EOFB(Exception):
|
||||||
class InvalidData(Exception): pass
|
pass
|
||||||
class ByteSkip(Exception): pass
|
|
||||||
|
class InvalidData(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class ByteSkip(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
def __init__(self, width, bytealign=False):
|
def __init__(self, width, bytealign=False):
|
||||||
BitParser.__init__(self)
|
BitParser.__init__(self)
|
||||||
|
@ -324,7 +330,7 @@ class CCITTG4Parser(BitParser):
|
||||||
for c in data:
|
for c in data:
|
||||||
b = ord(c)
|
b = ord(c)
|
||||||
try:
|
try:
|
||||||
for m in (128,64,32,16,8,4,2,1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
self._parse_bit(b & m)
|
self._parse_bit(b & m)
|
||||||
except self.ByteSkip:
|
except self.ByteSkip:
|
||||||
self._accept = self._parse_mode
|
self._accept = self._parse_mode
|
||||||
|
@ -358,7 +364,8 @@ class CCITTG4Parser(BitParser):
|
||||||
raise self.InvalidData(mode)
|
raise self.InvalidData(mode)
|
||||||
|
|
||||||
def _parse_horiz1(self, n):
|
def _parse_horiz1(self, n):
|
||||||
if n is None: raise self.InvalidData
|
if n is None:
|
||||||
|
raise self.InvalidData
|
||||||
self._n1 += n
|
self._n1 += n
|
||||||
if n < 64:
|
if n < 64:
|
||||||
self._n2 = 0
|
self._n2 = 0
|
||||||
|
@ -370,7 +377,8 @@ class CCITTG4Parser(BitParser):
|
||||||
return self.BLACK
|
return self.BLACK
|
||||||
|
|
||||||
def _parse_horiz2(self, n):
|
def _parse_horiz2(self, n):
|
||||||
if n is None: raise self.InvalidData
|
if n is None:
|
||||||
|
raise self.InvalidData
|
||||||
self._n2 += n
|
self._n2 += n
|
||||||
if n < 64:
|
if n < 64:
|
||||||
self._color = 1-self._color
|
self._color = 1-self._color
|
||||||
|
@ -384,7 +392,8 @@ class CCITTG4Parser(BitParser):
|
||||||
return self.BLACK
|
return self.BLACK
|
||||||
|
|
||||||
def _parse_uncompressed(self, bits):
|
def _parse_uncompressed(self, bits):
|
||||||
if not bits: raise self.InvalidData
|
if not bits:
|
||||||
|
raise self.InvalidData
|
||||||
if bits.startswith('T'):
|
if bits.startswith('T'):
|
||||||
self._accept = self._parse_mode
|
self._accept = self._parse_mode
|
||||||
self._color = int(bits[1])
|
self._color = int(bits[1])
|
||||||
|
@ -395,17 +404,17 @@ class CCITTG4Parser(BitParser):
|
||||||
return self.UNCOMPRESSED
|
return self.UNCOMPRESSED
|
||||||
|
|
||||||
def _get_bits(self):
|
def _get_bits(self):
|
||||||
return ''.join( str(b) for b in self._curline[:self._curpos] )
|
return ''.join(str(b) for b in self._curline[:self._curpos])
|
||||||
|
|
||||||
def _get_refline(self, i):
|
def _get_refline(self, i):
|
||||||
if i < 0:
|
if i < 0:
|
||||||
return '[]'+''.join( str(b) for b in self._refline )
|
return '[]'+''.join(str(b) for b in self._refline)
|
||||||
elif len(self._refline) <= i:
|
elif len(self._refline) <= i:
|
||||||
return ''.join( str(b) for b in self._refline )+'[]'
|
return ''.join(str(b) for b in self._refline)+'[]'
|
||||||
else:
|
else:
|
||||||
return (''.join( str(b) for b in self._refline[:i] )+
|
return (''.join(str(b) for b in self._refline[:i]) +
|
||||||
'['+str(self._refline[i])+']'+
|
'['+str(self._refline[i])+']' +
|
||||||
''.join( str(b) for b in self._refline[i+1:] ))
|
''.join(str(b) for b in self._refline[i+1:]))
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self._y = 0
|
self._y = 0
|
||||||
|
@ -416,7 +425,7 @@ class CCITTG4Parser(BitParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y, bits):
|
||||||
print y, ''.join( str(b) for b in bits )
|
print y, ''.join(str(b) for b in bits)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _reset_line(self):
|
def _reset_line(self):
|
||||||
|
@ -441,12 +450,13 @@ class CCITTG4Parser(BitParser):
|
||||||
x1 = self._curpos+1
|
x1 = self._curpos+1
|
||||||
while 1:
|
while 1:
|
||||||
if x1 == 0:
|
if x1 == 0:
|
||||||
if (self._color == 1 and
|
if (self._color == 1 and self._refline[x1] != self._color):
|
||||||
self._refline[x1] != self._color): break
|
break
|
||||||
elif x1 == len(self._refline):
|
elif x1 == len(self._refline):
|
||||||
break
|
break
|
||||||
elif (self._refline[x1-1] == self._color and
|
elif (self._refline[x1-1] == self._color and
|
||||||
self._refline[x1] != self._color): break
|
self._refline[x1] != self._color):
|
||||||
|
break
|
||||||
x1 += 1
|
x1 += 1
|
||||||
x1 += dx
|
x1 += dx
|
||||||
x0 = max(0, self._curpos)
|
x0 = max(0, self._curpos)
|
||||||
|
@ -467,21 +477,23 @@ class CCITTG4Parser(BitParser):
|
||||||
x1 = self._curpos+1
|
x1 = self._curpos+1
|
||||||
while 1:
|
while 1:
|
||||||
if x1 == 0:
|
if x1 == 0:
|
||||||
if (self._color == 1 and
|
if (self._color == 1 and self._refline[x1] != self._color):
|
||||||
self._refline[x1] != self._color): break
|
break
|
||||||
elif x1 == len(self._refline):
|
elif x1 == len(self._refline):
|
||||||
break
|
break
|
||||||
elif (self._refline[x1-1] == self._color and
|
elif (self._refline[x1-1] == self._color and
|
||||||
self._refline[x1] != self._color): break
|
self._refline[x1] != self._color):
|
||||||
|
break
|
||||||
x1 += 1
|
x1 += 1
|
||||||
while 1:
|
while 1:
|
||||||
if x1 == 0:
|
if x1 == 0:
|
||||||
if (self._color == 0 and
|
if (self._color == 0 and self._refline[x1] == self._color):
|
||||||
self._refline[x1] == self._color): break
|
break
|
||||||
elif x1 == len(self._refline):
|
elif x1 == len(self._refline):
|
||||||
break
|
break
|
||||||
elif (self._refline[x1-1] != self._color and
|
elif (self._refline[x1-1] != self._color and
|
||||||
self._refline[x1] == self._color): break
|
self._refline[x1] == self._color):
|
||||||
|
break
|
||||||
x1 += 1
|
x1 += 1
|
||||||
for x in xrange(self._curpos, x1):
|
for x in xrange(self._curpos, x1):
|
||||||
self._curline[x] = self._color
|
self._curline[x] = self._color
|
||||||
|
@ -494,11 +506,13 @@ class CCITTG4Parser(BitParser):
|
||||||
self._curpos = 0
|
self._curpos = 0
|
||||||
x = self._curpos
|
x = self._curpos
|
||||||
for _ in xrange(n1):
|
for _ in xrange(n1):
|
||||||
if len(self._curline) <= x: break
|
if len(self._curline) <= x:
|
||||||
|
break
|
||||||
self._curline[x] = self._color
|
self._curline[x] = self._color
|
||||||
x += 1
|
x += 1
|
||||||
for _ in xrange(n2):
|
for _ in xrange(n2):
|
||||||
if len(self._curline) <= x: break
|
if len(self._curline) <= x:
|
||||||
|
break
|
||||||
self._curline[x] = 1-self._color
|
self._curline[x] = 1-self._color
|
||||||
x += 1
|
x += 1
|
||||||
self._curpos = x
|
self._curpos = x
|
||||||
|
@ -512,15 +526,16 @@ class CCITTG4Parser(BitParser):
|
||||||
self._flush_line()
|
self._flush_line()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
## Test cases
|
## Test cases
|
||||||
##
|
##
|
||||||
import unittest
|
|
||||||
class TestCCITTG4Parser(unittest.TestCase):
|
class TestCCITTG4Parser(unittest.TestCase):
|
||||||
|
|
||||||
def get_parser(self, bits):
|
def get_parser(self, bits):
|
||||||
parser = CCITTG4Parser(len(bits))
|
parser = CCITTG4Parser(len(bits))
|
||||||
parser._curline = [ int(c) for c in bits ]
|
parser._curline = [int(c) for c in bits]
|
||||||
parser._reset_line()
|
parser._reset_line()
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
@ -655,7 +670,7 @@ class TestCCITTG4Parser(unittest.TestCase):
|
||||||
parser._do_vertical(-1)
|
parser._do_vertical(-1)
|
||||||
parser._do_vertical(-1)
|
parser._do_vertical(-1)
|
||||||
parser._do_vertical(1)
|
parser._do_vertical(1)
|
||||||
parser._do_horizontal(1,1)
|
parser._do_horizontal(1, 1)
|
||||||
self.assertEqual(parser._get_bits(), '011101')
|
self.assertEqual(parser._get_bits(), '011101')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -685,10 +700,10 @@ class CCITTFaxDecoder(CCITTG4Parser):
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y, bits):
|
||||||
bytes = array.array('B', [0]*((len(bits)+7)/8))
|
bytes = array.array('B', [0]*((len(bits)+7)/8))
|
||||||
if self.reversed:
|
if self.reversed:
|
||||||
bits = [ 1-b for b in bits ]
|
bits = [1-b for b in bits]
|
||||||
for (i,b) in enumerate(bits):
|
for (i, b) in enumerate(bits):
|
||||||
if b:
|
if b:
|
||||||
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8]
|
bytes[i/8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
|
||||||
self._buf += bytes.tostring()
|
self._buf += bytes.tostring()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -711,28 +726,32 @@ def main(argv):
|
||||||
import pygame
|
import pygame
|
||||||
if not argv[1:]:
|
if not argv[1:]:
|
||||||
return unittest.main()
|
return unittest.main()
|
||||||
|
|
||||||
class Parser(CCITTG4Parser):
|
class Parser(CCITTG4Parser):
|
||||||
def __init__(self, width, bytealign=False):
|
def __init__(self, width, bytealign=False):
|
||||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||||
self.img = pygame.Surface((self.width,1000))
|
self.img = pygame.Surface((self.width, 1000))
|
||||||
return
|
return
|
||||||
|
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y, bits):
|
||||||
for (x,b) in enumerate(bits):
|
for (x, b) in enumerate(bits):
|
||||||
if b:
|
if b:
|
||||||
self.img.set_at((x,y), (255,255,255))
|
self.img.set_at((x, y), (255, 255, 255))
|
||||||
else:
|
else:
|
||||||
self.img.set_at((x,y), (0,0,0))
|
self.img.set_at((x, y), (0, 0, 0))
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
pygame.image.save(self.img, 'out.bmp')
|
pygame.image.save(self.img, 'out.bmp')
|
||||||
return
|
return
|
||||||
for path in argv[1:]:
|
for path in argv[1:]:
|
||||||
fp = file(path,'rb')
|
fp = file(path, 'rb')
|
||||||
(_,_,k,w,h,_) = path.split('.')
|
(_, _, k, w, h, _) = path.split('.')
|
||||||
parser = Parser(int(w))
|
parser = Parser(int(w))
|
||||||
parser.feedbytes(fp.read())
|
parser.feedbytes(fp.read())
|
||||||
parser.close()
|
parser.close()
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -26,7 +26,8 @@ from encodingdb import name2unicode
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
|
|
||||||
|
|
||||||
class CMapError(Exception): pass
|
class CMapError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## CMap
|
## CMap
|
||||||
|
@ -44,8 +45,9 @@ class CMap(object):
|
||||||
|
|
||||||
def use_cmap(self, cmap):
|
def use_cmap(self, cmap):
|
||||||
assert isinstance(cmap, CMap)
|
assert isinstance(cmap, CMap)
|
||||||
|
|
||||||
def copy(dst, src):
|
def copy(dst, src):
|
||||||
for (k,v) in src.iteritems():
|
for (k, v) in src.iteritems():
|
||||||
if isinstance(v, dict):
|
if isinstance(v, dict):
|
||||||
d = {}
|
d = {}
|
||||||
dst[k] = d
|
dst[k] = d
|
||||||
|
@ -74,10 +76,10 @@ class CMap(object):
|
||||||
if code2cid is None:
|
if code2cid is None:
|
||||||
code2cid = self.code2cid
|
code2cid = self.code2cid
|
||||||
code = ()
|
code = ()
|
||||||
for (k,v) in sorted(code2cid.iteritems()):
|
for (k, v) in sorted(code2cid.iteritems()):
|
||||||
c = code+(k,)
|
c = code+(k,)
|
||||||
if isinstance(v, int):
|
if isinstance(v, int):
|
||||||
out.write('code %r = cid %d\n' % (c,v))
|
out.write('code %r = cid %d\n' % (c, v))
|
||||||
else:
|
else:
|
||||||
self.dump(out=out, code2cid=v, code=c)
|
self.dump(out=out, code2cid=v, code=c)
|
||||||
return
|
return
|
||||||
|
@ -102,7 +104,6 @@ class IdentityCMap(object):
|
||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## UnicodeMap
|
## UnicodeMap
|
||||||
##
|
##
|
||||||
class UnicodeMap(object):
|
class UnicodeMap(object):
|
||||||
|
@ -119,8 +120,8 @@ class UnicodeMap(object):
|
||||||
return self.cid2unichr[cid]
|
return self.cid2unichr[cid]
|
||||||
|
|
||||||
def dump(self, out=sys.stdout):
|
def dump(self, out=sys.stdout):
|
||||||
for (k,v) in sorted(self.cid2unichr.iteritems()):
|
for (k, v) in sorted(self.cid2unichr.iteritems()):
|
||||||
out.write('cid %d = unicode %r\n' % (k,v))
|
out.write('cid %d = unicode %r\n' % (k, v))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -153,7 +154,7 @@ class FileCMap(CMap):
|
||||||
else:
|
else:
|
||||||
t = {}
|
t = {}
|
||||||
d[c] = t
|
d[c] = t
|
||||||
d =t
|
d = t
|
||||||
c = ord(code[-1])
|
c = ord(code[-1])
|
||||||
d[c] = cid
|
d[c] = cid
|
||||||
return
|
return
|
||||||
|
@ -232,17 +233,16 @@ class CMapDB(object):
|
||||||
_cmap_cache = {}
|
_cmap_cache = {}
|
||||||
_umap_cache = {}
|
_umap_cache = {}
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
class CMapNotFound(CMapError):
|
||||||
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_data(klass, name):
|
def _load_data(klass, name):
|
||||||
filename = '%s.pickle.gz' % name
|
filename = '%s.pickle.gz' % name
|
||||||
if klass.debug:
|
if klass.debug:
|
||||||
print >>sys.stderr, 'loading:', name
|
print >>sys.stderr, 'loading:', name
|
||||||
cmap_paths = (
|
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||||
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
||||||
os.path.join(os.path.dirname(__file__), 'cmap'),
|
|
||||||
)
|
|
||||||
for directory in cmap_paths:
|
for directory in cmap_paths:
|
||||||
path = os.path.join(directory, filename)
|
path = os.path.join(directory, filename)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
|
@ -306,11 +306,12 @@ class CMapParser(PSStackParser):
|
||||||
elif name == 'endcmap':
|
elif name == 'endcmap':
|
||||||
self._in_cmap = False
|
self._in_cmap = False
|
||||||
return
|
return
|
||||||
if not self._in_cmap: return
|
if not self._in_cmap:
|
||||||
|
return
|
||||||
#
|
#
|
||||||
if name == 'def':
|
if name == 'def':
|
||||||
try:
|
try:
|
||||||
((_,k),(_,v)) = self.pop(2)
|
((_, k), (_, v)) = self.pop(2)
|
||||||
self.cmap.set_attr(literal_name(k), v)
|
self.cmap.set_attr(literal_name(k), v)
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
|
@ -318,7 +319,7 @@ class CMapParser(PSStackParser):
|
||||||
|
|
||||||
if name == 'usecmap':
|
if name == 'usecmap':
|
||||||
try:
|
try:
|
||||||
((_,cmapname),) = self.pop(1)
|
((_, cmapname),) = self.pop(1)
|
||||||
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
|
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
|
@ -337,13 +338,15 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endcidrange':
|
if name == 'endcidrange':
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (s,e,cid) in choplist(3, objs):
|
for (s, e, cid) in choplist(3, objs):
|
||||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
not isinstance(cid, int) or len(s) != len(e)): continue
|
not isinstance(cid, int) or len(s) != len(e)):
|
||||||
|
continue
|
||||||
sprefix = s[:-4]
|
sprefix = s[:-4]
|
||||||
eprefix = e[:-4]
|
eprefix = e[:-4]
|
||||||
if sprefix != eprefix: continue
|
if sprefix != eprefix:
|
||||||
|
continue
|
||||||
svar = s[-4:]
|
svar = s[-4:]
|
||||||
evar = e[-4:]
|
evar = e[-4:]
|
||||||
s1 = nunpack(svar)
|
s1 = nunpack(svar)
|
||||||
|
@ -351,7 +354,7 @@ class CMapParser(PSStackParser):
|
||||||
vlen = len(svar)
|
vlen = len(svar)
|
||||||
#assert s1 <= e1
|
#assert s1 <= e1
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = sprefix+struct.pack('>L',s1+i)[-vlen:]
|
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||||
self.cmap.add_code2cid(x, cid+i)
|
self.cmap.add_code2cid(x, cid+i)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -359,8 +362,8 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endcidchar':
|
if name == 'endcidchar':
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (cid,code) in choplist(2, objs):
|
for (cid, code) in choplist(2, objs):
|
||||||
if isinstance(code, str) and isinstance(cid, str):
|
if isinstance(code, str) and isinstance(cid, str):
|
||||||
self.cmap.add_code2cid(code, nunpack(cid))
|
self.cmap.add_code2cid(code, nunpack(cid))
|
||||||
return
|
return
|
||||||
|
@ -369,10 +372,11 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endbfrange':
|
if name == 'endbfrange':
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (s,e,code) in choplist(3, objs):
|
for (s, e, code) in choplist(3, objs):
|
||||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||||
len(s) != len(e)): continue
|
len(s) != len(e)):
|
||||||
|
continue
|
||||||
s1 = nunpack(s)
|
s1 = nunpack(s)
|
||||||
e1 = nunpack(e)
|
e1 = nunpack(e)
|
||||||
#assert s1 <= e1
|
#assert s1 <= e1
|
||||||
|
@ -385,7 +389,7 @@ class CMapParser(PSStackParser):
|
||||||
prefix = code[:-4]
|
prefix = code[:-4]
|
||||||
vlen = len(var)
|
vlen = len(var)
|
||||||
for i in xrange(e1-s1+1):
|
for i in xrange(e1-s1+1):
|
||||||
x = prefix+struct.pack('>L',base+i)[-vlen:]
|
x = prefix+struct.pack('>L', base+i)[-vlen:]
|
||||||
self.cmap.add_cid2unichr(s1+i, x)
|
self.cmap.add_cid2unichr(s1+i, x)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -393,8 +397,8 @@ class CMapParser(PSStackParser):
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
if name == 'endbfchar':
|
if name == 'endbfchar':
|
||||||
objs = [ obj for (_,obj) in self.popall() ]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (cid,code) in choplist(2, objs):
|
for (cid, code) in choplist(2, objs):
|
||||||
if isinstance(cid, str) and isinstance(code, str):
|
if isinstance(cid, str) and isinstance(code, str):
|
||||||
self.cmap.add_cid2unichr(nunpack(cid), code)
|
self.cmap.add_cid2unichr(nunpack(cid), code)
|
||||||
return
|
return
|
||||||
|
@ -409,6 +413,7 @@ class CMapParser(PSStackParser):
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# test
|
# test
|
||||||
def main(argv):
|
def main(argv):
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
|
@ -421,4 +426,5 @@ def main(argv):
|
||||||
cmap.dump()
|
cmap.dump()
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -21,9 +21,9 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
(x0,y0,x1,y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
|
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
||||||
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
|
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
||||||
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
||||||
self.cur_item = LTPage(self.pageno, mediabox)
|
self.cur_item = LTPage(self.pageno, mediabox)
|
||||||
return
|
return
|
||||||
|
@ -61,26 +61,26 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
shape = ''.join(x[0] for x in path)
|
shape = ''.join(x[0] for x in path)
|
||||||
if shape == 'ml':
|
if shape == 'ml':
|
||||||
# horizontal/vertical line
|
# horizontal/vertical line
|
||||||
(_,x0,y0) = path[0]
|
(_, x0, y0) = path[0]
|
||||||
(_,x1,y1) = path[1]
|
(_, x1, y1) = path[1]
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||||
if x0 == x1 or y0 == y1:
|
if x0 == x1 or y0 == y1:
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
|
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
|
||||||
return
|
return
|
||||||
if shape == 'mlllh':
|
if shape == 'mlllh':
|
||||||
# rectangle
|
# rectangle
|
||||||
(_,x0,y0) = path[0]
|
(_, x0, y0) = path[0]
|
||||||
(_,x1,y1) = path[1]
|
(_, x1, y1) = path[1]
|
||||||
(_,x2,y2) = path[2]
|
(_, x2, y2) = path[2]
|
||||||
(_,x3,y3) = path[3]
|
(_, x3, y3) = path[3]
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||||
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
|
||||||
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
|
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
|
||||||
return
|
return
|
||||||
# other shapes
|
# other shapes
|
||||||
pts = []
|
pts = []
|
||||||
|
@ -176,7 +176,8 @@ class TextConverter(PDFConverter):
|
||||||
# is text. This stops all the image and drawing ouput from being
|
# is text. This stops all the image and drawing ouput from being
|
||||||
# recorded and taking up RAM.
|
# recorded and taking up RAM.
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name, stream):
|
||||||
if self.imagewriter is None: return
|
if self.imagewriter is None:
|
||||||
|
return
|
||||||
PDFConverter.render_image(self, name, stream)
|
PDFConverter.render_image(self, name, stream)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -196,18 +197,18 @@ class HTMLConverter(PDFConverter):
|
||||||
'textgroup': 'red',
|
'textgroup': 'red',
|
||||||
'curve': 'black',
|
'curve': 'black',
|
||||||
'page': 'gray',
|
'page': 'gray',
|
||||||
}
|
}
|
||||||
|
|
||||||
TEXT_COLORS = {
|
TEXT_COLORS = {
|
||||||
'textbox': 'blue',
|
'textbox': 'blue',
|
||||||
'char': 'black',
|
'char': 'black',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
||||||
pagemargin=50, imagewriter=None,
|
pagemargin=50, imagewriter=None,
|
||||||
rect_colors={'curve':'black', 'page':'gray'},
|
rect_colors={'curve': 'black', 'page': 'gray'},
|
||||||
text_colors={'char':'black'}):
|
text_colors={'char': 'black'}):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.fontscale = fontscale
|
self.fontscale = fontscale
|
||||||
|
@ -238,7 +239,7 @@ class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def write_footer(self):
|
def write_footer(self):
|
||||||
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||||
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
|
||||||
self.write('</body></html>\n')
|
self.write('</body></html>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
show_group(child)
|
show_group(child)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self._yoffset += item.y1
|
self._yoffset += item.y1
|
||||||
|
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
|
||||||
show_group(child)
|
show_group(child)
|
||||||
self.outfp.write('</textgroup>\n')
|
self.outfp.write('</textgroup>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
|
|
|
@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
|
||||||
from latin_enc import ENCODING
|
from latin_enc import ENCODING
|
||||||
|
|
||||||
|
|
||||||
|
STRIP_NAME = re.compile(r'[0-9]+')
|
||||||
|
|
||||||
|
|
||||||
## name2unicode
|
## name2unicode
|
||||||
##
|
##
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
|
||||||
def name2unicode(name):
|
def name2unicode(name):
|
||||||
"""Converts Adobe glyph names to Unicode numbers."""
|
"""Converts Adobe glyph names to Unicode numbers."""
|
||||||
if name in glyphname2unicode:
|
if name in glyphname2unicode:
|
||||||
return glyphname2unicode[name]
|
return glyphname2unicode[name]
|
||||||
m = STRIP_NAME.search(name)
|
m = STRIP_NAME.search(name)
|
||||||
if not m: raise KeyError(name)
|
if not m:
|
||||||
|
raise KeyError(name)
|
||||||
return unichr(int(m.group(0)))
|
return unichr(int(m.group(0)))
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,19 +29,23 @@ class EncodingDB(object):
|
||||||
mac2unicode = {}
|
mac2unicode = {}
|
||||||
win2unicode = {}
|
win2unicode = {}
|
||||||
pdf2unicode = {}
|
pdf2unicode = {}
|
||||||
for (name,std,mac,win,pdf) in ENCODING:
|
for (name, std, mac, win, pdf) in ENCODING:
|
||||||
c = name2unicode(name)
|
c = name2unicode(name)
|
||||||
if std: std2unicode[std] = c
|
if std:
|
||||||
if mac: mac2unicode[mac] = c
|
std2unicode[std] = c
|
||||||
if win: win2unicode[win] = c
|
if mac:
|
||||||
if pdf: pdf2unicode[pdf] = c
|
mac2unicode[mac] = c
|
||||||
|
if win:
|
||||||
|
win2unicode[win] = c
|
||||||
|
if pdf:
|
||||||
|
pdf2unicode[pdf] = c
|
||||||
|
|
||||||
encodings = {
|
encodings = {
|
||||||
'StandardEncoding': std2unicode,
|
'StandardEncoding': std2unicode,
|
||||||
'MacRomanEncoding': mac2unicode,
|
'MacRomanEncoding': mac2unicode,
|
||||||
'WinAnsiEncoding': win2unicode,
|
'WinAnsiEncoding': win2unicode,
|
||||||
'PDFDocEncoding': pdf2unicode,
|
'PDFDocEncoding': pdf2unicode,
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_encoding(klass, name, diff=None):
|
def get_encoding(klass, name, diff=None):
|
||||||
|
|
|
@ -7,9 +7,11 @@ import os, os.path
|
||||||
from pdftypes import LITERALS_DCT_DECODE
|
from pdftypes import LITERALS_DCT_DECODE
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||||
|
|
||||||
|
|
||||||
def align32(x):
|
def align32(x):
|
||||||
return ((x+3)/4)*4
|
return ((x+3)/4)*4
|
||||||
|
|
||||||
|
|
||||||
## BMPWriter
|
## BMPWriter
|
||||||
##
|
##
|
||||||
class BMPWriter(object):
|
class BMPWriter(object):
|
||||||
|
@ -38,12 +40,12 @@ class BMPWriter(object):
|
||||||
self.fp.write(info)
|
self.fp.write(info)
|
||||||
if ncols == 2:
|
if ncols == 2:
|
||||||
# B&W color table
|
# B&W color table
|
||||||
for i in (0,255):
|
for i in (0, 255):
|
||||||
self.fp.write(struct.pack('BBBx', i,i,i))
|
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||||
elif ncols == 256:
|
elif ncols == 256:
|
||||||
# grayscale color table
|
# grayscale color table
|
||||||
for i in xrange(256):
|
for i in xrange(256):
|
||||||
self.fp.write(struct.pack('BBBx', i,i,i))
|
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||||
self.pos0 = self.fp.tell()
|
self.pos0 = self.fp.tell()
|
||||||
self.pos1 = self.pos0 + self.datasize
|
self.pos1 = self.pos0 + self.datasize
|
||||||
return
|
return
|
||||||
|
|
|
@ -82,7 +82,7 @@ class LTComponent(LTItem):
|
||||||
return ('<%s %s>' %
|
return ('<%s %s>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox)))
|
(self.__class__.__name__, bbox2str(self.bbox)))
|
||||||
|
|
||||||
def set_bbox(self, (x0,y0,x1,y1)):
|
def set_bbox(self, (x0, y0, x1, y1)):
|
||||||
self.x0 = x0
|
self.x0 = x0
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.x1 = x1
|
self.x1 = x1
|
||||||
|
@ -143,7 +143,7 @@ class LTCurve(LTComponent):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pts(self):
|
def get_pts(self):
|
||||||
return ','.join( '%.3f,%.3f' % p for p in self.pts )
|
return ','.join('%.3f,%.3f' % p for p in self.pts)
|
||||||
|
|
||||||
|
|
||||||
## LTLine
|
## LTLine
|
||||||
|
@ -159,8 +159,8 @@ class LTLine(LTCurve):
|
||||||
##
|
##
|
||||||
class LTRect(LTCurve):
|
class LTRect(LTCurve):
|
||||||
|
|
||||||
def __init__(self, linewidth, (x0,y0,x1,y1)):
|
def __init__(self, linewidth, (x0, y0, x1, y1)):
|
||||||
LTCurve.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
|
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -213,7 +213,7 @@ class LTChar(LTComponent, LTText):
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
# vertical
|
# vertical
|
||||||
width = font.get_width() * fontsize
|
width = font.get_width() * fontsize
|
||||||
(vx,vy) = textdisp
|
(vx, vy) = textdisp
|
||||||
if vx is None:
|
if vx is None:
|
||||||
vx = width/2
|
vx = width/2
|
||||||
else:
|
else:
|
||||||
|
@ -230,15 +230,15 @@ class LTChar(LTComponent, LTText):
|
||||||
ty = descent + rise
|
ty = descent + rise
|
||||||
bll = (0, ty)
|
bll = (0, ty)
|
||||||
bur = (self.adv, ty+height)
|
bur = (self.adv, ty+height)
|
||||||
(a,b,c,d,e,f) = self.matrix
|
(a, b, c, d, e, f) = self.matrix
|
||||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||||
(x0,y0) = apply_matrix_pt(self.matrix, bll)
|
(x0, y0) = apply_matrix_pt(self.matrix, bll)
|
||||||
(x1,y1) = apply_matrix_pt(self.matrix, bur)
|
(x1, y1) = apply_matrix_pt(self.matrix, bur)
|
||||||
if x1 < x0:
|
if x1 < x0:
|
||||||
(x0,x1) = (x1,x0)
|
(x0, x1) = (x1, x0)
|
||||||
if y1 < y0:
|
if y1 < y0:
|
||||||
(y0,y1) = (y1,y0)
|
(y0, y1) = (y1, y0)
|
||||||
LTComponent.__init__(self, (x0,y0,x1,y1))
|
LTComponent.__init__(self, (x0, y0, x1, y1))
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
self.size = self.width
|
self.size = self.width
|
||||||
else:
|
else:
|
||||||
|
@ -294,7 +294,7 @@ class LTContainer(LTComponent):
|
||||||
class LTExpandableContainer(LTContainer):
|
class LTExpandableContainer(LTContainer):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
LTContainer.__init__(self, (+INF,+INF,-INF,-INF))
|
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
|
||||||
return
|
return
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
|
@ -314,7 +314,7 @@ class LTTextContainer(LTExpandableContainer, LTText):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
|
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
|
||||||
|
|
||||||
|
|
||||||
## LTTextLine
|
## LTTextLine
|
||||||
|
@ -339,6 +339,7 @@ class LTTextLine(LTTextContainer):
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineHorizontal(LTTextLine):
|
class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
def __init__(self, word_margin):
|
def __init__(self, word_margin):
|
||||||
|
@ -358,11 +359,12 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
d = ratio*self.height
|
d = ratio*self.height
|
||||||
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
||||||
return [ obj for obj in objs
|
return [obj for obj in objs
|
||||||
if (isinstance(obj, LTTextLineHorizontal) and
|
if (isinstance(obj, LTTextLineHorizontal) and
|
||||||
abs(obj.height-self.height) < d and
|
abs(obj.height-self.height) < d and
|
||||||
(abs(obj.x0-self.x0) < d or
|
(abs(obj.x0-self.x0) < d or
|
||||||
abs(obj.x1-self.x1) < d)) ]
|
abs(obj.x1-self.x1) < d))]
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineVertical(LTTextLine):
|
class LTTextLineVertical(LTTextLine):
|
||||||
|
|
||||||
|
@ -383,11 +385,11 @@ class LTTextLineVertical(LTTextLine):
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
d = ratio*self.width
|
d = ratio*self.width
|
||||||
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
|
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
|
||||||
return [ obj for obj in objs
|
return [obj for obj in objs
|
||||||
if (isinstance(obj, LTTextLineVertical) and
|
if (isinstance(obj, LTTextLineVertical) and
|
||||||
abs(obj.width-self.width) < d and
|
abs(obj.width-self.width) < d and
|
||||||
(abs(obj.y0-self.y0) < d or
|
(abs(obj.y0-self.y0) < d or
|
||||||
abs(obj.y1-self.y1) < d)) ]
|
abs(obj.y1-self.y1) < d))]
|
||||||
|
|
||||||
|
|
||||||
## LTTextBox
|
## LTTextBox
|
||||||
|
@ -407,6 +409,7 @@ class LTTextBox(LTTextContainer):
|
||||||
(self.__class__.__name__,
|
(self.__class__.__name__,
|
||||||
self.index, bbox2str(self.bbox), self.get_text()))
|
self.index, bbox2str(self.bbox), self.get_text()))
|
||||||
|
|
||||||
|
|
||||||
class LTTextBoxHorizontal(LTTextBox):
|
class LTTextBoxHorizontal(LTTextBox):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
|
@ -417,6 +420,7 @@ class LTTextBoxHorizontal(LTTextBox):
|
||||||
def get_writing_mode(self):
|
def get_writing_mode(self):
|
||||||
return 'lr-tb'
|
return 'lr-tb'
|
||||||
|
|
||||||
|
|
||||||
class LTTextBoxVertical(LTTextBox):
|
class LTTextBoxVertical(LTTextBox):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
|
@ -437,6 +441,7 @@ class LTTextGroup(LTTextContainer):
|
||||||
self.extend(objs)
|
self.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTTextGroupLRTB(LTTextGroup):
|
class LTTextGroupLRTB(LTTextGroup):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
|
@ -447,6 +452,7 @@ class LTTextGroupLRTB(LTTextGroup):
|
||||||
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTTextGroupTBRL(LTTextGroup):
|
class LTTextGroupTBRL(LTTextGroup):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
|
@ -454,7 +460,7 @@ class LTTextGroupTBRL(LTTextGroup):
|
||||||
# reorder the objects from top-right to bottom-left.
|
# reorder the objects from top-right to bottom-left.
|
||||||
self._objs = csort(self._objs, key=lambda obj:
|
self._objs = csort(self._objs, key=lambda obj:
|
||||||
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
||||||
-(1-laparams.boxes_flow)*(obj.y1))
|
- (1-laparams.boxes_flow)*(obj.y1))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -506,8 +512,8 @@ class LTLayoutContainer(LTContainer):
|
||||||
# |<-->|
|
# |<-->|
|
||||||
# (line_overlap)
|
# (line_overlap)
|
||||||
k |= 2
|
k |= 2
|
||||||
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
||||||
(k & 2 and isinstance(line, LTTextLineVertical)) ):
|
(k & 2 and isinstance(line, LTTextLineVertical))):
|
||||||
line.add(obj1)
|
line.add(obj1)
|
||||||
elif line is not None:
|
elif line is not None:
|
||||||
yield line
|
yield line
|
||||||
|
@ -555,7 +561,8 @@ class LTLayoutContainer(LTContainer):
|
||||||
done = set()
|
done = set()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
box = boxes[line]
|
box = boxes[line]
|
||||||
if box in done: continue
|
if box in done:
|
||||||
|
continue
|
||||||
done.add(box)
|
done.add(box)
|
||||||
if not box.is_empty():
|
if not box.is_empty():
|
||||||
yield box
|
yield box
|
||||||
|
@ -563,32 +570,34 @@ class LTLayoutContainer(LTContainer):
|
||||||
|
|
||||||
def group_textboxes(self, laparams, boxes):
|
def group_textboxes(self, laparams, boxes):
|
||||||
assert boxes
|
assert boxes
|
||||||
|
|
||||||
def dist(obj1, obj2):
|
def dist(obj1, obj2):
|
||||||
"""A distance function between two TextBoxes.
|
"""A distance function between two TextBoxes.
|
||||||
|
|
||||||
Consider the bounding rectangle for obj1 and obj2.
|
Consider the bounding rectangle for obj1 and obj2.
|
||||||
Return its area less the areas of obj1 and obj2,
|
Return its area less the areas of obj1 and obj2,
|
||||||
shown as 'www' below. This value may be negative.
|
shown as 'www' below. This value may be negative.
|
||||||
+------+..........+ (x1,y1)
|
+------+..........+ (x1, y1)
|
||||||
| obj1 |wwwwwwwwww:
|
| obj1 |wwwwwwwwww:
|
||||||
+------+www+------+
|
+------+www+------+
|
||||||
:wwwwwwwwww| obj2 |
|
:wwwwwwwwww| obj2 |
|
||||||
(x0,y0) +..........+------+
|
(x0, y0) +..........+------+
|
||||||
"""
|
"""
|
||||||
x0 = min(obj1.x0,obj2.x0)
|
x0 = min(obj1.x0, obj2.x0)
|
||||||
y0 = min(obj1.y0,obj2.y0)
|
y0 = min(obj1.y0, obj2.y0)
|
||||||
x1 = max(obj1.x1,obj2.x1)
|
x1 = max(obj1.x1, obj2.x1)
|
||||||
y1 = max(obj1.y1,obj2.y1)
|
y1 = max(obj1.y1, obj2.y1)
|
||||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
|
|
||||||
def isany(obj1, obj2):
|
def isany(obj1, obj2):
|
||||||
"""Check if there's any other object between obj1 and obj2.
|
"""Check if there's any other object between obj1 and obj2.
|
||||||
"""
|
"""
|
||||||
x0 = min(obj1.x0,obj2.x0)
|
x0 = min(obj1.x0, obj2.x0)
|
||||||
y0 = min(obj1.y0,obj2.y0)
|
y0 = min(obj1.y0, obj2.y0)
|
||||||
x1 = max(obj1.x1,obj2.x1)
|
x1 = max(obj1.x1, obj2.x1)
|
||||||
y1 = max(obj1.y1,obj2.y1)
|
y1 = max(obj1.y1, obj2.y1)
|
||||||
objs = set(plane.find((x0,y0,x1,y1)))
|
objs = set(plane.find((x0, y0, x1, y1)))
|
||||||
return objs.difference((obj1,obj2))
|
return objs.difference((obj1, obj2))
|
||||||
# XXX this still takes O(n^2) :(
|
# XXX this still takes O(n^2) :(
|
||||||
dists = []
|
dists = []
|
||||||
for i in xrange(len(boxes)):
|
for i in xrange(len(boxes)):
|
||||||
|
@ -600,23 +609,23 @@ class LTLayoutContainer(LTContainer):
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
while dists:
|
while dists:
|
||||||
(c,d,obj1,obj2) = dists.pop(0)
|
(c, d, obj1, obj2) = dists.pop(0)
|
||||||
if c == 0 and isany(obj1, obj2):
|
if c == 0 and isany(obj1, obj2):
|
||||||
dists.append((1,d,obj1,obj2))
|
dists.append((1, d, obj1, obj2))
|
||||||
continue
|
continue
|
||||||
if (isinstance(obj1, LTTextBoxVertical) or
|
if (isinstance(obj1, LTTextBoxVertical) or
|
||||||
isinstance(obj1, LTTextGroupTBRL) or
|
isinstance(obj1, LTTextGroupTBRL) or
|
||||||
isinstance(obj2, LTTextBoxVertical) or
|
isinstance(obj2, LTTextBoxVertical) or
|
||||||
isinstance(obj2, LTTextGroupTBRL)):
|
isinstance(obj2, LTTextGroupTBRL)):
|
||||||
group = LTTextGroupTBRL([obj1,obj2])
|
group = LTTextGroupTBRL([obj1, obj2])
|
||||||
else:
|
else:
|
||||||
group = LTTextGroupLRTB([obj1,obj2])
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
plane.remove(obj2)
|
plane.remove(obj2)
|
||||||
# this line is optimized -- don't change without profiling
|
# this line is optimized -- don't change without profiling
|
||||||
dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ]
|
dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
|
||||||
for other in plane:
|
for other in plane:
|
||||||
dists.append((0, dist(group,other), group, other))
|
dists.append((0, dist(group, other), group, other))
|
||||||
dists.sort()
|
dists.sort()
|
||||||
plane.add(group)
|
plane.add(group)
|
||||||
assert len(plane) == 1
|
assert len(plane) == 1
|
||||||
|
@ -628,21 +637,22 @@ class LTLayoutContainer(LTContainer):
|
||||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
||||||
for obj in otherobjs:
|
for obj in otherobjs:
|
||||||
obj.analyze(laparams)
|
obj.analyze(laparams)
|
||||||
if not textobjs: return
|
if not textobjs:
|
||||||
|
return
|
||||||
textlines = list(self.get_textlines(laparams, textobjs))
|
textlines = list(self.get_textlines(laparams, textobjs))
|
||||||
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
|
||||||
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
||||||
for obj in empties:
|
for obj in empties:
|
||||||
obj.analyze(laparams)
|
obj.analyze(laparams)
|
||||||
textboxes = list(self.get_textboxes(laparams, textlines))
|
textboxes = list(self.get_textboxes(laparams, textlines))
|
||||||
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
assert len(textlines) == sum(len(box._objs) for box in textboxes)
|
||||||
if textboxes:
|
if textboxes:
|
||||||
self.groups = self.group_textboxes(laparams, textboxes)
|
self.groups = self.group_textboxes(laparams, textboxes)
|
||||||
assigner = IndexAssigner()
|
assigner = IndexAssigner()
|
||||||
for group in self.groups:
|
for group in self.groups:
|
||||||
group.analyze(laparams)
|
group.analyze(laparams)
|
||||||
assigner.run(group)
|
assigner.run(group)
|
||||||
textboxes.sort(key=lambda box:box.index)
|
textboxes.sort(key=lambda box: box.index)
|
||||||
self._objs = textboxes + otherobjs + empties
|
self._objs = textboxes + otherobjs + empties
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -654,9 +664,9 @@ class LTFigure(LTLayoutContainer):
|
||||||
def __init__(self, name, bbox, matrix):
|
def __init__(self, name, bbox, matrix):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
(x,y,w,h) = bbox
|
(x, y, w, h) = bbox
|
||||||
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
|
bbox = get_bound(apply_matrix_pt(matrix, (p, q))
|
||||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
|
||||||
LTLayoutContainer.__init__(self, bbox)
|
LTLayoutContainer.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -666,7 +676,8 @@ class LTFigure(LTLayoutContainer):
|
||||||
bbox2str(self.bbox), matrix2str(self.matrix)))
|
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
if not laparams.all_texts: return
|
if not laparams.all_texts:
|
||||||
|
return
|
||||||
LTLayoutContainer.analyze(self, laparams)
|
LTLayoutContainer.analyze(self, laparams)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -34,17 +34,18 @@ class LZWDecoder(object):
|
||||||
# |-----8-bits-----|
|
# |-----8-bits-----|
|
||||||
# |-bpos-|-bits-| |
|
# |-bpos-|-bits-| |
|
||||||
# | |----r----|
|
# | |----r----|
|
||||||
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
|
v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
|
||||||
self.bpos += bits
|
self.bpos += bits
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# |-----8-bits-----|
|
# |-----8-bits-----|
|
||||||
# |-bpos-|---bits----...
|
# |-bpos-|---bits----...
|
||||||
# | |----r----|
|
# | |----r----|
|
||||||
v = (v<<r) | (self.buff & ((1<<r)-1))
|
v = (v << r) | (self.buff & ((1 << r)-1))
|
||||||
bits -= r
|
bits -= r
|
||||||
x = self.fp.read(1)
|
x = self.fp.read(1)
|
||||||
if not x: raise EOFError
|
if not x:
|
||||||
|
raise EOFError
|
||||||
self.buff = ord(x)
|
self.buff = ord(x)
|
||||||
self.bpos = 0
|
self.bpos = 0
|
||||||
return v
|
return v
|
||||||
|
@ -52,9 +53,9 @@ class LZWDecoder(object):
|
||||||
def feed(self, code):
|
def feed(self, code):
|
||||||
x = ''
|
x = ''
|
||||||
if code == 256:
|
if code == 256:
|
||||||
self.table = [ chr(c) for c in xrange(256) ] # 0-255
|
self.table = [chr(c) for c in xrange(256)] # 0-255
|
||||||
self.table.append(None) # 256
|
self.table.append(None) # 256
|
||||||
self.table.append(None) # 257
|
self.table.append(None) # 257
|
||||||
self.prevbuf = ''
|
self.prevbuf = ''
|
||||||
self.nbits = 9
|
self.nbits = 9
|
||||||
elif code == 257:
|
elif code == 257:
|
||||||
|
@ -97,6 +98,7 @@ class LZWDecoder(object):
|
||||||
(self.nbits, code, x, self.table[258:]))
|
(self.nbits, code, x, self.table[258:]))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# lzwdecode
|
# lzwdecode
|
||||||
def lzwdecode(data):
|
def lzwdecode(data):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
||||||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||||
|
|
||||||
|
|
||||||
class PDFColorSpace(object):
|
class PDFColorSpace(object):
|
||||||
|
|
||||||
def __init__(self, name, ncomponents):
|
def __init__(self, name, ncomponents):
|
||||||
|
@ -20,14 +21,14 @@ class PDFColorSpace(object):
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = dict(
|
PREDEFINED_COLORSPACE = dict(
|
||||||
(name, PDFColorSpace(name,n)) for (name,n) in {
|
(name, PDFColorSpace(name, n)) for (name, n) in {
|
||||||
'CalRGB': 3,
|
'CalRGB': 3,
|
||||||
'CalGray': 1,
|
'CalGray': 1,
|
||||||
'Lab': 3,
|
'Lab': 3,
|
||||||
'DeviceRGB': 3,
|
'DeviceRGB': 3,
|
||||||
'DeviceCMYK': 4,
|
'DeviceCMYK': 4,
|
||||||
'DeviceGray': 1,
|
'DeviceGray': 1,
|
||||||
'Separation': 1,
|
'Separation': 1,
|
||||||
'Indexed': 1,
|
'Indexed': 1,
|
||||||
'Pattern': 1,
|
'Pattern': 1,
|
||||||
}.iteritems())
|
}.iteritems())
|
||||||
|
|
|
@ -28,24 +28,31 @@ class PDFDevice(object):
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_tag(self):
|
def end_tag(self):
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(self, tag, props=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name, bbox, matrix):
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_figure(self, name):
|
def end_figure(self, name):
|
||||||
return
|
return
|
||||||
|
|
||||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name, stream):
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
def render_string(self, textstate, seq):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -75,7 +82,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
scaling, charspace, wordspace, rise, dxscale)
|
scaling, charspace, wordspace, rise, dxscale)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
def render_string_horizontal(self, seq, matrix, (x, y),
|
||||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
@ -86,14 +93,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
x += charspace
|
x += charspace
|
||||||
x += self.render_char(translate_matrix(matrix, (x,y)),
|
x += self.render_char(translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
x += wordspace
|
x += wordspace
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_string_vertical(self, seq, matrix, (x,y),
|
def render_string_vertical(self, seq, matrix, (x, y),
|
||||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
@ -104,7 +111,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
y += charspace
|
y += charspace
|
||||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
y += self.render_char(translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
y += wordspace
|
y += wordspace
|
||||||
|
@ -132,7 +139,8 @@ class TagExtractor(PDFDevice):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
text = ''
|
text = ''
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if not isinstance(obj, str): continue
|
if not isinstance(obj, str):
|
||||||
|
continue
|
||||||
chars = font.decode(obj)
|
chars = font.decode(obj)
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
try:
|
try:
|
||||||
|
@ -156,8 +164,8 @@ class TagExtractor(PDFDevice):
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
s = ''
|
s = ''
|
||||||
if isinstance(props, dict):
|
if isinstance(props, dict):
|
||||||
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
|
||||||
in sorted(props.iteritems()) )
|
in sorted(props.iteritems()))
|
||||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||||
self._stack.append(tag)
|
self._stack.append(tag)
|
||||||
return
|
return
|
||||||
|
|
|
@ -23,11 +23,24 @@ from utils import decode_text
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
class PDFNoValidXRef(PDFSyntaxError):
|
||||||
class PDFNoOutlines(PDFException): pass
|
pass
|
||||||
class PDFDestinationNotFound(PDFException): pass
|
|
||||||
class PDFEncryptionError(PDFException): pass
|
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFNoOutlines(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFDestinationNotFound(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFEncryptionError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFPasswordIncorrect(PDFEncryptionError):
|
||||||
|
pass
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
LITERAL_OBJSTM = LIT('ObjStm')
|
LITERAL_OBJSTM = LIT('ObjStm')
|
||||||
|
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
if not line.strip(): continue
|
if not line.strip():
|
||||||
|
continue
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||||
if not line:
|
if not line:
|
||||||
|
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if len(f) != 3:
|
if len(f) != 3:
|
||||||
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
if use != 'n': continue
|
if use != 'n':
|
||||||
|
continue
|
||||||
self.offsets[objid] = (None, long(pos), int(genno))
|
self.offsets[objid] = (None, long(pos), int(genno))
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>sys.stderr, 'xref objects:', self.offsets
|
print >>sys.stderr, 'xref objects:', self.offsets
|
||||||
|
@ -100,16 +115,17 @@ class PDFXRef(PDFBaseXRef):
|
||||||
return
|
return
|
||||||
|
|
||||||
KEYWORD_TRAILER = KWD('trailer')
|
KEYWORD_TRAILER = KWD('trailer')
|
||||||
|
|
||||||
def load_trailer(self, parser):
|
def load_trailer(self, parser):
|
||||||
try:
|
try:
|
||||||
(_,kwd) = parser.nexttoken()
|
(_, kwd) = parser.nexttoken()
|
||||||
assert kwd is self.KEYWORD_TRAILER
|
assert kwd is self.KEYWORD_TRAILER
|
||||||
(_,dic) = parser.nextobject()
|
(_, dic) = parser.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
x = parser.pop(1)
|
x = parser.pop(1)
|
||||||
if not x:
|
if not x:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||||
(_,dic) = x[0]
|
(_, dic) = x[0]
|
||||||
self.trailer.update(dict_value(dic))
|
self.trailer.update(dict_value(dic))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
|
||||||
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
||||||
|
|
||||||
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||||
|
|
||||||
def load(self, parser, debug=0):
|
def load(self, parser, debug=0):
|
||||||
parser.seek(0)
|
parser.seek(0)
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -148,14 +165,15 @@ class PDFXRefFallback(PDFXRef):
|
||||||
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
|
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
|
||||||
break
|
break
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m: continue
|
if not m:
|
||||||
|
continue
|
||||||
(objid, genno) = m.groups()
|
(objid, genno) = m.groups()
|
||||||
objid = int(objid)
|
objid = int(objid)
|
||||||
genno = int(genno)
|
genno = int(genno)
|
||||||
self.offsets[objid] = (None, pos, genno)
|
self.offsets[objid] = (None, pos, genno)
|
||||||
# expand ObjStm.
|
# expand ObjStm.
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
(_,obj) = parser.nextobject()
|
(_, obj) = parser.nextobject()
|
||||||
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
|
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
|
||||||
stream = stream_value(obj)
|
stream = stream_value(obj)
|
||||||
try:
|
try:
|
||||||
|
@ -168,7 +186,7 @@ class PDFXRefFallback(PDFXRef):
|
||||||
objs = []
|
objs = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
(_,obj) = parser1.nextobject()
|
(_, obj) = parser1.nextobject()
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
|
@ -193,14 +211,14 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
||||||
|
|
||||||
def load(self, parser, debug=0):
|
def load(self, parser, debug=0):
|
||||||
(_,objid) = parser.nexttoken() # ignored
|
(_, objid) = parser.nexttoken() # ignored
|
||||||
(_,genno) = parser.nexttoken() # ignored
|
(_, genno) = parser.nexttoken() # ignored
|
||||||
(_,kwd) = parser.nexttoken()
|
(_, kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_, stream) = parser.nextobject()
|
||||||
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
|
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
|
||||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||||
size = stream['Size']
|
size = stream['Size']
|
||||||
index_array = stream.get('Index', (1,size))
|
index_array = stream.get('Index', (1, size))
|
||||||
if len(index_array) % 2 != 0:
|
if len(index_array) % 2 != 0:
|
||||||
raise PDFSyntaxError('Invalid index number')
|
raise PDFSyntaxError('Invalid index number')
|
||||||
self.ranges.extend(choplist(2, index_array))
|
self.ranges.extend(choplist(2, index_array))
|
||||||
|
@ -210,22 +228,22 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
self.trailer = stream.attrs
|
self.trailer = stream.attrs
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||||
(', '.join(map(repr, self.ranges)),
|
(', '.join(map(repr, self.ranges)),
|
||||||
self.fl1, self.fl2, self.fl3))
|
self.fl1, self.fl2, self.fl3))
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_trailer(self):
|
def get_trailer(self):
|
||||||
return self.trailer
|
return self.trailer
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self):
|
||||||
for (start,nobjs) in self.ranges:
|
for (start, nobjs) in self.ranges:
|
||||||
for i in xrange(nobjs):
|
for i in xrange(nobjs):
|
||||||
yield start+i
|
yield start+i
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid):
|
||||||
index = 0
|
index = 0
|
||||||
for (start,nobjs) in self.ranges:
|
for (start, nobjs) in self.ranges:
|
||||||
if start <= objid and objid < start+nobjs:
|
if start <= objid and objid < start+nobjs:
|
||||||
index += objid - start
|
index += objid - start
|
||||||
else:
|
else:
|
||||||
|
@ -292,7 +310,8 @@ class PDFDocument(object):
|
||||||
self.xrefs.append(xref)
|
self.xrefs.append(xref)
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.get_trailer()
|
trailer = xref.get_trailer()
|
||||||
if not trailer: continue
|
if not trailer:
|
||||||
|
continue
|
||||||
# If there's an encryption info, remember it.
|
# If there's an encryption info, remember it.
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
#assert not self.encryption
|
#assert not self.encryption
|
||||||
|
@ -316,6 +335,7 @@ class PDFDocument(object):
|
||||||
# This step is mandatory even if there's no password associated
|
# This step is mandatory even if there's no password associated
|
||||||
# with the document.
|
# with the document.
|
||||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
|
|
||||||
def initialize(self, password=''):
|
def initialize(self, password=''):
|
||||||
if not self.encryption:
|
if not self.encryption:
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
|
@ -326,9 +346,9 @@ class PDFDocument(object):
|
||||||
V = int_value(param.get('V', 0))
|
V = int_value(param.get('V', 0))
|
||||||
if not (V == 1 or V == 2):
|
if not (V == 1 or V == 2):
|
||||||
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
|
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
|
||||||
length = int_value(param.get('Length', 40)) # Key length (bits)
|
length = int_value(param.get('Length', 40)) # Key length (bits)
|
||||||
O = str_value(param['O'])
|
O = str_value(param['O'])
|
||||||
R = int_value(param['R']) # Revision
|
R = int_value(param['R']) # Revision
|
||||||
if 5 <= R:
|
if 5 <= R:
|
||||||
raise PDFEncryptionError('Unknown revision: %r' % R)
|
raise PDFEncryptionError('Unknown revision: %r' % R)
|
||||||
U = str_value(param['U'])
|
U = str_value(param['U'])
|
||||||
|
@ -337,11 +357,11 @@ class PDFDocument(object):
|
||||||
self.is_modifiable = bool(P & 8)
|
self.is_modifiable = bool(P & 8)
|
||||||
self.is_extractable = bool(P & 16)
|
self.is_extractable = bool(P & 16)
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5.md5(password) # 2
|
hash = md5.md5(password) # 2
|
||||||
hash.update(O) # 3
|
hash.update(O) # 3
|
||||||
hash.update(struct.pack('<l', P)) # 4
|
hash.update(struct.pack('<l', P)) # 4
|
||||||
hash.update(docid[0]) # 5
|
hash.update(docid[0]) # 5
|
||||||
if 4 <= R:
|
if 4 <= R:
|
||||||
# 6
|
# 6
|
||||||
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
||||||
|
@ -355,13 +375,13 @@ class PDFDocument(object):
|
||||||
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
|
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
|
||||||
elif R == 3:
|
elif R == 3:
|
||||||
# Algorithm 3.5
|
# Algorithm 3.5
|
||||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||||
hash.update(docid[0]) # 3
|
hash.update(docid[0]) # 3
|
||||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||||
for i in xrange(1,19+1):
|
for i in xrange(1, 19+1):
|
||||||
k = ''.join( chr(ord(c) ^ i) for c in key )
|
k = ''.join(chr(ord(c) ^ i) for c in key)
|
||||||
x = Arcfour(k).process(x)
|
x = Arcfour(k).process(x)
|
||||||
u1 = x+x # 32bytes total
|
u1 = x+x # 32bytes total
|
||||||
if R == 2:
|
if R == 2:
|
||||||
is_authenticated = (u1 == U)
|
is_authenticated = (u1 == U)
|
||||||
else:
|
else:
|
||||||
|
@ -373,18 +393,18 @@ class PDFDocument(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def decrypt_rc4(self, objid, genno, data):
|
def decrypt_rc4(self, objid, genno, data):
|
||||||
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
|
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
|
||||||
hash = md5.md5(key)
|
hash = md5.md5(key)
|
||||||
key = hash.digest()[:min(len(key),16)]
|
key = hash.digest()[:min(len(key), 16)]
|
||||||
return Arcfour(key).process(data)
|
return Arcfour(key).process(data)
|
||||||
|
|
||||||
def _getobj_objstm(self, stream, index, objid):
|
def _getobj_objstm(self, stream, index, objid):
|
||||||
if stream.objid in self._parsed_objs:
|
if stream.objid in self._parsed_objs:
|
||||||
(objs,n) = self._parsed_objs[stream.objid]
|
(objs, n) = self._parsed_objs[stream.objid]
|
||||||
else:
|
else:
|
||||||
(objs,n) = self._get_objects(stream)
|
(objs, n) = self._get_objects(stream)
|
||||||
if self.caching:
|
if self.caching:
|
||||||
self._parsed_objs[stream.objid] = (objs,n)
|
self._parsed_objs[stream.objid] = (objs, n)
|
||||||
i = n*2+index
|
i = n*2+index
|
||||||
try:
|
try:
|
||||||
obj = objs[i]
|
obj = objs[i]
|
||||||
|
@ -407,23 +427,24 @@ class PDFDocument(object):
|
||||||
objs = []
|
objs = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
(_,obj) = parser.nextobject()
|
(_, obj) = parser.nextobject()
|
||||||
objs.append(obj)
|
objs.append(obj)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
return (objs, n)
|
return (objs, n)
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD('obj')
|
KEYWORD_OBJ = KWD('obj')
|
||||||
|
|
||||||
def _getobj_parse(self, pos, objid):
|
def _getobj_parse(self, pos, objid):
|
||||||
self._parser.seek(pos)
|
self._parser.seek(pos)
|
||||||
(_,objid1) = self._parser.nexttoken() # objid
|
(_, objid1) = self._parser.nexttoken() # objid
|
||||||
if objid1 != objid:
|
if objid1 != objid:
|
||||||
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||||
(_,genno) = self._parser.nexttoken() # genno
|
(_, genno) = self._parser.nexttoken() # genno
|
||||||
(_,kwd) = self._parser.nexttoken()
|
(_, kwd) = self._parser.nexttoken()
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||||
(_,obj) = self._parser.nextobject()
|
(_, obj) = self._parser.nextobject()
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
# can raise PDFObjectNotFound
|
# can raise PDFObjectNotFound
|
||||||
|
@ -465,6 +486,7 @@ class PDFDocument(object):
|
||||||
def get_outlines(self):
|
def get_outlines(self):
|
||||||
if 'Outlines' not in self.catalog:
|
if 'Outlines' not in self.catalog:
|
||||||
raise PDFNoOutlines
|
raise PDFNoOutlines
|
||||||
|
|
||||||
def search(entry, level):
|
def search(entry, level):
|
||||||
entry = dict_value(entry)
|
entry = dict_value(entry)
|
||||||
if 'Title' in entry:
|
if 'Title' in entry:
|
||||||
|
@ -487,13 +509,15 @@ class PDFDocument(object):
|
||||||
try:
|
try:
|
||||||
names = dict_value(self.catalog['Names'])
|
names = dict_value(self.catalog['Names'])
|
||||||
except (PDFTypeError, KeyError):
|
except (PDFTypeError, KeyError):
|
||||||
raise KeyError((cat,key))
|
raise KeyError((cat, key))
|
||||||
# may raise KeyError
|
# may raise KeyError
|
||||||
d0 = dict_value(names[cat])
|
d0 = dict_value(names[cat])
|
||||||
|
|
||||||
def lookup(d):
|
def lookup(d):
|
||||||
if 'Limits' in d:
|
if 'Limits' in d:
|
||||||
(k1,k2) = list_value(d['Limits'])
|
(k1, k2) = list_value(d['Limits'])
|
||||||
if key < k1 or k2 < key: return None
|
if key < k1 or k2 < key:
|
||||||
|
return None
|
||||||
if 'Names' in d:
|
if 'Names' in d:
|
||||||
objs = list_value(d['Names'])
|
objs = list_value(d['Names'])
|
||||||
names = dict(choplist(2, objs))
|
names = dict(choplist(2, objs))
|
||||||
|
@ -501,8 +525,9 @@ class PDFDocument(object):
|
||||||
if 'Kids' in d:
|
if 'Kids' in d:
|
||||||
for c in list_value(d['Kids']):
|
for c in list_value(d['Kids']):
|
||||||
v = lookup(dict_value(c))
|
v = lookup(dict_value(c))
|
||||||
if v: return v
|
if v:
|
||||||
raise KeyError((cat,key))
|
return v
|
||||||
|
raise KeyError((cat, key))
|
||||||
return lookup(d0)
|
return lookup(d0)
|
||||||
|
|
||||||
def get_dest(self, name):
|
def get_dest(self, name):
|
||||||
|
@ -528,7 +553,8 @@ class PDFDocument(object):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'find_xref: %r' % line
|
print >>sys.stderr, 'find_xref: %r' % line
|
||||||
if line == 'startxref': break
|
if line == 'startxref':
|
||||||
|
break
|
||||||
if line:
|
if line:
|
||||||
prev = line
|
prev = line
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -25,13 +25,13 @@ def get_widths(seq):
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
if r:
|
if r:
|
||||||
char1 = r[-1]
|
char1 = r[-1]
|
||||||
for (i,w) in enumerate(v):
|
for (i, w) in enumerate(v):
|
||||||
widths[char1+i] = w
|
widths[char1+i] = w
|
||||||
r = []
|
r = []
|
||||||
elif isinstance(v, int):
|
elif isinstance(v, int):
|
||||||
r.append(v)
|
r.append(v)
|
||||||
if len(r) == 3:
|
if len(r) == 3:
|
||||||
(char1,char2,w) = r
|
(char1, char2, w) = r
|
||||||
for i in xrange(char1, char2+1):
|
for i in xrange(char1, char2+1):
|
||||||
widths[i] = w
|
widths[i] = w
|
||||||
r = []
|
r = []
|
||||||
|
@ -40,6 +40,7 @@ def get_widths(seq):
|
||||||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
||||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
||||||
|
|
||||||
|
|
||||||
def get_widths2(seq):
|
def get_widths2(seq):
|
||||||
widths = {}
|
widths = {}
|
||||||
r = []
|
r = []
|
||||||
|
@ -47,20 +48,20 @@ def get_widths2(seq):
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
if r:
|
if r:
|
||||||
char1 = r[-1]
|
char1 = r[-1]
|
||||||
for (i,(w,vx,vy)) in enumerate(choplist(3,v)):
|
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
|
||||||
widths[char1+i] = (w,(vx,vy))
|
widths[char1+i] = (w, (vx, vy))
|
||||||
r = []
|
r = []
|
||||||
elif isinstance(v, int):
|
elif isinstance(v, int):
|
||||||
r.append(v)
|
r.append(v)
|
||||||
if len(r) == 5:
|
if len(r) == 5:
|
||||||
(char1,char2,w,vx,vy) = r
|
(char1, char2, w, vx, vy) = r
|
||||||
for i in xrange(char1, char2+1):
|
for i in xrange(char1, char2+1):
|
||||||
widths[i] = (w,(vx,vy))
|
widths[i] = (w, (vx, vy))
|
||||||
r = []
|
r = []
|
||||||
return widths
|
return widths
|
||||||
#assert get_widths2([1]) == {}
|
#assert get_widths2([1]) == {}
|
||||||
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
|
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
|
||||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}
|
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
|
||||||
|
|
||||||
|
|
||||||
## FontMetricsDB
|
## FontMetricsDB
|
||||||
|
@ -94,7 +95,7 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
def get_encoding(self):
|
def get_encoding(self):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(cid,name) = self.nextobject()
|
(cid, name) = self.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
|
@ -105,25 +106,28 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
if token is self.KEYWORD_PUT:
|
if token is self.KEYWORD_PUT:
|
||||||
((_,key),(_,value)) = self.pop(2)
|
((_, key), (_, value)) = self.pop(2)
|
||||||
if (isinstance(key, int) and
|
if (isinstance(key, int) and
|
||||||
isinstance(value, PSLiteral)):
|
isinstance(value, PSLiteral)):
|
||||||
self.add_results((key, literal_name(value)))
|
self.add_results((key, literal_name(value)))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||||
|
|
||||||
|
|
||||||
## CFFFont
|
## CFFFont
|
||||||
## (Format specified in Adobe Technical Note: #5176
|
## (Format specified in Adobe Technical Note: #5176
|
||||||
## "The Compact Font Format Specification")
|
## "The Compact Font Format Specification")
|
||||||
##
|
##
|
||||||
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
|
|
||||||
def getdict(data):
|
def getdict(data):
|
||||||
d = {}
|
d = {}
|
||||||
fp = StringIO(data)
|
fp = StringIO(data)
|
||||||
stack = []
|
stack = []
|
||||||
while 1:
|
while 1:
|
||||||
c = fp.read(1)
|
c = fp.read(1)
|
||||||
if not c: break
|
if not c:
|
||||||
|
break
|
||||||
b0 = ord(c)
|
b0 = ord(c)
|
||||||
if b0 <= 21:
|
if b0 <= 21:
|
||||||
d[b0] = stack
|
d[b0] = stack
|
||||||
|
@ -145,19 +149,21 @@ def getdict(data):
|
||||||
else:
|
else:
|
||||||
b1 = ord(fp.read(1))
|
b1 = ord(fp.read(1))
|
||||||
if 247 <= b0 and b0 <= 250:
|
if 247 <= b0 and b0 <= 250:
|
||||||
value = ((b0-247)<<8)+b1+108
|
value = ((b0-247) << 8)+b1+108
|
||||||
elif 251 <= b0 and b0 <= 254:
|
elif 251 <= b0 and b0 <= 254:
|
||||||
value = -((b0-251)<<8)-b1-108
|
value = -((b0-251) << 8)-b1-108
|
||||||
else:
|
else:
|
||||||
b2 = ord(fp.read(1))
|
b2 = ord(fp.read(1))
|
||||||
if 128 <= b1: b1 -= 256
|
if 128 <= b1:
|
||||||
|
b1 -= 256
|
||||||
if b0 == 28:
|
if b0 == 28:
|
||||||
value = b1<<8 | b2
|
value = b1 << 8 | b2
|
||||||
else:
|
else:
|
||||||
value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
|
value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
|
||||||
stack.append(value)
|
stack.append(value)
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
class CFFFont(object):
|
class CFFFont(object):
|
||||||
|
|
||||||
STANDARD_STRINGS = (
|
STANDARD_STRINGS = (
|
||||||
|
@ -239,7 +245,7 @@ class CFFFont(object):
|
||||||
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
|
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
|
||||||
'001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
|
'001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
|
||||||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||||
)
|
)
|
||||||
|
|
||||||
class INDEX(object):
|
class INDEX(object):
|
||||||
|
|
||||||
|
@ -264,13 +270,13 @@ class CFFFont(object):
|
||||||
return self.fp.read(self.offsets[i+1]-self.offsets[i])
|
return self.fp.read(self.offsets[i+1]-self.offsets[i])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter( self[i] for i in xrange(len(self)) )
|
return iter(self[i] for i in xrange(len(self)))
|
||||||
|
|
||||||
def __init__(self, name, fp):
|
def __init__(self, name, fp):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
# Header
|
# Header
|
||||||
(_major,_minor,hdrsize,offsize) = struct.unpack('BBBB', self.fp.read(4))
|
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
|
||||||
self.fp.read(hdrsize-4)
|
self.fp.read(hdrsize-4)
|
||||||
# Name INDEX
|
# Name INDEX
|
||||||
self.name_index = self.INDEX(self.fp)
|
self.name_index = self.INDEX(self.fp)
|
||||||
|
@ -297,7 +303,7 @@ class CFFFont(object):
|
||||||
if format == '\x00':
|
if format == '\x00':
|
||||||
# Format 0
|
# Format 0
|
||||||
(n,) = struct.unpack('B', self.fp.read(1))
|
(n,) = struct.unpack('B', self.fp.read(1))
|
||||||
for (code,gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
|
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
|
||||||
self.code2gid[code] = gid
|
self.code2gid[code] = gid
|
||||||
self.gid2code[gid] = code
|
self.gid2code[gid] = code
|
||||||
elif format == '\x01':
|
elif format == '\x01':
|
||||||
|
@ -305,8 +311,8 @@ class CFFFont(object):
|
||||||
(n,) = struct.unpack('B', self.fp.read(1))
|
(n,) = struct.unpack('B', self.fp.read(1))
|
||||||
code = 0
|
code = 0
|
||||||
for i in xrange(n):
|
for i in xrange(n):
|
||||||
(first,nleft) = struct.unpack('BB', self.fp.read(2))
|
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
||||||
for gid in xrange(first,first+nleft+1):
|
for gid in xrange(first, first+nleft+1):
|
||||||
self.code2gid[code] = gid
|
self.code2gid[code] = gid
|
||||||
self.gid2code[gid] = code
|
self.gid2code[gid] = code
|
||||||
code += 1
|
code += 1
|
||||||
|
@ -320,7 +326,7 @@ class CFFFont(object):
|
||||||
if format == '\x00':
|
if format == '\x00':
|
||||||
# Format 0
|
# Format 0
|
||||||
n = self.nglyphs-1
|
n = self.nglyphs-1
|
||||||
for (gid,sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
|
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
|
||||||
gid += 1
|
gid += 1
|
||||||
name = self.getstr(sid)
|
name = self.getstr(sid)
|
||||||
self.name2gid[name] = gid
|
self.name2gid[name] = gid
|
||||||
|
@ -330,8 +336,8 @@ class CFFFont(object):
|
||||||
(n,) = struct.unpack('B', self.fp.read(1))
|
(n,) = struct.unpack('B', self.fp.read(1))
|
||||||
sid = 0
|
sid = 0
|
||||||
for i in xrange(n):
|
for i in xrange(n):
|
||||||
(first,nleft) = struct.unpack('BB', self.fp.read(2))
|
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
||||||
for gid in xrange(first,first+nleft+1):
|
for gid in xrange(first, first+nleft+1):
|
||||||
name = self.getstr(sid)
|
name = self.getstr(sid)
|
||||||
self.name2gid[name] = gid
|
self.name2gid[name] = gid
|
||||||
self.gid2name[gid] = name
|
self.gid2name[gid] = name
|
||||||
|
@ -356,7 +362,8 @@ class CFFFont(object):
|
||||||
##
|
##
|
||||||
class TrueTypeFont(object):
|
class TrueTypeFont(object):
|
||||||
|
|
||||||
class CMapNotFound(Exception): pass
|
class CMapNotFound(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
def __init__(self, name, fp):
|
def __init__(self, name, fp):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
@ -389,15 +396,16 @@ class TrueTypeFont(object):
|
||||||
elif fmttype == 2:
|
elif fmttype == 2:
|
||||||
subheaderkeys = struct.unpack('>256H', fp.read(512))
|
subheaderkeys = struct.unpack('>256H', fp.read(512))
|
||||||
firstbytes = [0]*8192
|
firstbytes = [0]*8192
|
||||||
for (i,k) in enumerate(subheaderkeys):
|
for (i, k) in enumerate(subheaderkeys):
|
||||||
firstbytes[k/8] = i
|
firstbytes[k/8] = i
|
||||||
nhdrs = max(subheaderkeys)/8 + 1
|
nhdrs = max(subheaderkeys)/8 + 1
|
||||||
hdrs = []
|
hdrs = []
|
||||||
for i in xrange(nhdrs):
|
for i in xrange(nhdrs):
|
||||||
(firstcode,entcount,delta,offset) = struct.unpack('>HHhH', fp.read(8))
|
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
|
||||||
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
|
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
|
||||||
for (i,firstcode,entcount,delta,pos) in hdrs:
|
for (i, firstcode, entcount, delta, pos) in hdrs:
|
||||||
if not entcount: continue
|
if not entcount:
|
||||||
|
continue
|
||||||
first = firstcode + (firstbytes[i] << 8)
|
first = firstcode + (firstbytes[i] << 8)
|
||||||
fp.seek(pos)
|
fp.seek(pos)
|
||||||
for c in xrange(entcount):
|
for c in xrange(entcount):
|
||||||
|
@ -414,7 +422,7 @@ class TrueTypeFont(object):
|
||||||
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
|
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
|
||||||
pos = fp.tell()
|
pos = fp.tell()
|
||||||
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||||
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
|
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
|
||||||
if idr:
|
if idr:
|
||||||
fp.seek(pos+idr)
|
fp.seek(pos+idr)
|
||||||
for c in xrange(sc, ec+1):
|
for c in xrange(sc, ec+1):
|
||||||
|
@ -426,16 +434,19 @@ class TrueTypeFont(object):
|
||||||
assert 0
|
assert 0
|
||||||
# create unicode map
|
# create unicode map
|
||||||
unicode_map = FileUnicodeMap()
|
unicode_map = FileUnicodeMap()
|
||||||
for (char,gid) in char2gid.iteritems():
|
for (char, gid) in char2gid.iteritems():
|
||||||
unicode_map.add_cid2unichr(gid, char)
|
unicode_map.add_cid2unichr(gid, char)
|
||||||
return unicode_map
|
return unicode_map
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
##
|
##
|
||||||
|
class PDFFontError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
class PDFFontError(PDFException): pass
|
|
||||||
class PDFUnicodeNotDefined(PDFFontError): pass
|
class PDFUnicodeNotDefined(PDFFontError):
|
||||||
|
pass
|
||||||
|
|
||||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||||
LITERAL_TYPE1C = LIT('Type1C')
|
LITERAL_TYPE1C = LIT('Type1C')
|
||||||
|
@ -456,7 +467,7 @@ class PDFFont(object):
|
||||||
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
||||||
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
||||||
self.leading = num_value(descriptor.get('Leading', 0))
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
|
||||||
self.hscale = self.vscale = .001
|
self.hscale = self.vscale = .001
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -474,6 +485,7 @@ class PDFFont(object):
|
||||||
|
|
||||||
def get_ascent(self):
|
def get_ascent(self):
|
||||||
return self.ascent * self.vscale
|
return self.ascent * self.vscale
|
||||||
|
|
||||||
def get_descent(self):
|
def get_descent(self):
|
||||||
return self.descent * self.vscale
|
return self.descent * self.vscale
|
||||||
|
|
||||||
|
@ -482,6 +494,7 @@ class PDFFont(object):
|
||||||
if w == 0:
|
if w == 0:
|
||||||
w = -self.default_width
|
w = -self.default_width
|
||||||
return w * self.hscale
|
return w * self.hscale
|
||||||
|
|
||||||
def get_height(self):
|
def get_height(self):
|
||||||
h = self.bbox[3]-self.bbox[1]
|
h = self.bbox[3]-self.bbox[1]
|
||||||
if h == 0:
|
if h == 0:
|
||||||
|
@ -501,7 +514,7 @@ class PDFFont(object):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def string_width(self, s):
|
def string_width(self, s):
|
||||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||||
|
|
||||||
|
|
||||||
# PDFSimpleFont
|
# PDFSimpleFont
|
||||||
|
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise PDFUnicodeNotDefined(None, cid)
|
raise PDFUnicodeNotDefined(None, cid)
|
||||||
|
|
||||||
|
|
||||||
# PDFType1Font
|
# PDFType1Font
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
|
@ -557,7 +571,7 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
lastchar = int_value(spec.get('LastChar', 255))
|
lastchar = int_value(spec.get('LastChar', 255))
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
||||||
# try to recover the missing encoding info from the font file.
|
# try to recover the missing encoding info from the font file.
|
||||||
|
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
# PDFTrueTypeFont
|
# PDFTrueTypeFont
|
||||||
class PDFTrueTypeFont(PDFType1Font):
|
class PDFTrueTypeFont(PDFType1Font):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
# PDFType3Font
|
# PDFType3Font
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
|
@ -584,16 +600,16 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
lastchar = int_value(spec.get('LastChar', 0))
|
lastchar = int_value(spec.get('LastChar', 0))
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
descriptor = {'Ascent':0, 'Descent':0,
|
descriptor = {'Ascent': 0, 'Descent': 0,
|
||||||
'FontBBox':spec['FontBBox']}
|
'FontBBox': spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
||||||
(_,self.descent,_,self.ascent) = self.bbox
|
(_, self.descent, _, self.ascent) = self.bbox
|
||||||
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
|
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -657,10 +673,10 @@ class PDFCIDFont(PDFFont):
|
||||||
if self.vertical:
|
if self.vertical:
|
||||||
# writing mode: vertical
|
# writing mode: vertical
|
||||||
widths = get_widths2(list_value(spec.get('W2', [])))
|
widths = get_widths2(list_value(spec.get('W2', [])))
|
||||||
self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() )
|
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
|
||||||
(vy,w) = spec.get('DW2', [880, -1000])
|
(vy, w) = spec.get('DW2', [880, -1000])
|
||||||
self.default_disp = (None,vy)
|
self.default_disp = (None, vy)
|
||||||
widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() )
|
widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
|
||||||
default_width = w
|
default_width = w
|
||||||
else:
|
else:
|
||||||
# writing mode: horizontal
|
# writing mode: horizontal
|
||||||
|
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def to_unichr(self, cid):
|
def to_unichr(self, cid):
|
||||||
try:
|
try:
|
||||||
if not self.unicode_map: raise KeyError(cid)
|
if not self.unicode_map:
|
||||||
|
raise KeyError(cid)
|
||||||
return self.unicode_map.get_unichr(cid)
|
return self.unicode_map.get_unichr(cid)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
@ -705,4 +722,5 @@ def main(argv):
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -30,8 +30,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
class PDFResourceError(PDFException): pass
|
class PDFResourceError(PDFException):
|
||||||
class PDFInterpreterError(PDFException): pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFInterpreterError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## Constants
|
## Constants
|
||||||
|
@ -120,6 +124,7 @@ class PDFGraphicState(object):
|
||||||
(self.linewidth, self.linecap, self.linejoin,
|
(self.linewidth, self.linecap, self.linejoin,
|
||||||
self.miterlimit, self.dash, self.intent, self.flatness))
|
self.miterlimit, self.dash, self.intent, self.flatness))
|
||||||
|
|
||||||
|
|
||||||
## Resource Manager
|
## Resource Manager
|
||||||
##
|
##
|
||||||
class PDFResourceManager(object):
|
class PDFResourceManager(object):
|
||||||
|
@ -152,7 +157,8 @@ class PDFResourceManager(object):
|
||||||
try:
|
try:
|
||||||
return CMapDB.get_cmap(cmapname)
|
return CMapDB.get_cmap(cmapname)
|
||||||
except CMapDB.CMapNotFound:
|
except CMapDB.CMapNotFound:
|
||||||
if strict: raise
|
if strict:
|
||||||
|
raise
|
||||||
return CMap()
|
return CMap()
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
|
@ -195,7 +201,7 @@ class PDFResourceManager(object):
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFFontError('Invalid Font spec: %r' % spec)
|
raise PDFFontError('Invalid Font spec: %r' % spec)
|
||||||
font = PDFType1Font(self, spec) # this is so wrong!
|
font = PDFType1Font(self, spec) # this is so wrong!
|
||||||
if objid and self.caching:
|
if objid and self.caching:
|
||||||
self._cached_fonts[objid] = font
|
self._cached_fonts[objid] = font
|
||||||
return font
|
return font
|
||||||
|
@ -227,12 +233,14 @@ class PDFContentParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillbuf(self):
|
def fillbuf(self):
|
||||||
if self.charpos < len(self.buf): return
|
if self.charpos < len(self.buf):
|
||||||
|
return
|
||||||
while 1:
|
while 1:
|
||||||
self.fillfp()
|
self.fillfp()
|
||||||
self.bufpos = self.fp.tell()
|
self.bufpos = self.fp.tell()
|
||||||
self.buf = self.fp.read(self.BUFSIZ)
|
self.buf = self.fp.read(self.BUFSIZ)
|
||||||
if self.buf: break
|
if self.buf:
|
||||||
|
break
|
||||||
self.fp = None
|
self.fp = None
|
||||||
self.charpos = 0
|
self.charpos = 0
|
||||||
return
|
return
|
||||||
|
@ -263,7 +271,7 @@ class PDFContentParser(PSStackParser):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
data += self.buf[self.charpos:]
|
data += self.buf[self.charpos:]
|
||||||
self.charpos = len(self.buf)
|
self.charpos = len(self.buf)
|
||||||
data = data[:-(len(target)+1)] # strip the last part
|
data = data[:-(len(target)+1)] # strip the last part
|
||||||
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
|
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
|
||||||
return (pos, data)
|
return (pos, data)
|
||||||
|
|
||||||
|
@ -274,6 +282,7 @@ class PDFContentParser(PSStackParser):
|
||||||
KEYWORD_BI = KWD('BI')
|
KEYWORD_BI = KWD('BI')
|
||||||
KEYWORD_ID = KWD('ID')
|
KEYWORD_ID = KWD('ID')
|
||||||
KEYWORD_EI = KWD('EI')
|
KEYWORD_EI = KWD('EI')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
if token is self.KEYWORD_BI:
|
if token is self.KEYWORD_BI:
|
||||||
# inline image within a content stream
|
# inline image within a content stream
|
||||||
|
@ -283,13 +292,14 @@ class PDFContentParser(PSStackParser):
|
||||||
(_, objs) = self.end_type('inline')
|
(_, objs) = self.end_type('inline')
|
||||||
if len(objs) % 2 != 0:
|
if len(objs) % 2 != 0:
|
||||||
raise PSTypeError('Invalid dictionary construct: %r' % objs)
|
raise PSTypeError('Invalid dictionary construct: %r' % objs)
|
||||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
|
||||||
(pos, data) = self.get_inline_data(pos+len('ID '))
|
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||||
obj = PDFStream(d, data)
|
obj = PDFStream(d, data)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
self.push((pos, self.KEYWORD_EI))
|
self.push((pos, self.KEYWORD_EI))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT:
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
return
|
return
|
||||||
|
@ -316,7 +326,9 @@ class PDFPageInterpreter(object):
|
||||||
self.fontmap = {}
|
self.fontmap = {}
|
||||||
self.xobjmap = {}
|
self.xobjmap = {}
|
||||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||||
if not resources: return
|
if not resources:
|
||||||
|
return
|
||||||
|
|
||||||
def get_colorspace(spec):
|
def get_colorspace(spec):
|
||||||
if isinstance(spec, list):
|
if isinstance(spec, list):
|
||||||
name = literal_name(spec[0])
|
name = literal_name(spec[0])
|
||||||
|
@ -328,23 +340,23 @@ class PDFPageInterpreter(object):
|
||||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
return PREDEFINED_COLORSPACE.get(name)
|
return PREDEFINED_COLORSPACE.get(name)
|
||||||
for (k,v) in dict_value(resources).iteritems():
|
for (k, v) in dict_value(resources).iteritems():
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'Resource: %r: %r' % (k,v)
|
print >>sys.stderr, 'Resource: %r: %r' % (k, v)
|
||||||
if k == 'Font':
|
if k == 'Font':
|
||||||
for (fontid,spec) in dict_value(v).iteritems():
|
for (fontid, spec) in dict_value(v).iteritems():
|
||||||
objid = None
|
objid = None
|
||||||
if isinstance(spec, PDFObjRef):
|
if isinstance(spec, PDFObjRef):
|
||||||
objid = spec.objid
|
objid = spec.objid
|
||||||
spec = dict_value(spec)
|
spec = dict_value(spec)
|
||||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||||
elif k == 'ColorSpace':
|
elif k == 'ColorSpace':
|
||||||
for (csid,spec) in dict_value(v).iteritems():
|
for (csid, spec) in dict_value(v).iteritems():
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||||
elif k == 'ProcSet':
|
elif k == 'ProcSet':
|
||||||
self.rsrcmgr.get_procset(list_value(v))
|
self.rsrcmgr.get_procset(list_value(v))
|
||||||
elif k == 'XObject':
|
elif k == 'XObject':
|
||||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
for (xobjid, xobjstrm) in dict_value(v).iteritems():
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -371,7 +383,8 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def pop(self, n):
|
def pop(self, n):
|
||||||
if n == 0: return []
|
if n == 0:
|
||||||
|
return []
|
||||||
x = self.argstack[-n:]
|
x = self.argstack[-n:]
|
||||||
self.argstack = self.argstack[:-n]
|
self.argstack = self.argstack[:-n]
|
||||||
return x
|
return x
|
||||||
|
@ -388,6 +401,7 @@ class PDFPageInterpreter(object):
|
||||||
def do_q(self):
|
def do_q(self):
|
||||||
self.gstack.append(self.get_current_state())
|
self.gstack.append(self.get_current_state())
|
||||||
return
|
return
|
||||||
|
|
||||||
# grestore
|
# grestore
|
||||||
def do_Q(self):
|
def do_Q(self):
|
||||||
if self.gstack:
|
if self.gstack:
|
||||||
|
@ -396,7 +410,7 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
# concat-matrix
|
# concat-matrix
|
||||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
||||||
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
|
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
|
||||||
self.device.set_ctm(self.ctm)
|
self.device.set_ctm(self.ctm)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -404,30 +418,37 @@ class PDFPageInterpreter(object):
|
||||||
def do_w(self, linewidth):
|
def do_w(self, linewidth):
|
||||||
self.graphicstate.linewidth = linewidth
|
self.graphicstate.linewidth = linewidth
|
||||||
return
|
return
|
||||||
|
|
||||||
# setlinecap
|
# setlinecap
|
||||||
def do_J(self, linecap):
|
def do_J(self, linecap):
|
||||||
self.graphicstate.linecap = linecap
|
self.graphicstate.linecap = linecap
|
||||||
return
|
return
|
||||||
|
|
||||||
# setlinejoin
|
# setlinejoin
|
||||||
def do_j(self, linejoin):
|
def do_j(self, linejoin):
|
||||||
self.graphicstate.linejoin = linejoin
|
self.graphicstate.linejoin = linejoin
|
||||||
return
|
return
|
||||||
|
|
||||||
# setmiterlimit
|
# setmiterlimit
|
||||||
def do_M(self, miterlimit):
|
def do_M(self, miterlimit):
|
||||||
self.graphicstate.miterlimit = miterlimit
|
self.graphicstate.miterlimit = miterlimit
|
||||||
return
|
return
|
||||||
|
|
||||||
# setdash
|
# setdash
|
||||||
def do_d(self, dash, phase):
|
def do_d(self, dash, phase):
|
||||||
self.graphicstate.dash = (dash, phase)
|
self.graphicstate.dash = (dash, phase)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setintent
|
# setintent
|
||||||
def do_ri(self, intent):
|
def do_ri(self, intent):
|
||||||
self.graphicstate.intent = intent
|
self.graphicstate.intent = intent
|
||||||
return
|
return
|
||||||
|
|
||||||
# setflatness
|
# setflatness
|
||||||
def do_i(self, flatness):
|
def do_i(self, flatness):
|
||||||
self.graphicstate.flatness = flatness
|
self.graphicstate.flatness = flatness
|
||||||
return
|
return
|
||||||
|
|
||||||
# load-gstate
|
# load-gstate
|
||||||
def do_gs(self, name):
|
def do_gs(self, name):
|
||||||
#XXX
|
#XXX
|
||||||
|
@ -435,34 +456,40 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
# moveto
|
# moveto
|
||||||
def do_m(self, x, y):
|
def do_m(self, x, y):
|
||||||
self.curpath.append(('m',x,y))
|
self.curpath.append(('m', x, y))
|
||||||
return
|
return
|
||||||
|
|
||||||
# lineto
|
# lineto
|
||||||
def do_l(self, x, y):
|
def do_l(self, x, y):
|
||||||
self.curpath.append(('l',x,y))
|
self.curpath.append(('l', x, y))
|
||||||
return
|
return
|
||||||
|
|
||||||
# curveto
|
# curveto
|
||||||
def do_c(self, x1, y1, x2, y2, x3, y3):
|
def do_c(self, x1, y1, x2, y2, x3, y3):
|
||||||
self.curpath.append(('c',x1,y1,x2,y2,x3,y3))
|
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
|
||||||
return
|
return
|
||||||
|
|
||||||
# urveto
|
# urveto
|
||||||
def do_v(self, x2, y2, x3, y3):
|
def do_v(self, x2, y2, x3, y3):
|
||||||
self.curpath.append(('v',x2,y2,x3,y3))
|
self.curpath.append(('v', x2, y2, x3, y3))
|
||||||
return
|
return
|
||||||
|
|
||||||
# rveto
|
# rveto
|
||||||
def do_y(self, x1, y1, x3, y3):
|
def do_y(self, x1, y1, x3, y3):
|
||||||
self.curpath.append(('y',x1,y1,x3,y3))
|
self.curpath.append(('y', x1, y1, x3, y3))
|
||||||
return
|
return
|
||||||
|
|
||||||
# closepath
|
# closepath
|
||||||
def do_h(self):
|
def do_h(self):
|
||||||
self.curpath.append(('h',))
|
self.curpath.append(('h',))
|
||||||
return
|
return
|
||||||
|
|
||||||
# rectangle
|
# rectangle
|
||||||
def do_re(self, x, y, w, h):
|
def do_re(self, x, y, w, h):
|
||||||
self.curpath.append(('m',x,y))
|
self.curpath.append(('m', x, y))
|
||||||
self.curpath.append(('l',x+w,y))
|
self.curpath.append(('l', x+w, y))
|
||||||
self.curpath.append(('l',x+w,y+h))
|
self.curpath.append(('l', x+w, y+h))
|
||||||
self.curpath.append(('l',x,y+h))
|
self.curpath.append(('l', x, y+h))
|
||||||
self.curpath.append(('h',))
|
self.curpath.append(('h',))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -471,11 +498,13 @@ class PDFPageInterpreter(object):
|
||||||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# close-and-stroke
|
# close-and-stroke
|
||||||
def do_s(self):
|
def do_s(self):
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_S()
|
self.do_S()
|
||||||
return
|
return
|
||||||
|
|
||||||
# fill
|
# fill
|
||||||
def do_f(self):
|
def do_f(self):
|
||||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||||
|
@ -483,68 +512,85 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
# fill (obsolete)
|
# fill (obsolete)
|
||||||
do_F = do_f
|
do_F = do_f
|
||||||
|
|
||||||
# fill-even-odd
|
# fill-even-odd
|
||||||
def do_f_a(self):
|
def do_f_a(self):
|
||||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# fill-and-stroke
|
# fill-and-stroke
|
||||||
def do_B(self):
|
def do_B(self):
|
||||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# fill-and-stroke-even-odd
|
# fill-and-stroke-even-odd
|
||||||
def do_B_a(self):
|
def do_B_a(self):
|
||||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# close-fill-and-stroke
|
# close-fill-and-stroke
|
||||||
def do_b(self):
|
def do_b(self):
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_B()
|
self.do_B()
|
||||||
return
|
return
|
||||||
|
|
||||||
# close-fill-and-stroke-even-odd
|
# close-fill-and-stroke-even-odd
|
||||||
def do_b_a(self):
|
def do_b_a(self):
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_B_a()
|
self.do_B_a()
|
||||||
return
|
return
|
||||||
|
|
||||||
# close-only
|
# close-only
|
||||||
def do_n(self):
|
def do_n(self):
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
# clip
|
# clip
|
||||||
def do_W(self): return
|
def do_W(self):
|
||||||
|
return
|
||||||
|
|
||||||
# clip-even-odd
|
# clip-even-odd
|
||||||
def do_W_a(self): return
|
def do_W_a(self):
|
||||||
|
return
|
||||||
|
|
||||||
# setcolorspace-stroking
|
# setcolorspace-stroking
|
||||||
def do_CS(self, name):
|
def do_CS(self, name):
|
||||||
self.scs = self.csmap[literal_name(name)]
|
self.scs = self.csmap[literal_name(name)]
|
||||||
return
|
return
|
||||||
|
|
||||||
# setcolorspace-non-stroking
|
# setcolorspace-non-stroking
|
||||||
def do_cs(self, name):
|
def do_cs(self, name):
|
||||||
self.ncs = self.csmap[literal_name(name)]
|
self.ncs = self.csmap[literal_name(name)]
|
||||||
return
|
return
|
||||||
|
|
||||||
# setgray-stroking
|
# setgray-stroking
|
||||||
def do_G(self, gray):
|
def do_G(self, gray):
|
||||||
#self.do_CS(LITERAL_DEVICE_GRAY)
|
#self.do_CS(LITERAL_DEVICE_GRAY)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setgray-non-stroking
|
# setgray-non-stroking
|
||||||
def do_g(self, gray):
|
def do_g(self, gray):
|
||||||
#self.do_cs(LITERAL_DEVICE_GRAY)
|
#self.do_cs(LITERAL_DEVICE_GRAY)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setrgb-stroking
|
# setrgb-stroking
|
||||||
def do_RG(self, r, g, b):
|
def do_RG(self, r, g, b):
|
||||||
#self.do_CS(LITERAL_DEVICE_RGB)
|
#self.do_CS(LITERAL_DEVICE_RGB)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setrgb-non-stroking
|
# setrgb-non-stroking
|
||||||
def do_rg(self, r, g, b):
|
def do_rg(self, r, g, b):
|
||||||
#self.do_cs(LITERAL_DEVICE_RGB)
|
#self.do_cs(LITERAL_DEVICE_RGB)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setcmyk-stroking
|
# setcmyk-stroking
|
||||||
def do_K(self, c, m, y, k):
|
def do_K(self, c, m, y, k):
|
||||||
#self.do_CS(LITERAL_DEVICE_CMYK)
|
#self.do_CS(LITERAL_DEVICE_CMYK)
|
||||||
return
|
return
|
||||||
|
|
||||||
# setcmyk-non-stroking
|
# setcmyk-non-stroking
|
||||||
def do_k(self, c, m, y, k):
|
def do_k(self, c, m, y, k):
|
||||||
#self.do_cs(LITERAL_DEVICE_CMYK)
|
#self.do_cs(LITERAL_DEVICE_CMYK)
|
||||||
|
@ -560,6 +606,7 @@ class PDFPageInterpreter(object):
|
||||||
n = 1
|
n = 1
|
||||||
self.pop(n)
|
self.pop(n)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_scn(self):
|
def do_scn(self):
|
||||||
if self.ncs:
|
if self.ncs:
|
||||||
n = self.ncs.ncomponents
|
n = self.ncs.ncomponents
|
||||||
|
@ -569,42 +616,53 @@ class PDFPageInterpreter(object):
|
||||||
n = 1
|
n = 1
|
||||||
self.pop(n)
|
self.pop(n)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_SC(self):
|
def do_SC(self):
|
||||||
self.do_SCN()
|
self.do_SCN()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_sc(self):
|
def do_sc(self):
|
||||||
self.do_scn()
|
self.do_scn()
|
||||||
return
|
return
|
||||||
|
|
||||||
# sharing-name
|
# sharing-name
|
||||||
def do_sh(self, name): return
|
def do_sh(self, name):
|
||||||
|
return
|
||||||
|
|
||||||
# begin-text
|
# begin-text
|
||||||
def do_BT(self):
|
def do_BT(self):
|
||||||
self.textstate.reset()
|
self.textstate.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
# end-text
|
# end-text
|
||||||
def do_ET(self):
|
def do_ET(self):
|
||||||
return
|
return
|
||||||
|
|
||||||
# begin-compat
|
# begin-compat
|
||||||
def do_BX(self): return
|
def do_BX(self):
|
||||||
|
return
|
||||||
|
|
||||||
# end-compat
|
# end-compat
|
||||||
def do_EX(self): return
|
def do_EX(self):
|
||||||
|
return
|
||||||
|
|
||||||
# marked content operators
|
# marked content operators
|
||||||
def do_MP(self, tag):
|
def do_MP(self, tag):
|
||||||
self.device.do_tag(tag)
|
self.device.do_tag(tag)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_DP(self, tag, props):
|
def do_DP(self, tag, props):
|
||||||
self.device.do_tag(tag, props)
|
self.device.do_tag(tag, props)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BMC(self, tag):
|
def do_BMC(self, tag):
|
||||||
self.device.begin_tag(tag)
|
self.device.begin_tag(tag)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BDC(self, tag, props):
|
def do_BDC(self, tag, props):
|
||||||
self.device.begin_tag(tag, props)
|
self.device.begin_tag(tag, props)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_EMC(self):
|
def do_EMC(self):
|
||||||
self.device.end_tag()
|
self.device.end_tag()
|
||||||
return
|
return
|
||||||
|
@ -613,18 +671,22 @@ class PDFPageInterpreter(object):
|
||||||
def do_Tc(self, space):
|
def do_Tc(self, space):
|
||||||
self.textstate.charspace = space
|
self.textstate.charspace = space
|
||||||
return
|
return
|
||||||
|
|
||||||
# setwordspace
|
# setwordspace
|
||||||
def do_Tw(self, space):
|
def do_Tw(self, space):
|
||||||
self.textstate.wordspace = space
|
self.textstate.wordspace = space
|
||||||
return
|
return
|
||||||
|
|
||||||
# textscale
|
# textscale
|
||||||
def do_Tz(self, scale):
|
def do_Tz(self, scale):
|
||||||
self.textstate.scaling = scale
|
self.textstate.scaling = scale
|
||||||
return
|
return
|
||||||
|
|
||||||
# setleading
|
# setleading
|
||||||
def do_TL(self, leading):
|
def do_TL(self, leading):
|
||||||
self.textstate.leading = -leading
|
self.textstate.leading = -leading
|
||||||
return
|
return
|
||||||
|
|
||||||
# selectfont
|
# selectfont
|
||||||
def do_Tf(self, fontid, fontsize):
|
def do_Tf(self, fontid, fontsize):
|
||||||
try:
|
try:
|
||||||
|
@ -635,10 +697,12 @@ class PDFPageInterpreter(object):
|
||||||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||||
self.textstate.fontsize = fontsize
|
self.textstate.fontsize = fontsize
|
||||||
return
|
return
|
||||||
|
|
||||||
# setrendering
|
# setrendering
|
||||||
def do_Tr(self, render):
|
def do_Tr(self, render):
|
||||||
self.textstate.render = render
|
self.textstate.render = render
|
||||||
return
|
return
|
||||||
|
|
||||||
# settextrise
|
# settextrise
|
||||||
def do_Ts(self, rise):
|
def do_Ts(self, rise):
|
||||||
self.textstate.rise = rise
|
self.textstate.rise = rise
|
||||||
|
@ -646,49 +710,55 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
# text-move
|
# text-move
|
||||||
def do_Td(self, tx, ty):
|
def do_Td(self, tx, ty):
|
||||||
(a,b,c,d,e,f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
|
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate)
|
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
|
||||||
return
|
return
|
||||||
|
|
||||||
# text-move
|
# text-move
|
||||||
def do_TD(self, tx, ty):
|
def do_TD(self, tx, ty):
|
||||||
(a,b,c,d,e,f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
|
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||||
self.textstate.leading = ty
|
self.textstate.leading = ty
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate)
|
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
|
||||||
return
|
return
|
||||||
|
|
||||||
# textmatrix
|
# textmatrix
|
||||||
def do_Tm(self, a,b,c,d,e,f):
|
def do_Tm(self, a, b, c, d, e, f):
|
||||||
self.textstate.matrix = (a,b,c,d,e,f)
|
self.textstate.matrix = (a, b, c, d, e, f)
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
# nextline
|
# nextline
|
||||||
def do_T_a(self):
|
def do_T_a(self):
|
||||||
(a,b,c,d,e,f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f)
|
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
# show-pos
|
# show-pos
|
||||||
def do_TJ(self, seq):
|
def do_TJ(self, seq):
|
||||||
#print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
|
||||||
if self.textstate.font is None:
|
if self.textstate.font is None:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFInterpreterError('No font specified!')
|
raise PDFInterpreterError('No font specified!')
|
||||||
return
|
return
|
||||||
self.device.render_string(self.textstate, seq)
|
self.device.render_string(self.textstate, seq)
|
||||||
return
|
return
|
||||||
|
|
||||||
# show
|
# show
|
||||||
def do_Tj(self, s):
|
def do_Tj(self, s):
|
||||||
self.do_TJ([s])
|
self.do_TJ([s])
|
||||||
return
|
return
|
||||||
|
|
||||||
# quote
|
# quote
|
||||||
def do__q(self, s):
|
def do__q(self, s):
|
||||||
self.do_T_a()
|
self.do_T_a()
|
||||||
self.do_TJ([s])
|
self.do_TJ([s])
|
||||||
return
|
return
|
||||||
|
|
||||||
# doublequote
|
# doublequote
|
||||||
def do__w(self, aw, ac, s):
|
def do__w(self, aw, ac, s):
|
||||||
self.do_Tw(aw)
|
self.do_Tw(aw)
|
||||||
|
@ -697,14 +767,16 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
# inline image
|
# inline image
|
||||||
def do_BI(self): # never called
|
def do_BI(self): # never called
|
||||||
return
|
return
|
||||||
def do_ID(self): # never called
|
|
||||||
|
def do_ID(self): # never called
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_EI(self, obj):
|
def do_EI(self, obj):
|
||||||
if 'W' in obj and 'H' in obj:
|
if 'W' in obj and 'H' in obj:
|
||||||
iobjid = str(id(obj))
|
iobjid = str(id(obj))
|
||||||
self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY)
|
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||||
self.device.render_image(iobjid, obj)
|
self.device.render_image(iobjid, obj)
|
||||||
self.device.end_figure(iobjid)
|
self.device.end_figure(iobjid)
|
||||||
return
|
return
|
||||||
|
@ -733,7 +805,7 @@ class PDFPageInterpreter(object):
|
||||||
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||||
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||||
self.device.render_image(xobjid, xobj)
|
self.device.render_image(xobjid, xobj)
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
else:
|
else:
|
||||||
|
@ -744,15 +816,15 @@ class PDFPageInterpreter(object):
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>sys.stderr, 'Processing page: %r' % page
|
print >>sys.stderr, 'Processing page: %r' % page
|
||||||
(x0,y0,x1,y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
if page.rotate == 90:
|
if page.rotate == 90:
|
||||||
ctm = (0,-1,1,0, -y0,x1)
|
ctm = (0, -1, 1, 0, -y0, x1)
|
||||||
elif page.rotate == 180:
|
elif page.rotate == 180:
|
||||||
ctm = (-1,0,0,-1, x1,y1)
|
ctm = (-1, 0, 0, -1, x1, y1)
|
||||||
elif page.rotate == 270:
|
elif page.rotate == 270:
|
||||||
ctm = (0,1,-1,0, y1,-x0)
|
ctm = (0, 1, -1, 0, y1, -x0)
|
||||||
else:
|
else:
|
||||||
ctm = (1,0,0,1, -x0,-y0)
|
ctm = (1, 0, 0, 1, -x0, -y0)
|
||||||
self.device.begin_page(page, ctm)
|
self.device.begin_page(page, ctm)
|
||||||
self.render_contents(page.resources, page.contents, ctm=ctm)
|
self.render_contents(page.resources, page.contents, ctm=ctm)
|
||||||
self.device.end_page(page)
|
self.device.end_page(page)
|
||||||
|
@ -764,7 +836,7 @@ class PDFPageInterpreter(object):
|
||||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
|
print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
|
||||||
(resources, streams, ctm))
|
(resources, streams, ctm))
|
||||||
self.init_resources(resources)
|
self.init_resources(resources)
|
||||||
self.init_state(ctm)
|
self.init_state(ctm)
|
||||||
self.execute(list_value(streams))
|
self.execute(list_value(streams))
|
||||||
|
@ -778,12 +850,12 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(_,obj) = parser.nextobject()
|
(_, obj) = parser.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
if isinstance(obj, PSKeyword):
|
if isinstance(obj, PSKeyword):
|
||||||
name = keyword_name(obj)
|
name = keyword_name(obj)
|
||||||
method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q')
|
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
|
||||||
if hasattr(self, method):
|
if hasattr(self, method):
|
||||||
func = getattr(self, method)
|
func = getattr(self, method)
|
||||||
nargs = func.func_code.co_argcount-1
|
nargs = func.func_code.co_argcount-1
|
||||||
|
|
|
@ -63,7 +63,7 @@ class PDFPage(object):
|
||||||
else:
|
else:
|
||||||
contents = []
|
contents = []
|
||||||
if not isinstance(contents, list):
|
if not isinstance(contents, list):
|
||||||
contents = [ contents ]
|
contents = [contents]
|
||||||
self.contents = contents
|
self.contents = contents
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -71,6 +71,7 @@ class PDFPage(object):
|
||||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_pages(klass, document, debug=0):
|
def create_pages(klass, document, debug=0):
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
|
@ -80,7 +81,7 @@ class PDFPage(object):
|
||||||
else:
|
else:
|
||||||
objid = obj.objid
|
objid = obj.objid
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k,v) in parent.iteritems():
|
for (k, v) in parent.iteritems():
|
||||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||||
|
@ -95,7 +96,7 @@ class PDFPage(object):
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
pages = False
|
pages = False
|
||||||
if 'Pages' in document.catalog:
|
if 'Pages' in document.catalog:
|
||||||
for (objid,tree) in search(document.catalog['Pages'], document.catalog):
|
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
|
||||||
yield klass(document, objid, tree)
|
yield klass(document, objid, tree)
|
||||||
pages = True
|
pages = True
|
||||||
if not pages:
|
if not pages:
|
||||||
|
@ -110,7 +111,8 @@ class PDFPage(object):
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
|
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||||
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_pages(klass, fp,
|
def get_pages(klass, fp,
|
||||||
|
@ -127,8 +129,10 @@ class PDFPage(object):
|
||||||
if check_extractable and not doc.is_extractable:
|
if check_extractable and not doc.is_extractable:
|
||||||
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||||
# Process each page contained in the document.
|
# Process each page contained in the document.
|
||||||
for (pageno,page) in enumerate(klass.create_pages(doc)):
|
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos):
|
||||||
|
continue
|
||||||
yield page
|
yield page
|
||||||
if maxpages and maxpages <= pageno+1: break
|
if maxpages and maxpages <= pageno+1:
|
||||||
|
break
|
||||||
return
|
return
|
||||||
|
|
|
@ -15,7 +15,8 @@ from pdftypes import str_value, list_value, dict_value, stream_value
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
class PDFSyntaxError(PDFException): pass
|
class PDFSyntaxError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## PDFParser
|
## PDFParser
|
||||||
|
@ -55,6 +56,7 @@ class PDFParser(PSStackParser):
|
||||||
KEYWORD_STREAM = KWD('stream')
|
KEYWORD_STREAM = KWD('stream')
|
||||||
KEYWORD_XREF = KWD('xref')
|
KEYWORD_XREF = KWD('xref')
|
||||||
KEYWORD_STARTXREF = KWD('startxref')
|
KEYWORD_STARTXREF = KWD('startxref')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
"""Handles PDF-related keywords."""
|
"""Handles PDF-related keywords."""
|
||||||
|
|
||||||
|
@ -71,7 +73,7 @@ class PDFParser(PSStackParser):
|
||||||
elif token is self.KEYWORD_R:
|
elif token is self.KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
((_,objid), (_,genno)) = self.pop(2)
|
((_, objid), (_, genno)) = self.pop(2)
|
||||||
(objid, genno) = (int(objid), int(genno))
|
(objid, genno) = (int(objid), int(genno))
|
||||||
obj = PDFObjRef(self.doc, objid, genno)
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
|
@ -80,7 +82,7 @@ class PDFParser(PSStackParser):
|
||||||
|
|
||||||
elif token is self.KEYWORD_STREAM:
|
elif token is self.KEYWORD_STREAM:
|
||||||
# stream object
|
# stream object
|
||||||
((_,dic),) = self.pop(1)
|
((_, dic),) = self.pop(1)
|
||||||
dic = dict_value(dic)
|
dic = dict_value(dic)
|
||||||
objlen = 0
|
objlen = 0
|
||||||
if not self.fallback:
|
if not self.fallback:
|
||||||
|
@ -118,7 +120,7 @@ class PDFParser(PSStackParser):
|
||||||
# XXX limit objlen not to exceed object boundary
|
# XXX limit objlen not to exceed object boundary
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||||
(pos, objlen, dic, data[:10])
|
(pos, objlen, dic, data[:10])
|
||||||
obj = PDFStream(dic, data, self.doc.decipher)
|
obj = PDFStream(dic, data, self.doc.decipher)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
|
|
||||||
|
@ -153,7 +155,7 @@ class PDFStreamParser(PDFParser):
|
||||||
if token is self.KEYWORD_R:
|
if token is self.KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
((_,objid), (_,genno)) = self.pop(2)
|
((_, objid), (_, genno)) = self.pop(2)
|
||||||
(objid, genno) = (int(objid), int(genno))
|
(objid, genno) = (int(objid), int(genno))
|
||||||
obj = PDFObjRef(self.doc, objid, genno)
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
|
|
|
@ -23,13 +23,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||||
|
|
||||||
## PDF Objects
|
## PDF Objects
|
||||||
##
|
##
|
||||||
class PDFObject(PSObject): pass
|
class PDFObject(PSObject):
|
||||||
|
pass
|
||||||
|
|
||||||
class PDFException(PSException): pass
|
|
||||||
class PDFTypeError(PDFException): pass
|
class PDFException(PSException):
|
||||||
class PDFValueError(PDFException): pass
|
pass
|
||||||
class PDFObjectNotFound(PDFException): pass
|
|
||||||
class PDFNotImplementedError(PDFException): pass
|
|
||||||
|
class PDFTypeError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFValueError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFObjectNotFound(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFNotImplementedError(PDFException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
|
@ -66,6 +81,7 @@ def resolve1(x, default=None):
|
||||||
x = x.resolve(default=default)
|
x = x.resolve(default=default)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def resolve_all(x, default=None):
|
def resolve_all(x, default=None):
|
||||||
"""Recursively resolves the given object and all the internals.
|
"""Recursively resolves the given object and all the internals.
|
||||||
|
|
||||||
|
@ -75,24 +91,26 @@ def resolve_all(x, default=None):
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve(default=default)
|
x = x.resolve(default=default)
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ resolve_all(v, default=default) for v in x ]
|
x = [resolve_all(v, default=default) for v in x]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k, v) in x.iteritems():
|
||||||
x[k] = resolve_all(v, default=default)
|
x[k] = resolve_all(v, default=default)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def decipher_all(decipher, objid, genno, x):
|
def decipher_all(decipher, objid, genno, x):
|
||||||
"""Recursively deciphers the given object.
|
"""Recursively deciphers the given object.
|
||||||
"""
|
"""
|
||||||
if isinstance(x, str):
|
if isinstance(x, str):
|
||||||
return decipher(objid, genno, x)
|
return decipher(objid, genno, x)
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
x = [decipher_all(decipher, objid, genno, v) for v in x]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k,v) in x.iteritems():
|
for (k, v) in x.iteritems():
|
||||||
x[k] = decipher_all(decipher, objid, genno, v)
|
x[k] = decipher_all(decipher, objid, genno, v)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
# Type checking
|
# Type checking
|
||||||
def int_value(x):
|
def int_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
|
@ -102,6 +120,7 @@ def int_value(x):
|
||||||
return 0
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def float_value(x):
|
def float_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, float):
|
if not isinstance(x, float):
|
||||||
|
@ -110,6 +129,7 @@ def float_value(x):
|
||||||
return 0.0
|
return 0.0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def num_value(x):
|
def num_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, int) or isinstance(x, float)):
|
if not (isinstance(x, int) or isinstance(x, float)):
|
||||||
|
@ -118,6 +138,7 @@ def num_value(x):
|
||||||
return 0
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def str_value(x):
|
def str_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, str):
|
if not isinstance(x, str):
|
||||||
|
@ -126,6 +147,7 @@ def str_value(x):
|
||||||
return ''
|
return ''
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def list_value(x):
|
def list_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||||
|
@ -134,6 +156,7 @@ def list_value(x):
|
||||||
return []
|
return []
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def dict_value(x):
|
def dict_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, dict):
|
if not isinstance(x, dict):
|
||||||
|
@ -142,6 +165,7 @@ def dict_value(x):
|
||||||
return {}
|
return {}
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def stream_value(x):
|
def stream_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, PDFStream):
|
if not isinstance(x, PDFStream):
|
||||||
|
@ -195,12 +219,14 @@ class PDFStream(PDFObject):
|
||||||
|
|
||||||
def get_filters(self):
|
def get_filters(self):
|
||||||
filters = self.get_any(('F', 'Filter'))
|
filters = self.get_any(('F', 'Filter'))
|
||||||
if not filters: return []
|
if not filters:
|
||||||
if isinstance(filters, list): return filters
|
return []
|
||||||
return [ filters ]
|
if isinstance(filters, list):
|
||||||
|
return filters
|
||||||
|
return [filters]
|
||||||
|
|
||||||
def decode(self):
|
def decode(self):
|
||||||
assert self.data is None and self.rawdata != None
|
assert self.data is None and self.rawdata is not None
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
# Handle encryption
|
# Handle encryption
|
||||||
|
|
|
@ -8,11 +8,24 @@ STRICT = 0
|
||||||
|
|
||||||
## PS Exceptions
|
## PS Exceptions
|
||||||
##
|
##
|
||||||
class PSException(Exception): pass
|
class PSException(Exception):
|
||||||
class PSEOF(PSException): pass
|
pass
|
||||||
class PSSyntaxError(PSException): pass
|
|
||||||
class PSTypeError(PSException): pass
|
|
||||||
class PSValueError(PSException): pass
|
class PSEOF(PSException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PSSyntaxError(PSException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PSTypeError(PSException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PSValueError(PSException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## Basic PostScript Types
|
## Basic PostScript Types
|
||||||
|
@ -114,6 +127,7 @@ def literal_name(x):
|
||||||
return str(x)
|
return str(x)
|
||||||
return x.name
|
return x.name
|
||||||
|
|
||||||
|
|
||||||
def keyword_name(x):
|
def keyword_name(x):
|
||||||
if not isinstance(x, PSKeyword):
|
if not isinstance(x, PSKeyword):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
|
@ -136,7 +150,9 @@ END_NUMBER = re.compile(r'[^0-9]')
|
||||||
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||||
END_STRING = re.compile(r'[()\134]')
|
END_STRING = re.compile(r'[()\134]')
|
||||||
OCT_STRING = re.compile(r'[0-7]')
|
OCT_STRING = re.compile(r'[0-7]')
|
||||||
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
|
||||||
|
|
||||||
|
|
||||||
class PSBaseParser(object):
|
class PSBaseParser(object):
|
||||||
|
|
||||||
"""Most basic PostScript parser that performs only tokenization.
|
"""Most basic PostScript parser that performs only tokenization.
|
||||||
|
@ -190,7 +206,8 @@ class PSBaseParser(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillbuf(self):
|
def fillbuf(self):
|
||||||
if self.charpos < len(self.buf): return
|
if self.charpos < len(self.buf):
|
||||||
|
return
|
||||||
# fetch next chunk.
|
# fetch next chunk.
|
||||||
self.bufpos = self.fp.tell()
|
self.bufpos = self.fp.tell()
|
||||||
self.buf = self.fp.read(self.BUFSIZ)
|
self.buf = self.fp.read(self.BUFSIZ)
|
||||||
|
@ -242,7 +259,8 @@ class PSBaseParser(object):
|
||||||
pos = max(0, pos-self.BUFSIZ)
|
pos = max(0, pos-self.BUFSIZ)
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
s = self.fp.read(prevpos-pos)
|
s = self.fp.read(prevpos-pos)
|
||||||
if not s: break
|
if not s:
|
||||||
|
break
|
||||||
while 1:
|
while 1:
|
||||||
n = max(s.rfind('\r'), s.rfind('\n'))
|
n = max(s.rfind('\r'), s.rfind('\n'))
|
||||||
if n == -1:
|
if n == -1:
|
||||||
|
@ -407,7 +425,7 @@ class PSBaseParser(object):
|
||||||
return j+1
|
return j+1
|
||||||
if c == ')':
|
if c == ')':
|
||||||
self.paren -= 1
|
self.paren -= 1
|
||||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||||
self._curtoken += c
|
self._curtoken += c
|
||||||
return j+1
|
return j+1
|
||||||
self._add_token(self._curtoken)
|
self._add_token(self._curtoken)
|
||||||
|
@ -520,7 +538,7 @@ class PSStackParser(PSBaseParser):
|
||||||
def end_type(self, type):
|
def end_type(self, type):
|
||||||
if self.curtype != type:
|
if self.curtype != type:
|
||||||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||||
objs = [ obj for (_,obj) in self.curstack ]
|
objs = [obj for (_, obj) in self.curstack]
|
||||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
|
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
|
||||||
|
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
|
||||||
try:
|
try:
|
||||||
self.push(self.end_type('a'))
|
self.push(self.end_type('a'))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT:
|
||||||
|
raise
|
||||||
elif token == KEYWORD_DICT_BEGIN:
|
elif token == KEYWORD_DICT_BEGIN:
|
||||||
# begin dictionary
|
# begin dictionary
|
||||||
self.start_type(pos, 'd')
|
self.start_type(pos, 'd')
|
||||||
|
@ -564,10 +583,11 @@ class PSStackParser(PSBaseParser):
|
||||||
if len(objs) % 2 != 0:
|
if len(objs) % 2 != 0:
|
||||||
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
|
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
|
||||||
# construct a Python dictionary.
|
# construct a Python dictionary.
|
||||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
|
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
|
||||||
self.push((pos, d))
|
self.push((pos, d))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT:
|
||||||
|
raise
|
||||||
elif token == KEYWORD_PROC_BEGIN:
|
elif token == KEYWORD_PROC_BEGIN:
|
||||||
# begin proc
|
# begin proc
|
||||||
self.start_type(pos, 'p')
|
self.start_type(pos, 'p')
|
||||||
|
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
|
||||||
try:
|
try:
|
||||||
self.push(self.end_type('p'))
|
self.push(self.end_type('p'))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT:
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||||
|
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
## Simplistic Test cases
|
## Simplistic Test cases
|
||||||
##
|
##
|
||||||
import unittest
|
|
||||||
class TestPSBaseParser(unittest.TestCase):
|
class TestPSBaseParser(unittest.TestCase):
|
||||||
|
|
||||||
TESTDATA = r'''%!PS
|
TESTDATA = r'''%!PS
|
||||||
|
@ -630,7 +653,7 @@ func/a/b{(c)do*}def
|
||||||
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')),
|
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')),
|
||||||
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'),
|
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'),
|
||||||
(272, KWD('>>'))
|
(272, KWD('>>'))
|
||||||
]
|
]
|
||||||
|
|
||||||
OBJS = [
|
OBJS = [
|
||||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||||
|
@ -641,10 +664,11 @@ func/a/b{(c)do*}def
|
||||||
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
|
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
|
||||||
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
|
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
|
||||||
(258, {'foo': 'bar'}),
|
(258, {'foo': 'bar'}),
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_tokens(self, s):
|
def get_tokens(self, s):
|
||||||
import StringIO
|
import StringIO
|
||||||
|
|
||||||
class MyParser(PSBaseParser):
|
class MyParser(PSBaseParser):
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
|
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
|
||||||
|
|
||||||
def get_objects(self, s):
|
def get_objects(self, s):
|
||||||
import StringIO
|
import StringIO
|
||||||
|
|
||||||
class MyParser(PSStackParser):
|
class MyParser(PSStackParser):
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
|
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
|
||||||
self.assertEqual(objs, self.OBJS)
|
self.assertEqual(objs, self.OBJS)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': unittest.main()
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
|
|
1326
pdfminer/rijndael.py
1326
pdfminer/rijndael.py
File diff suppressed because it is too large
Load Diff
|
@ -8,6 +8,7 @@
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def rldecode(data):
|
def rldecode(data):
|
||||||
"""
|
"""
|
||||||
RunLength decoder (Adobe version) implementation based on PDF Reference
|
RunLength decoder (Adobe version) implementation based on PDF Reference
|
||||||
|
@ -26,7 +27,7 @@ def rldecode(data):
|
||||||
'1234567777777abcde'
|
'1234567777777abcde'
|
||||||
"""
|
"""
|
||||||
decoded = []
|
decoded = []
|
||||||
i=0
|
i = 0
|
||||||
while i < len(data):
|
while i < len(data):
|
||||||
#print "data[%d]=:%d:" % (i,ord(data[i]))
|
#print "data[%d]=:%d:" % (i,ord(data[i]))
|
||||||
length = ord(data[i])
|
length = ord(data[i])
|
||||||
|
|
|
@ -32,13 +32,13 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
buf += chr(c)
|
buf += chr(c)
|
||||||
elif pred == '\x02':
|
elif pred == '\x02':
|
||||||
# PNG up
|
# PNG up
|
||||||
for (a,b) in zip(line0,line1):
|
for (a, b) in zip(line0, line1):
|
||||||
c = (ord(a)+ord(b)) & 255
|
c = (ord(a)+ord(b)) & 255
|
||||||
buf += chr(c)
|
buf += chr(c)
|
||||||
elif pred == '\x03':
|
elif pred == '\x03':
|
||||||
# PNG average (UNTESTED)
|
# PNG average (UNTESTED)
|
||||||
c = 0
|
c = 0
|
||||||
for (a,b) in zip(line0,line1):
|
for (a, b) in zip(line0, line1):
|
||||||
c = ((c+ord(a)+ord(b))/2) & 255
|
c = ((c+ord(a)+ord(b))/2) & 255
|
||||||
buf += chr(c)
|
buf += chr(c)
|
||||||
else:
|
else:
|
||||||
|
@ -52,21 +52,25 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
##
|
##
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
|
||||||
|
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
|
||||||
"""Returns the multiplication of two matrices."""
|
"""Returns the multiplication of two matrices."""
|
||||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
|
||||||
"""Translates a matrix by (x,y)."""
|
|
||||||
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
|
||||||
|
|
||||||
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
def translate_matrix((a, b, c, d, e, f), (x, y)):
|
||||||
|
"""Translates a matrix by (x, y)."""
|
||||||
|
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
|
||||||
"""Applies a matrix to a point."""
|
"""Applies a matrix to a point."""
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
|
||||||
|
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
|
||||||
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
||||||
return (a*p+c*q, b*p+d*q)
|
return (a*p+c*q, b*p+d*q)
|
||||||
|
|
||||||
|
@ -79,17 +83,20 @@ def uniq(objs):
|
||||||
"""Eliminates duplicated elements."""
|
"""Eliminates duplicated elements."""
|
||||||
done = set()
|
done = set()
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
if obj in done: continue
|
if obj in done:
|
||||||
|
continue
|
||||||
done.add(obj)
|
done.add(obj)
|
||||||
yield obj
|
yield obj
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# csort
|
# csort
|
||||||
def csort(objs, key=lambda x:x):
|
def csort(objs, key=lambda x: x):
|
||||||
"""Order-preserving sorting function."""
|
"""Order-preserving sorting function."""
|
||||||
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
|
||||||
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
||||||
|
|
||||||
|
|
||||||
# fsplit
|
# fsplit
|
||||||
def fsplit(pred, objs):
|
def fsplit(pred, objs):
|
||||||
"""Split a list into two classes according to the predicate."""
|
"""Split a list into two classes according to the predicate."""
|
||||||
|
@ -100,7 +107,8 @@ def fsplit(pred, objs):
|
||||||
t.append(obj)
|
t.append(obj)
|
||||||
else:
|
else:
|
||||||
f.append(obj)
|
f.append(obj)
|
||||||
return (t,f)
|
return (t, f)
|
||||||
|
|
||||||
|
|
||||||
# drange
|
# drange
|
||||||
def drange(v0, v1, d):
|
def drange(v0, v1, d):
|
||||||
|
@ -108,16 +116,18 @@ def drange(v0, v1, d):
|
||||||
assert v0 < v1
|
assert v0 < v1
|
||||||
return xrange(int(v0)/d, int(v1+d)/d)
|
return xrange(int(v0)/d, int(v1+d)/d)
|
||||||
|
|
||||||
|
|
||||||
# get_bound
|
# get_bound
|
||||||
def get_bound(pts):
|
def get_bound(pts):
|
||||||
"""Compute a minimal rectangle that covers all the points."""
|
"""Compute a minimal rectangle that covers all the points."""
|
||||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||||
for (x,y) in pts:
|
for (x, y) in pts:
|
||||||
x0 = min(x0, x)
|
x0 = min(x0, x)
|
||||||
y0 = min(y0, y)
|
y0 = min(y0, y)
|
||||||
x1 = max(x1, x)
|
x1 = max(x1, x)
|
||||||
y1 = max(y1, y)
|
y1 = max(y1, y)
|
||||||
return (x0,y0,x1,y1)
|
return (x0, y0, x1, y1)
|
||||||
|
|
||||||
|
|
||||||
# pick
|
# pick
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(seq, func, maxobj=None):
|
||||||
|
@ -126,9 +136,10 @@ def pick(seq, func, maxobj=None):
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
score = func(obj)
|
score = func(obj)
|
||||||
if maxscore is None or maxscore < score:
|
if maxscore is None or maxscore < score:
|
||||||
(maxscore,maxobj) = (score,obj)
|
(maxscore, maxobj) = (score, obj)
|
||||||
return maxobj
|
return maxobj
|
||||||
|
|
||||||
|
|
||||||
# choplist
|
# choplist
|
||||||
def choplist(n, seq):
|
def choplist(n, seq):
|
||||||
"""Groups every n elements of the list."""
|
"""Groups every n elements of the list."""
|
||||||
|
@ -140,6 +151,7 @@ def choplist(n, seq):
|
||||||
r = []
|
r = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# nunpack
|
# nunpack
|
||||||
def nunpack(s, default=0):
|
def nunpack(s, default=0):
|
||||||
"""Unpacks 1 to 4 byte integers (big endian)."""
|
"""Unpacks 1 to 4 byte integers (big endian)."""
|
||||||
|
@ -157,59 +169,65 @@ def nunpack(s, default=0):
|
||||||
else:
|
else:
|
||||||
raise TypeError('invalid length: %d' % l)
|
raise TypeError('invalid length: %d' % l)
|
||||||
|
|
||||||
|
|
||||||
# decode_text
|
# decode_text
|
||||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
PDFDocEncoding = ''.join(unichr(x) for x in (
|
||||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||||
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
||||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
||||||
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
||||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
||||||
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
||||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
||||||
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
||||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
||||||
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
||||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
||||||
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
||||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
||||||
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
||||||
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
||||||
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
||||||
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
||||||
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
||||||
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
||||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||||
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
||||||
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
||||||
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
||||||
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
||||||
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
||||||
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
||||||
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
||||||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
def decode_text(s):
|
def decode_text(s):
|
||||||
"""Decodes a PDFDocEncoding string to Unicode."""
|
"""Decodes a PDFDocEncoding string to Unicode."""
|
||||||
if s.startswith('\xfe\xff'):
|
if s.startswith('\xfe\xff'):
|
||||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||||
else:
|
else:
|
||||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
return ''.join(PDFDocEncoding[ord(c)] for c in s)
|
||||||
|
|
||||||
|
|
||||||
# enc
|
# enc
|
||||||
def enc(x, codec='ascii'):
|
def enc(x, codec='ascii'):
|
||||||
"""Encodes a string for SGML/XML/HTML"""
|
"""Encodes a string for SGML/XML/HTML"""
|
||||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
return x.encode(codec, 'xmlcharrefreplace')
|
||||||
|
|
||||||
def bbox2str((x0,y0,x1,y1)):
|
|
||||||
|
def bbox2str((x0, y0, x1, y1)):
|
||||||
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||||
|
|
||||||
def matrix2str((a,b,c,d,e,f)):
|
|
||||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f)
|
def matrix2str((a, b, c, d, e, f)):
|
||||||
|
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
||||||
|
|
||||||
|
|
||||||
## Plane
|
## Plane
|
||||||
|
@ -240,14 +258,14 @@ class Plane(object):
|
||||||
def __contains__(self, obj):
|
def __contains__(self, obj):
|
||||||
return obj in self._objs
|
return obj in self._objs
|
||||||
|
|
||||||
def _getrange(self, (x0,y0,x1,y1)):
|
def _getrange(self, (x0, y0, x1, y1)):
|
||||||
x0 = max(self.x0, x0)
|
x0 = max(self.x0, x0)
|
||||||
y0 = max(self.y0, y0)
|
y0 = max(self.y0, y0)
|
||||||
x1 = min(self.x1, x1)
|
x1 = min(self.x1, x1)
|
||||||
y1 = min(self.y1, y1)
|
y1 = min(self.y1, y1)
|
||||||
for y in drange(y0, y1, self.gridsize):
|
for y in drange(y0, y1, self.gridsize):
|
||||||
for x in drange(x0, x1, self.gridsize):
|
for x in drange(x0, x1, self.gridsize):
|
||||||
yield (x,y)
|
yield (x, y)
|
||||||
return
|
return
|
||||||
|
|
||||||
# extend(objs)
|
# extend(objs)
|
||||||
|
@ -279,14 +297,17 @@ class Plane(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0, y0, x1, y1)):
|
||||||
done = set()
|
done = set()
|
||||||
for k in self._getrange((x0,y0,x1,y1)):
|
for k in self._getrange((x0, y0, x1, y1)):
|
||||||
if k not in self._grid: continue
|
if k not in self._grid:
|
||||||
|
continue
|
||||||
for obj in self._grid[k]:
|
for obj in self._grid[k]:
|
||||||
if obj in done: continue
|
if obj in done:
|
||||||
|
continue
|
||||||
done.add(obj)
|
done.add(obj)
|
||||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
obj.y1 <= y0 or y1 <= obj.y0):
|
||||||
|
continue
|
||||||
yield obj
|
yield obj
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue