PEP8: Whitespace changes to match pep8
parent
c1da8b835c
commit
2caa5edc25
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
__version__ = '20131022'
|
||||
|
||||
if __name__ == '__main__': print __version__
|
||||
if __name__ == '__main__':
|
||||
print __version__
|
||||
|
|
|
@ -6,6 +6,7 @@ This code is in the public domain.
|
|||
|
||||
"""
|
||||
|
||||
|
||||
## Arcfour
|
||||
##
|
||||
class Arcfour(object):
|
||||
|
|
|
@ -9,6 +9,7 @@ This code is in the public domain.
|
|||
import re
|
||||
import struct
|
||||
|
||||
|
||||
# ascii85decode(data)
|
||||
def ascii85decode(data):
|
||||
"""
|
||||
|
@ -35,7 +36,7 @@ def ascii85decode(data):
|
|||
n += 1
|
||||
b = b*85+(ord(c)-33)
|
||||
if n == 5:
|
||||
out += struct.pack('>L',b)
|
||||
out += struct.pack('>L', b)
|
||||
n = b = 0
|
||||
elif c == 'z':
|
||||
assert n == 0
|
||||
|
@ -44,13 +45,15 @@ def ascii85decode(data):
|
|||
if n:
|
||||
for _ in range(5-n):
|
||||
b = b*85+84
|
||||
out += struct.pack('>L',b)[:n-1]
|
||||
out += struct.pack('>L', b)[:n-1]
|
||||
break
|
||||
return out
|
||||
|
||||
# asciihexdecode(data)
|
||||
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
||||
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||
|
||||
|
||||
def asciihexdecode(data):
|
||||
"""
|
||||
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
||||
|
|
|
@ -25,10 +25,11 @@ class BitParser(object):
|
|||
@classmethod
|
||||
def add(klass, root, v, bits):
|
||||
p = root
|
||||
b = None
|
||||
for i in xrange(len(bits)):
|
||||
if 0 < i:
|
||||
if p[b] is None:
|
||||
p[b] = [None,None]
|
||||
p[b] = [None, None]
|
||||
p = p[b]
|
||||
if bits[i] == '1':
|
||||
b = 1
|
||||
|
@ -40,7 +41,7 @@ class BitParser(object):
|
|||
def feedbytes(self, data):
|
||||
for c in data:
|
||||
b = ord(c)
|
||||
for m in (128,64,32,16,8,4,2,1):
|
||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||
self._parse_bit(b & m)
|
||||
return
|
||||
|
||||
|
@ -61,7 +62,7 @@ class BitParser(object):
|
|||
##
|
||||
class CCITTG4Parser(BitParser):
|
||||
|
||||
MODE = [None,None]
|
||||
MODE = [None, None]
|
||||
BitParser.add(MODE, 0, '1')
|
||||
BitParser.add(MODE, +1, '011')
|
||||
BitParser.add(MODE, -1, '010')
|
||||
|
@ -81,7 +82,7 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(MODE, 'x7', '0000001110')
|
||||
BitParser.add(MODE, 'e', '000000000001000000000001')
|
||||
|
||||
WHITE = [None,None]
|
||||
WHITE = [None, None]
|
||||
BitParser.add(WHITE, 0 , '00110101')
|
||||
BitParser.add(WHITE, 1 , '000111')
|
||||
BitParser.add(WHITE, 2 , '0111')
|
||||
|
@ -187,7 +188,7 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(WHITE, 2496, '000000011110')
|
||||
BitParser.add(WHITE, 2560, '000000011111')
|
||||
|
||||
BLACK = [None,None]
|
||||
BLACK = [None, None]
|
||||
BitParser.add(BLACK, 0 , '0000110111')
|
||||
BitParser.add(BLACK, 1 , '010')
|
||||
BitParser.add(BLACK, 2 , '11')
|
||||
|
@ -293,25 +294,30 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(BLACK, 2496, '000000011110')
|
||||
BitParser.add(BLACK, 2560, '000000011111')
|
||||
|
||||
UNCOMPRESSED = [None,None]
|
||||
BitParser.add(UNCOMPRESSED, '1' , '1')
|
||||
BitParser.add(UNCOMPRESSED, '01' , '01')
|
||||
BitParser.add(UNCOMPRESSED, '001' , '001')
|
||||
BitParser.add(UNCOMPRESSED, '0001' , '0001')
|
||||
BitParser.add(UNCOMPRESSED, '00001' , '00001')
|
||||
BitParser.add(UNCOMPRESSED, '00000' , '000001')
|
||||
BitParser.add(UNCOMPRESSED, 'T00' , '00000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10' , '00000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T000' , '000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T100' , '000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T0000' , '0000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010')
|
||||
UNCOMPRESSED = [None, None]
|
||||
BitParser.add(UNCOMPRESSED, '1', '1')
|
||||
BitParser.add(UNCOMPRESSED, '01', '01')
|
||||
BitParser.add(UNCOMPRESSED, '001', '001')
|
||||
BitParser.add(UNCOMPRESSED, '0001', '0001')
|
||||
BitParser.add(UNCOMPRESSED, '00001', '00001')
|
||||
BitParser.add(UNCOMPRESSED, '00000', '000001')
|
||||
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
|
||||
|
||||
class EOFB(Exception): pass
|
||||
class InvalidData(Exception): pass
|
||||
class ByteSkip(Exception): pass
|
||||
class EOFB(Exception):
|
||||
pass
|
||||
|
||||
class InvalidData(Exception):
|
||||
pass
|
||||
|
||||
class ByteSkip(Exception):
|
||||
pass
|
||||
|
||||
def __init__(self, width, bytealign=False):
|
||||
BitParser.__init__(self)
|
||||
|
@ -324,7 +330,7 @@ class CCITTG4Parser(BitParser):
|
|||
for c in data:
|
||||
b = ord(c)
|
||||
try:
|
||||
for m in (128,64,32,16,8,4,2,1):
|
||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||
self._parse_bit(b & m)
|
||||
except self.ByteSkip:
|
||||
self._accept = self._parse_mode
|
||||
|
@ -358,7 +364,8 @@ class CCITTG4Parser(BitParser):
|
|||
raise self.InvalidData(mode)
|
||||
|
||||
def _parse_horiz1(self, n):
|
||||
if n is None: raise self.InvalidData
|
||||
if n is None:
|
||||
raise self.InvalidData
|
||||
self._n1 += n
|
||||
if n < 64:
|
||||
self._n2 = 0
|
||||
|
@ -370,7 +377,8 @@ class CCITTG4Parser(BitParser):
|
|||
return self.BLACK
|
||||
|
||||
def _parse_horiz2(self, n):
|
||||
if n is None: raise self.InvalidData
|
||||
if n is None:
|
||||
raise self.InvalidData
|
||||
self._n2 += n
|
||||
if n < 64:
|
||||
self._color = 1-self._color
|
||||
|
@ -384,7 +392,8 @@ class CCITTG4Parser(BitParser):
|
|||
return self.BLACK
|
||||
|
||||
def _parse_uncompressed(self, bits):
|
||||
if not bits: raise self.InvalidData
|
||||
if not bits:
|
||||
raise self.InvalidData
|
||||
if bits.startswith('T'):
|
||||
self._accept = self._parse_mode
|
||||
self._color = int(bits[1])
|
||||
|
@ -395,17 +404,17 @@ class CCITTG4Parser(BitParser):
|
|||
return self.UNCOMPRESSED
|
||||
|
||||
def _get_bits(self):
|
||||
return ''.join( str(b) for b in self._curline[:self._curpos] )
|
||||
return ''.join(str(b) for b in self._curline[:self._curpos])
|
||||
|
||||
def _get_refline(self, i):
|
||||
if i < 0:
|
||||
return '[]'+''.join( str(b) for b in self._refline )
|
||||
return '[]'+''.join(str(b) for b in self._refline)
|
||||
elif len(self._refline) <= i:
|
||||
return ''.join( str(b) for b in self._refline )+'[]'
|
||||
return ''.join(str(b) for b in self._refline)+'[]'
|
||||
else:
|
||||
return (''.join( str(b) for b in self._refline[:i] )+
|
||||
'['+str(self._refline[i])+']'+
|
||||
''.join( str(b) for b in self._refline[i+1:] ))
|
||||
return (''.join(str(b) for b in self._refline[:i]) +
|
||||
'['+str(self._refline[i])+']' +
|
||||
''.join(str(b) for b in self._refline[i+1:]))
|
||||
|
||||
def reset(self):
|
||||
self._y = 0
|
||||
|
@ -416,7 +425,7 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def output_line(self, y, bits):
|
||||
print y, ''.join( str(b) for b in bits )
|
||||
print y, ''.join(str(b) for b in bits)
|
||||
return
|
||||
|
||||
def _reset_line(self):
|
||||
|
@ -441,12 +450,13 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and
|
||||
self._refline[x1] != self._color): break
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color): break
|
||||
self._refline[x1] != self._color):
|
||||
break
|
||||
x1 += 1
|
||||
x1 += dx
|
||||
x0 = max(0, self._curpos)
|
||||
|
@ -467,21 +477,23 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and
|
||||
self._refline[x1] != self._color): break
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color): break
|
||||
self._refline[x1] != self._color):
|
||||
break
|
||||
x1 += 1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 0 and
|
||||
self._refline[x1] == self._color): break
|
||||
if (self._color == 0 and self._refline[x1] == self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] != self._color and
|
||||
self._refline[x1] == self._color): break
|
||||
self._refline[x1] == self._color):
|
||||
break
|
||||
x1 += 1
|
||||
for x in xrange(self._curpos, x1):
|
||||
self._curline[x] = self._color
|
||||
|
@ -494,11 +506,13 @@ class CCITTG4Parser(BitParser):
|
|||
self._curpos = 0
|
||||
x = self._curpos
|
||||
for _ in xrange(n1):
|
||||
if len(self._curline) <= x: break
|
||||
if len(self._curline) <= x:
|
||||
break
|
||||
self._curline[x] = self._color
|
||||
x += 1
|
||||
for _ in xrange(n2):
|
||||
if len(self._curline) <= x: break
|
||||
if len(self._curline) <= x:
|
||||
break
|
||||
self._curline[x] = 1-self._color
|
||||
x += 1
|
||||
self._curpos = x
|
||||
|
@ -512,15 +526,16 @@ class CCITTG4Parser(BitParser):
|
|||
self._flush_line()
|
||||
return
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
## Test cases
|
||||
##
|
||||
import unittest
|
||||
class TestCCITTG4Parser(unittest.TestCase):
|
||||
|
||||
def get_parser(self, bits):
|
||||
parser = CCITTG4Parser(len(bits))
|
||||
parser._curline = [ int(c) for c in bits ]
|
||||
parser._curline = [int(c) for c in bits]
|
||||
parser._reset_line()
|
||||
return parser
|
||||
|
||||
|
@ -655,7 +670,7 @@ class TestCCITTG4Parser(unittest.TestCase):
|
|||
parser._do_vertical(-1)
|
||||
parser._do_vertical(-1)
|
||||
parser._do_vertical(1)
|
||||
parser._do_horizontal(1,1)
|
||||
parser._do_horizontal(1, 1)
|
||||
self.assertEqual(parser._get_bits(), '011101')
|
||||
return
|
||||
|
||||
|
@ -685,10 +700,10 @@ class CCITTFaxDecoder(CCITTG4Parser):
|
|||
def output_line(self, y, bits):
|
||||
bytes = array.array('B', [0]*((len(bits)+7)/8))
|
||||
if self.reversed:
|
||||
bits = [ 1-b for b in bits ]
|
||||
for (i,b) in enumerate(bits):
|
||||
bits = [1-b for b in bits]
|
||||
for (i, b) in enumerate(bits):
|
||||
if b:
|
||||
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8]
|
||||
bytes[i/8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
|
||||
self._buf += bytes.tostring()
|
||||
return
|
||||
|
||||
|
@ -711,28 +726,32 @@ def main(argv):
|
|||
import pygame
|
||||
if not argv[1:]:
|
||||
return unittest.main()
|
||||
|
||||
class Parser(CCITTG4Parser):
|
||||
def __init__(self, width, bytealign=False):
|
||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||
self.img = pygame.Surface((self.width,1000))
|
||||
self.img = pygame.Surface((self.width, 1000))
|
||||
return
|
||||
|
||||
def output_line(self, y, bits):
|
||||
for (x,b) in enumerate(bits):
|
||||
for (x, b) in enumerate(bits):
|
||||
if b:
|
||||
self.img.set_at((x,y), (255,255,255))
|
||||
self.img.set_at((x, y), (255, 255, 255))
|
||||
else:
|
||||
self.img.set_at((x,y), (0,0,0))
|
||||
self.img.set_at((x, y), (0, 0, 0))
|
||||
return
|
||||
|
||||
def close(self):
|
||||
pygame.image.save(self.img, 'out.bmp')
|
||||
return
|
||||
for path in argv[1:]:
|
||||
fp = file(path,'rb')
|
||||
(_,_,k,w,h,_) = path.split('.')
|
||||
fp = file(path, 'rb')
|
||||
(_, _, k, w, h, _) = path.split('.')
|
||||
parser = Parser(int(w))
|
||||
parser.feedbytes(fp.read())
|
||||
parser.close()
|
||||
fp.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -26,7 +26,8 @@ from encodingdb import name2unicode
|
|||
from utils import choplist, nunpack
|
||||
|
||||
|
||||
class CMapError(Exception): pass
|
||||
class CMapError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
## CMap
|
||||
|
@ -44,8 +45,9 @@ class CMap(object):
|
|||
|
||||
def use_cmap(self, cmap):
|
||||
assert isinstance(cmap, CMap)
|
||||
|
||||
def copy(dst, src):
|
||||
for (k,v) in src.iteritems():
|
||||
for (k, v) in src.iteritems():
|
||||
if isinstance(v, dict):
|
||||
d = {}
|
||||
dst[k] = d
|
||||
|
@ -74,10 +76,10 @@ class CMap(object):
|
|||
if code2cid is None:
|
||||
code2cid = self.code2cid
|
||||
code = ()
|
||||
for (k,v) in sorted(code2cid.iteritems()):
|
||||
for (k, v) in sorted(code2cid.iteritems()):
|
||||
c = code+(k,)
|
||||
if isinstance(v, int):
|
||||
out.write('code %r = cid %d\n' % (c,v))
|
||||
out.write('code %r = cid %d\n' % (c, v))
|
||||
else:
|
||||
self.dump(out=out, code2cid=v, code=c)
|
||||
return
|
||||
|
@ -102,7 +104,6 @@ class IdentityCMap(object):
|
|||
return ()
|
||||
|
||||
|
||||
|
||||
## UnicodeMap
|
||||
##
|
||||
class UnicodeMap(object):
|
||||
|
@ -119,8 +120,8 @@ class UnicodeMap(object):
|
|||
return self.cid2unichr[cid]
|
||||
|
||||
def dump(self, out=sys.stdout):
|
||||
for (k,v) in sorted(self.cid2unichr.iteritems()):
|
||||
out.write('cid %d = unicode %r\n' % (k,v))
|
||||
for (k, v) in sorted(self.cid2unichr.iteritems()):
|
||||
out.write('cid %d = unicode %r\n' % (k, v))
|
||||
return
|
||||
|
||||
|
||||
|
@ -153,7 +154,7 @@ class FileCMap(CMap):
|
|||
else:
|
||||
t = {}
|
||||
d[c] = t
|
||||
d =t
|
||||
d = t
|
||||
c = ord(code[-1])
|
||||
d[c] = cid
|
||||
return
|
||||
|
@ -232,17 +233,16 @@ class CMapDB(object):
|
|||
_cmap_cache = {}
|
||||
_umap_cache = {}
|
||||
|
||||
class CMapNotFound(CMapError): pass
|
||||
class CMapNotFound(CMapError):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def _load_data(klass, name):
|
||||
filename = '%s.pickle.gz' % name
|
||||
if klass.debug:
|
||||
print >>sys.stderr, 'loading:', name
|
||||
cmap_paths = (
|
||||
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),
|
||||
)
|
||||
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
||||
for directory in cmap_paths:
|
||||
path = os.path.join(directory, filename)
|
||||
if os.path.exists(path):
|
||||
|
@ -306,11 +306,12 @@ class CMapParser(PSStackParser):
|
|||
elif name == 'endcmap':
|
||||
self._in_cmap = False
|
||||
return
|
||||
if not self._in_cmap: return
|
||||
if not self._in_cmap:
|
||||
return
|
||||
#
|
||||
if name == 'def':
|
||||
try:
|
||||
((_,k),(_,v)) = self.pop(2)
|
||||
((_, k), (_, v)) = self.pop(2)
|
||||
self.cmap.set_attr(literal_name(k), v)
|
||||
except PSSyntaxError:
|
||||
pass
|
||||
|
@ -318,7 +319,7 @@ class CMapParser(PSStackParser):
|
|||
|
||||
if name == 'usecmap':
|
||||
try:
|
||||
((_,cmapname),) = self.pop(1)
|
||||
((_, cmapname),) = self.pop(1)
|
||||
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
|
||||
except PSSyntaxError:
|
||||
pass
|
||||
|
@ -337,13 +338,15 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcidrange':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (s,e,cid) in choplist(3, objs):
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, cid) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
not isinstance(cid, int) or len(s) != len(e)): continue
|
||||
not isinstance(cid, int) or len(s) != len(e)):
|
||||
continue
|
||||
sprefix = s[:-4]
|
||||
eprefix = e[:-4]
|
||||
if sprefix != eprefix: continue
|
||||
if sprefix != eprefix:
|
||||
continue
|
||||
svar = s[-4:]
|
||||
evar = e[-4:]
|
||||
s1 = nunpack(svar)
|
||||
|
@ -351,7 +354,7 @@ class CMapParser(PSStackParser):
|
|||
vlen = len(svar)
|
||||
#assert s1 <= e1
|
||||
for i in xrange(e1-s1+1):
|
||||
x = sprefix+struct.pack('>L',s1+i)[-vlen:]
|
||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||
self.cmap.add_code2cid(x, cid+i)
|
||||
return
|
||||
|
||||
|
@ -359,8 +362,8 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcidchar':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (cid,code) in choplist(2, objs):
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (cid, code) in choplist(2, objs):
|
||||
if isinstance(code, str) and isinstance(cid, str):
|
||||
self.cmap.add_code2cid(code, nunpack(cid))
|
||||
return
|
||||
|
@ -369,10 +372,11 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endbfrange':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (s,e,code) in choplist(3, objs):
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, code) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
len(s) != len(e)): continue
|
||||
len(s) != len(e)):
|
||||
continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
#assert s1 <= e1
|
||||
|
@ -385,7 +389,7 @@ class CMapParser(PSStackParser):
|
|||
prefix = code[:-4]
|
||||
vlen = len(var)
|
||||
for i in xrange(e1-s1+1):
|
||||
x = prefix+struct.pack('>L',base+i)[-vlen:]
|
||||
x = prefix+struct.pack('>L', base+i)[-vlen:]
|
||||
self.cmap.add_cid2unichr(s1+i, x)
|
||||
return
|
||||
|
||||
|
@ -393,8 +397,8 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endbfchar':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
for (cid,code) in choplist(2, objs):
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (cid, code) in choplist(2, objs):
|
||||
if isinstance(cid, str) and isinstance(code, str):
|
||||
self.cmap.add_cid2unichr(nunpack(cid), code)
|
||||
return
|
||||
|
@ -409,6 +413,7 @@ class CMapParser(PSStackParser):
|
|||
self.push((pos, token))
|
||||
return
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
|
||||
args = argv[1:]
|
||||
|
@ -421,4 +426,5 @@ def main(argv):
|
|||
cmap.dump()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -21,9 +21,9 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
return
|
||||
|
||||
def begin_page(self, page, ctm):
|
||||
(x0,y0,x1,y1) = page.mediabox
|
||||
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
|
||||
(x0, y0, x1, y1) = page.mediabox
|
||||
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
||||
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
|
||||
self.cur_item = LTPage(self.pageno, mediabox)
|
||||
return
|
||||
|
@ -61,26 +61,26 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
shape = ''.join(x[0] for x in path)
|
||||
if shape == 'ml':
|
||||
# horizontal/vertical line
|
||||
(_,x0,y0) = path[0]
|
||||
(_,x1,y1) = path[1]
|
||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||
(_, x0, y0) = path[0]
|
||||
(_, x1, y1) = path[1]
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
if x0 == x1 or y0 == y1:
|
||||
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
|
||||
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
|
||||
return
|
||||
if shape == 'mlllh':
|
||||
# rectangle
|
||||
(_,x0,y0) = path[0]
|
||||
(_,x1,y1) = path[1]
|
||||
(_,x2,y2) = path[2]
|
||||
(_,x3,y3) = path[3]
|
||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
||||
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
|
||||
(_, x0, y0) = path[0]
|
||||
(_, x1, y1) = path[1]
|
||||
(_, x2, y2) = path[2]
|
||||
(_, x3, y3) = path[3]
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
|
||||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
|
||||
return
|
||||
# other shapes
|
||||
pts = []
|
||||
|
@ -176,7 +176,8 @@ class TextConverter(PDFConverter):
|
|||
# is text. This stops all the image and drawing ouput from being
|
||||
# recorded and taking up RAM.
|
||||
def render_image(self, name, stream):
|
||||
if self.imagewriter is None: return
|
||||
if self.imagewriter is None:
|
||||
return
|
||||
PDFConverter.render_image(self, name, stream)
|
||||
return
|
||||
|
||||
|
@ -196,18 +197,18 @@ class HTMLConverter(PDFConverter):
|
|||
'textgroup': 'red',
|
||||
'curve': 'black',
|
||||
'page': 'gray',
|
||||
}
|
||||
}
|
||||
|
||||
TEXT_COLORS = {
|
||||
'textbox': 'blue',
|
||||
'char': 'black',
|
||||
}
|
||||
}
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
||||
pagemargin=50, imagewriter=None,
|
||||
rect_colors={'curve':'black', 'page':'gray'},
|
||||
text_colors={'char':'black'}):
|
||||
rect_colors={'curve': 'black', 'page': 'gray'},
|
||||
text_colors={'char': 'black'}):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.scale = scale
|
||||
self.fontscale = fontscale
|
||||
|
@ -238,7 +239,7 @@ class HTMLConverter(PDFConverter):
|
|||
|
||||
def write_footer(self):
|
||||
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
||||
', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
|
||||
self.write('</body></html>\n')
|
||||
return
|
||||
|
||||
|
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
show_group(child)
|
||||
return
|
||||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self._yoffset += item.y1
|
||||
|
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
|
|||
show_group(child)
|
||||
self.outfp.write('</textgroup>\n')
|
||||
return
|
||||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
|
|
|
@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
|
|||
from latin_enc import ENCODING
|
||||
|
||||
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
|
||||
|
||||
## name2unicode
|
||||
##
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
def name2unicode(name):
|
||||
"""Converts Adobe glyph names to Unicode numbers."""
|
||||
if name in glyphname2unicode:
|
||||
return glyphname2unicode[name]
|
||||
m = STRIP_NAME.search(name)
|
||||
if not m: raise KeyError(name)
|
||||
if not m:
|
||||
raise KeyError(name)
|
||||
return unichr(int(m.group(0)))
|
||||
|
||||
|
||||
|
@ -26,19 +29,23 @@ class EncodingDB(object):
|
|||
mac2unicode = {}
|
||||
win2unicode = {}
|
||||
pdf2unicode = {}
|
||||
for (name,std,mac,win,pdf) in ENCODING:
|
||||
for (name, std, mac, win, pdf) in ENCODING:
|
||||
c = name2unicode(name)
|
||||
if std: std2unicode[std] = c
|
||||
if mac: mac2unicode[mac] = c
|
||||
if win: win2unicode[win] = c
|
||||
if pdf: pdf2unicode[pdf] = c
|
||||
if std:
|
||||
std2unicode[std] = c
|
||||
if mac:
|
||||
mac2unicode[mac] = c
|
||||
if win:
|
||||
win2unicode[win] = c
|
||||
if pdf:
|
||||
pdf2unicode[pdf] = c
|
||||
|
||||
encodings = {
|
||||
'StandardEncoding': std2unicode,
|
||||
'MacRomanEncoding': mac2unicode,
|
||||
'WinAnsiEncoding': win2unicode,
|
||||
'PDFDocEncoding': pdf2unicode,
|
||||
}
|
||||
'StandardEncoding': std2unicode,
|
||||
'MacRomanEncoding': mac2unicode,
|
||||
'WinAnsiEncoding': win2unicode,
|
||||
'PDFDocEncoding': pdf2unicode,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def get_encoding(klass, name, diff=None):
|
||||
|
|
|
@ -7,9 +7,11 @@ import os, os.path
|
|||
from pdftypes import LITERALS_DCT_DECODE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
|
||||
|
||||
def align32(x):
|
||||
return ((x+3)/4)*4
|
||||
|
||||
|
||||
## BMPWriter
|
||||
##
|
||||
class BMPWriter(object):
|
||||
|
@ -38,12 +40,12 @@ class BMPWriter(object):
|
|||
self.fp.write(info)
|
||||
if ncols == 2:
|
||||
# B&W color table
|
||||
for i in (0,255):
|
||||
self.fp.write(struct.pack('BBBx', i,i,i))
|
||||
for i in (0, 255):
|
||||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
elif ncols == 256:
|
||||
# grayscale color table
|
||||
for i in xrange(256):
|
||||
self.fp.write(struct.pack('BBBx', i,i,i))
|
||||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
self.pos0 = self.fp.tell()
|
||||
self.pos1 = self.pos0 + self.datasize
|
||||
return
|
||||
|
|
|
@ -82,7 +82,7 @@ class LTComponent(LTItem):
|
|||
return ('<%s %s>' %
|
||||
(self.__class__.__name__, bbox2str(self.bbox)))
|
||||
|
||||
def set_bbox(self, (x0,y0,x1,y1)):
|
||||
def set_bbox(self, (x0, y0, x1, y1)):
|
||||
self.x0 = x0
|
||||
self.y0 = y0
|
||||
self.x1 = x1
|
||||
|
@ -143,7 +143,7 @@ class LTCurve(LTComponent):
|
|||
return
|
||||
|
||||
def get_pts(self):
|
||||
return ','.join( '%.3f,%.3f' % p for p in self.pts )
|
||||
return ','.join('%.3f,%.3f' % p for p in self.pts)
|
||||
|
||||
|
||||
## LTLine
|
||||
|
@ -159,8 +159,8 @@ class LTLine(LTCurve):
|
|||
##
|
||||
class LTRect(LTCurve):
|
||||
|
||||
def __init__(self, linewidth, (x0,y0,x1,y1)):
|
||||
LTCurve.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
|
||||
def __init__(self, linewidth, (x0, y0, x1, y1)):
|
||||
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
|
||||
return
|
||||
|
||||
|
||||
|
@ -213,7 +213,7 @@ class LTChar(LTComponent, LTText):
|
|||
if font.is_vertical():
|
||||
# vertical
|
||||
width = font.get_width() * fontsize
|
||||
(vx,vy) = textdisp
|
||||
(vx, vy) = textdisp
|
||||
if vx is None:
|
||||
vx = width/2
|
||||
else:
|
||||
|
@ -230,15 +230,15 @@ class LTChar(LTComponent, LTText):
|
|||
ty = descent + rise
|
||||
bll = (0, ty)
|
||||
bur = (self.adv, ty+height)
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
(a, b, c, d, e, f) = self.matrix
|
||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||
(x0,y0) = apply_matrix_pt(self.matrix, bll)
|
||||
(x1,y1) = apply_matrix_pt(self.matrix, bur)
|
||||
(x0, y0) = apply_matrix_pt(self.matrix, bll)
|
||||
(x1, y1) = apply_matrix_pt(self.matrix, bur)
|
||||
if x1 < x0:
|
||||
(x0,x1) = (x1,x0)
|
||||
(x0, x1) = (x1, x0)
|
||||
if y1 < y0:
|
||||
(y0,y1) = (y1,y0)
|
||||
LTComponent.__init__(self, (x0,y0,x1,y1))
|
||||
(y0, y1) = (y1, y0)
|
||||
LTComponent.__init__(self, (x0, y0, x1, y1))
|
||||
if font.is_vertical():
|
||||
self.size = self.width
|
||||
else:
|
||||
|
@ -294,7 +294,7 @@ class LTContainer(LTComponent):
|
|||
class LTExpandableContainer(LTContainer):
|
||||
|
||||
def __init__(self):
|
||||
LTContainer.__init__(self, (+INF,+INF,-INF,-INF))
|
||||
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
|
||||
return
|
||||
|
||||
def add(self, obj):
|
||||
|
@ -314,7 +314,7 @@ class LTTextContainer(LTExpandableContainer, LTText):
|
|||
return
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
|
||||
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
|
||||
|
||||
|
||||
## LTTextLine
|
||||
|
@ -339,6 +339,7 @@ class LTTextLine(LTTextContainer):
|
|||
def find_neighbors(self, plane, ratio):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
|
||||
|
||||
def __init__(self, word_margin):
|
||||
|
@ -358,11 +359,12 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
def find_neighbors(self, plane, ratio):
|
||||
d = ratio*self.height
|
||||
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
||||
return [ obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineHorizontal) and
|
||||
abs(obj.height-self.height) < d and
|
||||
(abs(obj.x0-self.x0) < d or
|
||||
abs(obj.x1-self.x1) < d)) ]
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineHorizontal) and
|
||||
abs(obj.height-self.height) < d and
|
||||
(abs(obj.x0-self.x0) < d or
|
||||
abs(obj.x1-self.x1) < d))]
|
||||
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
||||
|
@ -383,11 +385,11 @@ class LTTextLineVertical(LTTextLine):
|
|||
def find_neighbors(self, plane, ratio):
|
||||
d = ratio*self.width
|
||||
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
|
||||
return [ obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineVertical) and
|
||||
abs(obj.width-self.width) < d and
|
||||
(abs(obj.y0-self.y0) < d or
|
||||
abs(obj.y1-self.y1) < d)) ]
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineVertical) and
|
||||
abs(obj.width-self.width) < d and
|
||||
(abs(obj.y0-self.y0) < d or
|
||||
abs(obj.y1-self.y1) < d))]
|
||||
|
||||
|
||||
## LTTextBox
|
||||
|
@ -407,6 +409,7 @@ class LTTextBox(LTTextContainer):
|
|||
(self.__class__.__name__,
|
||||
self.index, bbox2str(self.bbox), self.get_text()))
|
||||
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -417,6 +420,7 @@ class LTTextBoxHorizontal(LTTextBox):
|
|||
def get_writing_mode(self):
|
||||
return 'lr-tb'
|
||||
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -437,6 +441,7 @@ class LTTextGroup(LTTextContainer):
|
|||
self.extend(objs)
|
||||
return
|
||||
|
||||
|
||||
class LTTextGroupLRTB(LTTextGroup):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -447,6 +452,7 @@ class LTTextGroupLRTB(LTTextGroup):
|
|||
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
||||
return
|
||||
|
||||
|
||||
class LTTextGroupTBRL(LTTextGroup):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -454,7 +460,7 @@ class LTTextGroupTBRL(LTTextGroup):
|
|||
# reorder the objects from top-right to bottom-left.
|
||||
self._objs = csort(self._objs, key=lambda obj:
|
||||
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
||||
-(1-laparams.boxes_flow)*(obj.y1))
|
||||
- (1-laparams.boxes_flow)*(obj.y1))
|
||||
return
|
||||
|
||||
|
||||
|
@ -506,8 +512,8 @@ class LTLayoutContainer(LTContainer):
|
|||
# |<-->|
|
||||
# (line_overlap)
|
||||
k |= 2
|
||||
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
||||
(k & 2 and isinstance(line, LTTextLineVertical)) ):
|
||||
if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
||||
(k & 2 and isinstance(line, LTTextLineVertical))):
|
||||
line.add(obj1)
|
||||
elif line is not None:
|
||||
yield line
|
||||
|
@ -555,7 +561,8 @@ class LTLayoutContainer(LTContainer):
|
|||
done = set()
|
||||
for line in lines:
|
||||
box = boxes[line]
|
||||
if box in done: continue
|
||||
if box in done:
|
||||
continue
|
||||
done.add(box)
|
||||
if not box.is_empty():
|
||||
yield box
|
||||
|
@ -563,32 +570,34 @@ class LTLayoutContainer(LTContainer):
|
|||
|
||||
def group_textboxes(self, laparams, boxes):
|
||||
assert boxes
|
||||
|
||||
def dist(obj1, obj2):
|
||||
"""A distance function between two TextBoxes.
|
||||
|
||||
Consider the bounding rectangle for obj1 and obj2.
|
||||
Return its area less the areas of obj1 and obj2,
|
||||
shown as 'www' below. This value may be negative.
|
||||
+------+..........+ (x1,y1)
|
||||
+------+..........+ (x1, y1)
|
||||
| obj1 |wwwwwwwwww:
|
||||
+------+www+------+
|
||||
:wwwwwwwwww| obj2 |
|
||||
(x0,y0) +..........+------+
|
||||
(x0, y0) +..........+------+
|
||||
"""
|
||||
x0 = min(obj1.x0,obj2.x0)
|
||||
y0 = min(obj1.y0,obj2.y0)
|
||||
x1 = max(obj1.x1,obj2.x1)
|
||||
y1 = max(obj1.y1,obj2.y1)
|
||||
x0 = min(obj1.x0, obj2.x0)
|
||||
y0 = min(obj1.y0, obj2.y0)
|
||||
x1 = max(obj1.x1, obj2.x1)
|
||||
y1 = max(obj1.y1, obj2.y1)
|
||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
|
||||
def isany(obj1, obj2):
|
||||
"""Check if there's any other object between obj1 and obj2.
|
||||
"""
|
||||
x0 = min(obj1.x0,obj2.x0)
|
||||
y0 = min(obj1.y0,obj2.y0)
|
||||
x1 = max(obj1.x1,obj2.x1)
|
||||
y1 = max(obj1.y1,obj2.y1)
|
||||
objs = set(plane.find((x0,y0,x1,y1)))
|
||||
return objs.difference((obj1,obj2))
|
||||
x0 = min(obj1.x0, obj2.x0)
|
||||
y0 = min(obj1.y0, obj2.y0)
|
||||
x1 = max(obj1.x1, obj2.x1)
|
||||
y1 = max(obj1.y1, obj2.y1)
|
||||
objs = set(plane.find((x0, y0, x1, y1)))
|
||||
return objs.difference((obj1, obj2))
|
||||
# XXX this still takes O(n^2) :(
|
||||
dists = []
|
||||
for i in xrange(len(boxes)):
|
||||
|
@ -600,23 +609,23 @@ class LTLayoutContainer(LTContainer):
|
|||
plane = Plane(self.bbox)
|
||||
plane.extend(boxes)
|
||||
while dists:
|
||||
(c,d,obj1,obj2) = dists.pop(0)
|
||||
(c, d, obj1, obj2) = dists.pop(0)
|
||||
if c == 0 and isany(obj1, obj2):
|
||||
dists.append((1,d,obj1,obj2))
|
||||
dists.append((1, d, obj1, obj2))
|
||||
continue
|
||||
if (isinstance(obj1, LTTextBoxVertical) or
|
||||
isinstance(obj1, LTTextGroupTBRL) or
|
||||
isinstance(obj2, LTTextBoxVertical) or
|
||||
isinstance(obj2, LTTextGroupTBRL)):
|
||||
group = LTTextGroupTBRL([obj1,obj2])
|
||||
group = LTTextGroupTBRL([obj1, obj2])
|
||||
else:
|
||||
group = LTTextGroupLRTB([obj1,obj2])
|
||||
group = LTTextGroupLRTB([obj1, obj2])
|
||||
plane.remove(obj1)
|
||||
plane.remove(obj2)
|
||||
# this line is optimized -- don't change without profiling
|
||||
dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ]
|
||||
dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
|
||||
for other in plane:
|
||||
dists.append((0, dist(group,other), group, other))
|
||||
dists.append((0, dist(group, other), group, other))
|
||||
dists.sort()
|
||||
plane.add(group)
|
||||
assert len(plane) == 1
|
||||
|
@ -628,21 +637,22 @@ class LTLayoutContainer(LTContainer):
|
|||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
||||
for obj in otherobjs:
|
||||
obj.analyze(laparams)
|
||||
if not textobjs: return
|
||||
if not textobjs:
|
||||
return
|
||||
textlines = list(self.get_textlines(laparams, textobjs))
|
||||
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
||||
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
|
||||
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
||||
for obj in empties:
|
||||
obj.analyze(laparams)
|
||||
textboxes = list(self.get_textboxes(laparams, textlines))
|
||||
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
||||
assert len(textlines) == sum(len(box._objs) for box in textboxes)
|
||||
if textboxes:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
assigner = IndexAssigner()
|
||||
for group in self.groups:
|
||||
group.analyze(laparams)
|
||||
assigner.run(group)
|
||||
textboxes.sort(key=lambda box:box.index)
|
||||
textboxes.sort(key=lambda box: box.index)
|
||||
self._objs = textboxes + otherobjs + empties
|
||||
return
|
||||
|
||||
|
@ -654,9 +664,9 @@ class LTFigure(LTLayoutContainer):
|
|||
def __init__(self, name, bbox, matrix):
|
||||
self.name = name
|
||||
self.matrix = matrix
|
||||
(x,y,w,h) = bbox
|
||||
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
|
||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||
(x, y, w, h) = bbox
|
||||
bbox = get_bound(apply_matrix_pt(matrix, (p, q))
|
||||
for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
|
||||
LTLayoutContainer.__init__(self, bbox)
|
||||
return
|
||||
|
||||
|
@ -666,7 +676,8 @@ class LTFigure(LTLayoutContainer):
|
|||
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||
|
||||
def analyze(self, laparams):
|
||||
if not laparams.all_texts: return
|
||||
if not laparams.all_texts:
|
||||
return
|
||||
LTLayoutContainer.analyze(self, laparams)
|
||||
return
|
||||
|
||||
|
|
|
@ -34,17 +34,18 @@ class LZWDecoder(object):
|
|||
# |-----8-bits-----|
|
||||
# |-bpos-|-bits-| |
|
||||
# | |----r----|
|
||||
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
|
||||
v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
|
||||
self.bpos += bits
|
||||
break
|
||||
else:
|
||||
# |-----8-bits-----|
|
||||
# |-bpos-|---bits----...
|
||||
# | |----r----|
|
||||
v = (v<<r) | (self.buff & ((1<<r)-1))
|
||||
v = (v << r) | (self.buff & ((1 << r)-1))
|
||||
bits -= r
|
||||
x = self.fp.read(1)
|
||||
if not x: raise EOFError
|
||||
if not x:
|
||||
raise EOFError
|
||||
self.buff = ord(x)
|
||||
self.bpos = 0
|
||||
return v
|
||||
|
@ -52,9 +53,9 @@ class LZWDecoder(object):
|
|||
def feed(self, code):
|
||||
x = ''
|
||||
if code == 256:
|
||||
self.table = [ chr(c) for c in xrange(256) ] # 0-255
|
||||
self.table.append(None) # 256
|
||||
self.table.append(None) # 257
|
||||
self.table = [chr(c) for c in xrange(256)] # 0-255
|
||||
self.table.append(None) # 256
|
||||
self.table.append(None) # 257
|
||||
self.prevbuf = ''
|
||||
self.nbits = 9
|
||||
elif code == 257:
|
||||
|
@ -97,6 +98,7 @@ class LZWDecoder(object):
|
|||
(self.nbits, code, x, self.table[258:]))
|
||||
return
|
||||
|
||||
|
||||
# lzwdecode
|
||||
def lzwdecode(data):
|
||||
"""
|
||||
|
|
|
@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
|||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||
|
||||
|
||||
class PDFColorSpace(object):
|
||||
|
||||
def __init__(self, name, ncomponents):
|
||||
|
@ -20,14 +21,14 @@ class PDFColorSpace(object):
|
|||
|
||||
|
||||
PREDEFINED_COLORSPACE = dict(
|
||||
(name, PDFColorSpace(name,n)) for (name,n) in {
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
'DeviceRGB': 3,
|
||||
'DeviceCMYK': 4,
|
||||
'DeviceGray': 1,
|
||||
'Separation': 1,
|
||||
'Indexed': 1,
|
||||
'Pattern': 1,
|
||||
}.iteritems())
|
||||
(name, PDFColorSpace(name, n)) for (name, n) in {
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
'DeviceRGB': 3,
|
||||
'DeviceCMYK': 4,
|
||||
'DeviceGray': 1,
|
||||
'Separation': 1,
|
||||
'Indexed': 1,
|
||||
'Pattern': 1,
|
||||
}.iteritems())
|
||||
|
|
|
@ -28,24 +28,31 @@ class PDFDevice(object):
|
|||
|
||||
def begin_tag(self, tag, props=None):
|
||||
return
|
||||
|
||||
def end_tag(self):
|
||||
return
|
||||
|
||||
def do_tag(self, tag, props=None):
|
||||
return
|
||||
|
||||
def begin_page(self, page, ctm):
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
return
|
||||
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
return
|
||||
|
||||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||
return
|
||||
|
||||
def render_image(self, name, stream):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, seq):
|
||||
return
|
||||
|
||||
|
@ -75,7 +82,7 @@ class PDFTextDevice(PDFDevice):
|
|||
scaling, charspace, wordspace, rise, dxscale)
|
||||
return
|
||||
|
||||
def render_string_horizontal(self, seq, matrix, (x,y),
|
||||
def render_string_horizontal(self, seq, matrix, (x, y),
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
|
@ -86,14 +93,14 @@ class PDFTextDevice(PDFDevice):
|
|||
for cid in font.decode(obj):
|
||||
if needcharspace:
|
||||
x += charspace
|
||||
x += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
x += self.render_char(translate_matrix(matrix, (x, y)),
|
||||
font, fontsize, scaling, rise, cid)
|
||||
if cid == 32 and wordspace:
|
||||
x += wordspace
|
||||
needcharspace = True
|
||||
return (x, y)
|
||||
|
||||
def render_string_vertical(self, seq, matrix, (x,y),
|
||||
def render_string_vertical(self, seq, matrix, (x, y),
|
||||
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
|
||||
needcharspace = False
|
||||
for obj in seq:
|
||||
|
@ -104,7 +111,7 @@ class PDFTextDevice(PDFDevice):
|
|||
for cid in font.decode(obj):
|
||||
if needcharspace:
|
||||
y += charspace
|
||||
y += self.render_char(translate_matrix(matrix, (x,y)),
|
||||
y += self.render_char(translate_matrix(matrix, (x, y)),
|
||||
font, fontsize, scaling, rise, cid)
|
||||
if cid == 32 and wordspace:
|
||||
y += wordspace
|
||||
|
@ -132,7 +139,8 @@ class TagExtractor(PDFDevice):
|
|||
font = textstate.font
|
||||
text = ''
|
||||
for obj in seq:
|
||||
if not isinstance(obj, str): continue
|
||||
if not isinstance(obj, str):
|
||||
continue
|
||||
chars = font.decode(obj)
|
||||
for cid in chars:
|
||||
try:
|
||||
|
@ -156,8 +164,8 @@ class TagExtractor(PDFDevice):
|
|||
def begin_tag(self, tag, props=None):
|
||||
s = ''
|
||||
if isinstance(props, dict):
|
||||
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
|
||||
in sorted(props.iteritems()) )
|
||||
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
|
||||
in sorted(props.iteritems()))
|
||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
||||
self._stack.append(tag)
|
||||
return
|
||||
|
|
|
@ -23,11 +23,24 @@ from utils import decode_text
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||
class PDFNoOutlines(PDFException): pass
|
||||
class PDFDestinationNotFound(PDFException): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
class PDFNoValidXRef(PDFSyntaxError):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNoOutlines(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFDestinationNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFEncryptionError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFPasswordIncorrect(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = LIT('ObjStm')
|
||||
|
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
|
|||
while 1:
|
||||
try:
|
||||
(pos, line) = parser.nextline()
|
||||
if not line.strip(): continue
|
||||
if not line.strip():
|
||||
continue
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
if not line:
|
||||
|
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
|
|||
if len(f) != 3:
|
||||
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
||||
(pos, genno, use) = f
|
||||
if use != 'n': continue
|
||||
if use != 'n':
|
||||
continue
|
||||
self.offsets[objid] = (None, long(pos), int(genno))
|
||||
if 1 <= debug:
|
||||
print >>sys.stderr, 'xref objects:', self.offsets
|
||||
|
@ -100,16 +115,17 @@ class PDFXRef(PDFBaseXRef):
|
|||
return
|
||||
|
||||
KEYWORD_TRAILER = KWD('trailer')
|
||||
|
||||
def load_trailer(self, parser):
|
||||
try:
|
||||
(_,kwd) = parser.nexttoken()
|
||||
(_, kwd) = parser.nexttoken()
|
||||
assert kwd is self.KEYWORD_TRAILER
|
||||
(_,dic) = parser.nextobject()
|
||||
(_, dic) = parser.nextobject()
|
||||
except PSEOF:
|
||||
x = parser.pop(1)
|
||||
if not x:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||
(_,dic) = x[0]
|
||||
(_, dic) = x[0]
|
||||
self.trailer.update(dict_value(dic))
|
||||
return
|
||||
|
||||
|
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
||||
|
||||
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||
|
||||
def load(self, parser, debug=0):
|
||||
parser.seek(0)
|
||||
while 1:
|
||||
|
@ -148,14 +165,15 @@ class PDFXRefFallback(PDFXRef):
|
|||
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
|
||||
break
|
||||
m = self.PDFOBJ_CUE.match(line)
|
||||
if not m: continue
|
||||
if not m:
|
||||
continue
|
||||
(objid, genno) = m.groups()
|
||||
objid = int(objid)
|
||||
genno = int(genno)
|
||||
self.offsets[objid] = (None, pos, genno)
|
||||
# expand ObjStm.
|
||||
parser.seek(pos)
|
||||
(_,obj) = parser.nextobject()
|
||||
(_, obj) = parser.nextobject()
|
||||
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
|
||||
stream = stream_value(obj)
|
||||
try:
|
||||
|
@ -168,7 +186,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
objs = []
|
||||
try:
|
||||
while 1:
|
||||
(_,obj) = parser1.nextobject()
|
||||
(_, obj) = parser1.nextobject()
|
||||
objs.append(obj)
|
||||
except PSEOF:
|
||||
pass
|
||||
|
@ -193,14 +211,14 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
||||
|
||||
def load(self, parser, debug=0):
|
||||
(_,objid) = parser.nexttoken() # ignored
|
||||
(_,genno) = parser.nexttoken() # ignored
|
||||
(_,kwd) = parser.nexttoken()
|
||||
(_,stream) = parser.nextobject()
|
||||
(_, objid) = parser.nexttoken() # ignored
|
||||
(_, genno) = parser.nexttoken() # ignored
|
||||
(_, kwd) = parser.nexttoken()
|
||||
(_, stream) = parser.nextobject()
|
||||
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
|
||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||
size = stream['Size']
|
||||
index_array = stream.get('Index', (1,size))
|
||||
index_array = stream.get('Index', (1, size))
|
||||
if len(index_array) % 2 != 0:
|
||||
raise PDFSyntaxError('Invalid index number')
|
||||
self.ranges.extend(choplist(2, index_array))
|
||||
|
@ -210,22 +228,22 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
self.trailer = stream.attrs
|
||||
if 1 <= debug:
|
||||
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||
(', '.join(map(repr, self.ranges)),
|
||||
self.fl1, self.fl2, self.fl3))
|
||||
(', '.join(map(repr, self.ranges)),
|
||||
self.fl1, self.fl2, self.fl3))
|
||||
return
|
||||
|
||||
def get_trailer(self):
|
||||
return self.trailer
|
||||
|
||||
def get_objids(self):
|
||||
for (start,nobjs) in self.ranges:
|
||||
for (start, nobjs) in self.ranges:
|
||||
for i in xrange(nobjs):
|
||||
yield start+i
|
||||
return
|
||||
|
||||
def get_pos(self, objid):
|
||||
index = 0
|
||||
for (start,nobjs) in self.ranges:
|
||||
for (start, nobjs) in self.ranges:
|
||||
if start <= objid and objid < start+nobjs:
|
||||
index += objid - start
|
||||
else:
|
||||
|
@ -292,7 +310,8 @@ class PDFDocument(object):
|
|||
self.xrefs.append(xref)
|
||||
for xref in self.xrefs:
|
||||
trailer = xref.get_trailer()
|
||||
if not trailer: continue
|
||||
if not trailer:
|
||||
continue
|
||||
# If there's an encryption info, remember it.
|
||||
if 'Encrypt' in trailer:
|
||||
#assert not self.encryption
|
||||
|
@ -316,6 +335,7 @@ class PDFDocument(object):
|
|||
# This step is mandatory even if there's no password associated
|
||||
# with the document.
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
def initialize(self, password=''):
|
||||
if not self.encryption:
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
|
@ -326,9 +346,9 @@ class PDFDocument(object):
|
|||
V = int_value(param.get('V', 0))
|
||||
if not (V == 1 or V == 2):
|
||||
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
|
||||
length = int_value(param.get('Length', 40)) # Key length (bits)
|
||||
length = int_value(param.get('Length', 40)) # Key length (bits)
|
||||
O = str_value(param['O'])
|
||||
R = int_value(param['R']) # Revision
|
||||
R = int_value(param['R']) # Revision
|
||||
if 5 <= R:
|
||||
raise PDFEncryptionError('Unknown revision: %r' % R)
|
||||
U = str_value(param['U'])
|
||||
|
@ -337,11 +357,11 @@ class PDFDocument(object):
|
|||
self.is_modifiable = bool(P & 8)
|
||||
self.is_extractable = bool(P & 16)
|
||||
# Algorithm 3.2
|
||||
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash.update(O) # 3
|
||||
hash.update(struct.pack('<l', P)) # 4
|
||||
hash.update(docid[0]) # 5
|
||||
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash.update(O) # 3
|
||||
hash.update(struct.pack('<l', P)) # 4
|
||||
hash.update(docid[0]) # 5
|
||||
if 4 <= R:
|
||||
# 6
|
||||
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
||||
|
@ -355,13 +375,13 @@ class PDFDocument(object):
|
|||
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
|
||||
elif R == 3:
|
||||
# Algorithm 3.5
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(docid[0]) # 3
|
||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||
for i in xrange(1,19+1):
|
||||
k = ''.join( chr(ord(c) ^ i) for c in key )
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(docid[0]) # 3
|
||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||
for i in xrange(1, 19+1):
|
||||
k = ''.join(chr(ord(c) ^ i) for c in key)
|
||||
x = Arcfour(k).process(x)
|
||||
u1 = x+x # 32bytes total
|
||||
u1 = x+x # 32bytes total
|
||||
if R == 2:
|
||||
is_authenticated = (u1 == U)
|
||||
else:
|
||||
|
@ -373,18 +393,18 @@ class PDFDocument(object):
|
|||
return
|
||||
|
||||
def decrypt_rc4(self, objid, genno, data):
|
||||
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
|
||||
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key),16)]
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return Arcfour(key).process(data)
|
||||
|
||||
def _getobj_objstm(self, stream, index, objid):
|
||||
if stream.objid in self._parsed_objs:
|
||||
(objs,n) = self._parsed_objs[stream.objid]
|
||||
(objs, n) = self._parsed_objs[stream.objid]
|
||||
else:
|
||||
(objs,n) = self._get_objects(stream)
|
||||
(objs, n) = self._get_objects(stream)
|
||||
if self.caching:
|
||||
self._parsed_objs[stream.objid] = (objs,n)
|
||||
self._parsed_objs[stream.objid] = (objs, n)
|
||||
i = n*2+index
|
||||
try:
|
||||
obj = objs[i]
|
||||
|
@ -407,23 +427,24 @@ class PDFDocument(object):
|
|||
objs = []
|
||||
try:
|
||||
while 1:
|
||||
(_,obj) = parser.nextobject()
|
||||
(_, obj) = parser.nextobject()
|
||||
objs.append(obj)
|
||||
except PSEOF:
|
||||
pass
|
||||
return (objs, n)
|
||||
|
||||
KEYWORD_OBJ = KWD('obj')
|
||||
|
||||
def _getobj_parse(self, pos, objid):
|
||||
self._parser.seek(pos)
|
||||
(_,objid1) = self._parser.nexttoken() # objid
|
||||
(_, objid1) = self._parser.nexttoken() # objid
|
||||
if objid1 != objid:
|
||||
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||
(_,genno) = self._parser.nexttoken() # genno
|
||||
(_,kwd) = self._parser.nexttoken()
|
||||
(_, genno) = self._parser.nexttoken() # genno
|
||||
(_, kwd) = self._parser.nexttoken()
|
||||
if kwd is not self.KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||
(_,obj) = self._parser.nextobject()
|
||||
(_, obj) = self._parser.nextobject()
|
||||
return obj
|
||||
|
||||
# can raise PDFObjectNotFound
|
||||
|
@ -465,6 +486,7 @@ class PDFDocument(object):
|
|||
def get_outlines(self):
|
||||
if 'Outlines' not in self.catalog:
|
||||
raise PDFNoOutlines
|
||||
|
||||
def search(entry, level):
|
||||
entry = dict_value(entry)
|
||||
if 'Title' in entry:
|
||||
|
@ -487,13 +509,15 @@ class PDFDocument(object):
|
|||
try:
|
||||
names = dict_value(self.catalog['Names'])
|
||||
except (PDFTypeError, KeyError):
|
||||
raise KeyError((cat,key))
|
||||
raise KeyError((cat, key))
|
||||
# may raise KeyError
|
||||
d0 = dict_value(names[cat])
|
||||
|
||||
def lookup(d):
|
||||
if 'Limits' in d:
|
||||
(k1,k2) = list_value(d['Limits'])
|
||||
if key < k1 or k2 < key: return None
|
||||
(k1, k2) = list_value(d['Limits'])
|
||||
if key < k1 or k2 < key:
|
||||
return None
|
||||
if 'Names' in d:
|
||||
objs = list_value(d['Names'])
|
||||
names = dict(choplist(2, objs))
|
||||
|
@ -501,8 +525,9 @@ class PDFDocument(object):
|
|||
if 'Kids' in d:
|
||||
for c in list_value(d['Kids']):
|
||||
v = lookup(dict_value(c))
|
||||
if v: return v
|
||||
raise KeyError((cat,key))
|
||||
if v:
|
||||
return v
|
||||
raise KeyError((cat, key))
|
||||
return lookup(d0)
|
||||
|
||||
def get_dest(self, name):
|
||||
|
@ -528,7 +553,8 @@ class PDFDocument(object):
|
|||
line = line.strip()
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'find_xref: %r' % line
|
||||
if line == 'startxref': break
|
||||
if line == 'startxref':
|
||||
break
|
||||
if line:
|
||||
prev = line
|
||||
else:
|
||||
|
|
|
@ -25,13 +25,13 @@ def get_widths(seq):
|
|||
if isinstance(v, list):
|
||||
if r:
|
||||
char1 = r[-1]
|
||||
for (i,w) in enumerate(v):
|
||||
for (i, w) in enumerate(v):
|
||||
widths[char1+i] = w
|
||||
r = []
|
||||
elif isinstance(v, int):
|
||||
r.append(v)
|
||||
if len(r) == 3:
|
||||
(char1,char2,w) = r
|
||||
(char1, char2, w) = r
|
||||
for i in xrange(char1, char2+1):
|
||||
widths[i] = w
|
||||
r = []
|
||||
|
@ -40,6 +40,7 @@ def get_widths(seq):
|
|||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
||||
|
||||
|
||||
def get_widths2(seq):
|
||||
widths = {}
|
||||
r = []
|
||||
|
@ -47,20 +48,20 @@ def get_widths2(seq):
|
|||
if isinstance(v, list):
|
||||
if r:
|
||||
char1 = r[-1]
|
||||
for (i,(w,vx,vy)) in enumerate(choplist(3,v)):
|
||||
widths[char1+i] = (w,(vx,vy))
|
||||
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
|
||||
widths[char1+i] = (w, (vx, vy))
|
||||
r = []
|
||||
elif isinstance(v, int):
|
||||
r.append(v)
|
||||
if len(r) == 5:
|
||||
(char1,char2,w,vx,vy) = r
|
||||
(char1, char2, w, vx, vy) = r
|
||||
for i in xrange(char1, char2+1):
|
||||
widths[i] = (w,(vx,vy))
|
||||
widths[i] = (w, (vx, vy))
|
||||
r = []
|
||||
return widths
|
||||
#assert get_widths2([1]) == {}
|
||||
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
|
||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}
|
||||
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
|
||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
|
||||
|
||||
|
||||
## FontMetricsDB
|
||||
|
@ -94,7 +95,7 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
def get_encoding(self):
|
||||
while 1:
|
||||
try:
|
||||
(cid,name) = self.nextobject()
|
||||
(cid, name) = self.nextobject()
|
||||
except PSEOF:
|
||||
break
|
||||
try:
|
||||
|
@ -105,25 +106,28 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
|
||||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_PUT:
|
||||
((_,key),(_,value)) = self.pop(2)
|
||||
((_, key), (_, value)) = self.pop(2)
|
||||
if (isinstance(key, int) and
|
||||
isinstance(value, PSLiteral)):
|
||||
self.add_results((key, literal_name(value)))
|
||||
return
|
||||
|
||||
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||
|
||||
|
||||
## CFFFont
|
||||
## (Format specified in Adobe Technical Note: #5176
|
||||
## "The Compact Font Format Specification")
|
||||
##
|
||||
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
|
||||
def getdict(data):
|
||||
d = {}
|
||||
fp = StringIO(data)
|
||||
stack = []
|
||||
while 1:
|
||||
c = fp.read(1)
|
||||
if not c: break
|
||||
if not c:
|
||||
break
|
||||
b0 = ord(c)
|
||||
if b0 <= 21:
|
||||
d[b0] = stack
|
||||
|
@ -145,19 +149,21 @@ def getdict(data):
|
|||
else:
|
||||
b1 = ord(fp.read(1))
|
||||
if 247 <= b0 and b0 <= 250:
|
||||
value = ((b0-247)<<8)+b1+108
|
||||
value = ((b0-247) << 8)+b1+108
|
||||
elif 251 <= b0 and b0 <= 254:
|
||||
value = -((b0-251)<<8)-b1-108
|
||||
value = -((b0-251) << 8)-b1-108
|
||||
else:
|
||||
b2 = ord(fp.read(1))
|
||||
if 128 <= b1: b1 -= 256
|
||||
if 128 <= b1:
|
||||
b1 -= 256
|
||||
if b0 == 28:
|
||||
value = b1<<8 | b2
|
||||
value = b1 << 8 | b2
|
||||
else:
|
||||
value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
|
||||
value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
|
||||
stack.append(value)
|
||||
return d
|
||||
|
||||
|
||||
class CFFFont(object):
|
||||
|
||||
STANDARD_STRINGS = (
|
||||
|
@ -239,7 +245,7 @@ class CFFFont(object):
|
|||
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
|
||||
'001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
|
||||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||
)
|
||||
)
|
||||
|
||||
class INDEX(object):
|
||||
|
||||
|
@ -264,13 +270,13 @@ class CFFFont(object):
|
|||
return self.fp.read(self.offsets[i+1]-self.offsets[i])
|
||||
|
||||
def __iter__(self):
|
||||
return iter( self[i] for i in xrange(len(self)) )
|
||||
return iter(self[i] for i in xrange(len(self)))
|
||||
|
||||
def __init__(self, name, fp):
|
||||
self.name = name
|
||||
self.fp = fp
|
||||
# Header
|
||||
(_major,_minor,hdrsize,offsize) = struct.unpack('BBBB', self.fp.read(4))
|
||||
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
|
||||
self.fp.read(hdrsize-4)
|
||||
# Name INDEX
|
||||
self.name_index = self.INDEX(self.fp)
|
||||
|
@ -297,7 +303,7 @@ class CFFFont(object):
|
|||
if format == '\x00':
|
||||
# Format 0
|
||||
(n,) = struct.unpack('B', self.fp.read(1))
|
||||
for (code,gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
|
||||
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
|
||||
self.code2gid[code] = gid
|
||||
self.gid2code[gid] = code
|
||||
elif format == '\x01':
|
||||
|
@ -305,8 +311,8 @@ class CFFFont(object):
|
|||
(n,) = struct.unpack('B', self.fp.read(1))
|
||||
code = 0
|
||||
for i in xrange(n):
|
||||
(first,nleft) = struct.unpack('BB', self.fp.read(2))
|
||||
for gid in xrange(first,first+nleft+1):
|
||||
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
||||
for gid in xrange(first, first+nleft+1):
|
||||
self.code2gid[code] = gid
|
||||
self.gid2code[gid] = code
|
||||
code += 1
|
||||
|
@ -320,7 +326,7 @@ class CFFFont(object):
|
|||
if format == '\x00':
|
||||
# Format 0
|
||||
n = self.nglyphs-1
|
||||
for (gid,sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
|
||||
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
|
||||
gid += 1
|
||||
name = self.getstr(sid)
|
||||
self.name2gid[name] = gid
|
||||
|
@ -330,8 +336,8 @@ class CFFFont(object):
|
|||
(n,) = struct.unpack('B', self.fp.read(1))
|
||||
sid = 0
|
||||
for i in xrange(n):
|
||||
(first,nleft) = struct.unpack('BB', self.fp.read(2))
|
||||
for gid in xrange(first,first+nleft+1):
|
||||
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
||||
for gid in xrange(first, first+nleft+1):
|
||||
name = self.getstr(sid)
|
||||
self.name2gid[name] = gid
|
||||
self.gid2name[gid] = name
|
||||
|
@ -356,7 +362,8 @@ class CFFFont(object):
|
|||
##
|
||||
class TrueTypeFont(object):
|
||||
|
||||
class CMapNotFound(Exception): pass
|
||||
class CMapNotFound(Exception):
|
||||
pass
|
||||
|
||||
def __init__(self, name, fp):
|
||||
self.name = name
|
||||
|
@ -389,15 +396,16 @@ class TrueTypeFont(object):
|
|||
elif fmttype == 2:
|
||||
subheaderkeys = struct.unpack('>256H', fp.read(512))
|
||||
firstbytes = [0]*8192
|
||||
for (i,k) in enumerate(subheaderkeys):
|
||||
for (i, k) in enumerate(subheaderkeys):
|
||||
firstbytes[k/8] = i
|
||||
nhdrs = max(subheaderkeys)/8 + 1
|
||||
hdrs = []
|
||||
for i in xrange(nhdrs):
|
||||
(firstcode,entcount,delta,offset) = struct.unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
|
||||
for (i,firstcode,entcount,delta,pos) in hdrs:
|
||||
if not entcount: continue
|
||||
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
|
||||
for (i, firstcode, entcount, delta, pos) in hdrs:
|
||||
if not entcount:
|
||||
continue
|
||||
first = firstcode + (firstbytes[i] << 8)
|
||||
fp.seek(pos)
|
||||
for c in xrange(entcount):
|
||||
|
@ -414,7 +422,7 @@ class TrueTypeFont(object):
|
|||
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
|
||||
pos = fp.tell()
|
||||
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
|
||||
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
|
||||
if idr:
|
||||
fp.seek(pos+idr)
|
||||
for c in xrange(sc, ec+1):
|
||||
|
@ -426,16 +434,19 @@ class TrueTypeFont(object):
|
|||
assert 0
|
||||
# create unicode map
|
||||
unicode_map = FileUnicodeMap()
|
||||
for (char,gid) in char2gid.iteritems():
|
||||
for (char, gid) in char2gid.iteritems():
|
||||
unicode_map.add_cid2unichr(gid, char)
|
||||
return unicode_map
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
class PDFFontError(PDFException):
|
||||
pass
|
||||
|
||||
class PDFFontError(PDFException): pass
|
||||
class PDFUnicodeNotDefined(PDFFontError): pass
|
||||
|
||||
class PDFUnicodeNotDefined(PDFFontError):
|
||||
pass
|
||||
|
||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
@ -456,7 +467,7 @@ class PDFFont(object):
|
|||
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
||||
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
|
||||
self.hscale = self.vscale = .001
|
||||
return
|
||||
|
||||
|
@ -474,6 +485,7 @@ class PDFFont(object):
|
|||
|
||||
def get_ascent(self):
|
||||
return self.ascent * self.vscale
|
||||
|
||||
def get_descent(self):
|
||||
return self.descent * self.vscale
|
||||
|
||||
|
@ -482,6 +494,7 @@ class PDFFont(object):
|
|||
if w == 0:
|
||||
w = -self.default_width
|
||||
return w * self.hscale
|
||||
|
||||
def get_height(self):
|
||||
h = self.bbox[3]-self.bbox[1]
|
||||
if h == 0:
|
||||
|
@ -501,7 +514,7 @@ class PDFFont(object):
|
|||
return 0
|
||||
|
||||
def string_width(self, s):
|
||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
|
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
|
|||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
|
@ -557,7 +571,7 @@ class PDFType1Font(PDFSimpleFont):
|
|||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 255))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
||||
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
||||
# try to recover the missing encoding info from the font file.
|
||||
|
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
|
|||
def __repr__(self):
|
||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
|
@ -584,16 +600,16 @@ class PDFType3Font(PDFSimpleFont):
|
|||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 0))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
||||
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||
if 'FontDescriptor' in spec:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
else:
|
||||
descriptor = {'Ascent':0, 'Descent':0,
|
||||
'FontBBox':spec['FontBBox']}
|
||||
descriptor = {'Ascent': 0, 'Descent': 0,
|
||||
'FontBBox': spec['FontBBox']}
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
||||
(_,self.descent,_,self.ascent) = self.bbox
|
||||
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
|
||||
(_, self.descent, _, self.ascent) = self.bbox
|
||||
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -657,10 +673,10 @@ class PDFCIDFont(PDFFont):
|
|||
if self.vertical:
|
||||
# writing mode: vertical
|
||||
widths = get_widths2(list_value(spec.get('W2', [])))
|
||||
self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() )
|
||||
(vy,w) = spec.get('DW2', [880, -1000])
|
||||
self.default_disp = (None,vy)
|
||||
widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() )
|
||||
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
|
||||
(vy, w) = spec.get('DW2', [880, -1000])
|
||||
self.default_disp = (None, vy)
|
||||
widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
|
||||
default_width = w
|
||||
else:
|
||||
# writing mode: horizontal
|
||||
|
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
|
|||
|
||||
def to_unichr(self, cid):
|
||||
try:
|
||||
if not self.unicode_map: raise KeyError(cid)
|
||||
if not self.unicode_map:
|
||||
raise KeyError(cid)
|
||||
return self.unicode_map.get_unichr(cid)
|
||||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
@ -705,4 +722,5 @@ def main(argv):
|
|||
fp.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -30,8 +30,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFResourceError(PDFException): pass
|
||||
class PDFInterpreterError(PDFException): pass
|
||||
class PDFResourceError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFInterpreterError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## Constants
|
||||
|
@ -120,6 +124,7 @@ class PDFGraphicState(object):
|
|||
(self.linewidth, self.linecap, self.linejoin,
|
||||
self.miterlimit, self.dash, self.intent, self.flatness))
|
||||
|
||||
|
||||
## Resource Manager
|
||||
##
|
||||
class PDFResourceManager(object):
|
||||
|
@ -152,7 +157,8 @@ class PDFResourceManager(object):
|
|||
try:
|
||||
return CMapDB.get_cmap(cmapname)
|
||||
except CMapDB.CMapNotFound:
|
||||
if strict: raise
|
||||
if strict:
|
||||
raise
|
||||
return CMap()
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
|
@ -195,7 +201,7 @@ class PDFResourceManager(object):
|
|||
else:
|
||||
if STRICT:
|
||||
raise PDFFontError('Invalid Font spec: %r' % spec)
|
||||
font = PDFType1Font(self, spec) # this is so wrong!
|
||||
font = PDFType1Font(self, spec) # this is so wrong!
|
||||
if objid and self.caching:
|
||||
self._cached_fonts[objid] = font
|
||||
return font
|
||||
|
@ -227,12 +233,14 @@ class PDFContentParser(PSStackParser):
|
|||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
if self.charpos < len(self.buf):
|
||||
return
|
||||
while 1:
|
||||
self.fillfp()
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if self.buf: break
|
||||
if self.buf:
|
||||
break
|
||||
self.fp = None
|
||||
self.charpos = 0
|
||||
return
|
||||
|
@ -263,7 +271,7 @@ class PDFContentParser(PSStackParser):
|
|||
except ValueError:
|
||||
data += self.buf[self.charpos:]
|
||||
self.charpos = len(self.buf)
|
||||
data = data[:-(len(target)+1)] # strip the last part
|
||||
data = data[:-(len(target)+1)] # strip the last part
|
||||
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
|
||||
return (pos, data)
|
||||
|
||||
|
@ -274,6 +282,7 @@ class PDFContentParser(PSStackParser):
|
|||
KEYWORD_BI = KWD('BI')
|
||||
KEYWORD_ID = KWD('ID')
|
||||
KEYWORD_EI = KWD('EI')
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_BI:
|
||||
# inline image within a content stream
|
||||
|
@ -283,13 +292,14 @@ class PDFContentParser(PSStackParser):
|
|||
(_, objs) = self.end_type('inline')
|
||||
if len(objs) % 2 != 0:
|
||||
raise PSTypeError('Invalid dictionary construct: %r' % objs)
|
||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
|
||||
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||
obj = PDFStream(d, data)
|
||||
self.push((pos, obj))
|
||||
self.push((pos, self.KEYWORD_EI))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
else:
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
@ -316,7 +326,9 @@ class PDFPageInterpreter(object):
|
|||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||
if not resources: return
|
||||
if not resources:
|
||||
return
|
||||
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
name = literal_name(spec[0])
|
||||
|
@ -328,23 +340,23 @@ class PDFPageInterpreter(object):
|
|||
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE.get(name)
|
||||
for (k,v) in dict_value(resources).iteritems():
|
||||
for (k, v) in dict_value(resources).iteritems():
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'Resource: %r: %r' % (k,v)
|
||||
print >>sys.stderr, 'Resource: %r: %r' % (k, v)
|
||||
if k == 'Font':
|
||||
for (fontid,spec) in dict_value(v).iteritems():
|
||||
for (fontid, spec) in dict_value(v).iteritems():
|
||||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid,spec) in dict_value(v).iteritems():
|
||||
for (csid, spec) in dict_value(v).iteritems():
|
||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||
elif k == 'ProcSet':
|
||||
self.rsrcmgr.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||
for (xobjid, xobjstrm) in dict_value(v).iteritems():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
||||
|
@ -371,7 +383,8 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def pop(self, n):
|
||||
if n == 0: return []
|
||||
if n == 0:
|
||||
return []
|
||||
x = self.argstack[-n:]
|
||||
self.argstack = self.argstack[:-n]
|
||||
return x
|
||||
|
@ -388,6 +401,7 @@ class PDFPageInterpreter(object):
|
|||
def do_q(self):
|
||||
self.gstack.append(self.get_current_state())
|
||||
return
|
||||
|
||||
# grestore
|
||||
def do_Q(self):
|
||||
if self.gstack:
|
||||
|
@ -396,7 +410,7 @@ class PDFPageInterpreter(object):
|
|||
|
||||
# concat-matrix
|
||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
||||
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
|
||||
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
|
||||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
|
@ -404,30 +418,37 @@ class PDFPageInterpreter(object):
|
|||
def do_w(self, linewidth):
|
||||
self.graphicstate.linewidth = linewidth
|
||||
return
|
||||
|
||||
# setlinecap
|
||||
def do_J(self, linecap):
|
||||
self.graphicstate.linecap = linecap
|
||||
return
|
||||
|
||||
# setlinejoin
|
||||
def do_j(self, linejoin):
|
||||
self.graphicstate.linejoin = linejoin
|
||||
return
|
||||
|
||||
# setmiterlimit
|
||||
def do_M(self, miterlimit):
|
||||
self.graphicstate.miterlimit = miterlimit
|
||||
return
|
||||
|
||||
# setdash
|
||||
def do_d(self, dash, phase):
|
||||
self.graphicstate.dash = (dash, phase)
|
||||
return
|
||||
|
||||
# setintent
|
||||
def do_ri(self, intent):
|
||||
self.graphicstate.intent = intent
|
||||
return
|
||||
|
||||
# setflatness
|
||||
def do_i(self, flatness):
|
||||
self.graphicstate.flatness = flatness
|
||||
return
|
||||
|
||||
# load-gstate
|
||||
def do_gs(self, name):
|
||||
#XXX
|
||||
|
@ -435,34 +456,40 @@ class PDFPageInterpreter(object):
|
|||
|
||||
# moveto
|
||||
def do_m(self, x, y):
|
||||
self.curpath.append(('m',x,y))
|
||||
self.curpath.append(('m', x, y))
|
||||
return
|
||||
|
||||
# lineto
|
||||
def do_l(self, x, y):
|
||||
self.curpath.append(('l',x,y))
|
||||
self.curpath.append(('l', x, y))
|
||||
return
|
||||
|
||||
# curveto
|
||||
def do_c(self, x1, y1, x2, y2, x3, y3):
|
||||
self.curpath.append(('c',x1,y1,x2,y2,x3,y3))
|
||||
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# urveto
|
||||
def do_v(self, x2, y2, x3, y3):
|
||||
self.curpath.append(('v',x2,y2,x3,y3))
|
||||
self.curpath.append(('v', x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# rveto
|
||||
def do_y(self, x1, y1, x3, y3):
|
||||
self.curpath.append(('y',x1,y1,x3,y3))
|
||||
self.curpath.append(('y', x1, y1, x3, y3))
|
||||
return
|
||||
|
||||
# closepath
|
||||
def do_h(self):
|
||||
self.curpath.append(('h',))
|
||||
return
|
||||
|
||||
# rectangle
|
||||
def do_re(self, x, y, w, h):
|
||||
self.curpath.append(('m',x,y))
|
||||
self.curpath.append(('l',x+w,y))
|
||||
self.curpath.append(('l',x+w,y+h))
|
||||
self.curpath.append(('l',x,y+h))
|
||||
self.curpath.append(('m', x, y))
|
||||
self.curpath.append(('l', x+w, y))
|
||||
self.curpath.append(('l', x+w, y+h))
|
||||
self.curpath.append(('l', x, y+h))
|
||||
self.curpath.append(('h',))
|
||||
return
|
||||
|
||||
|
@ -471,11 +498,13 @@ class PDFPageInterpreter(object):
|
|||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-and-stroke
|
||||
def do_s(self):
|
||||
self.do_h()
|
||||
self.do_S()
|
||||
return
|
||||
|
||||
# fill
|
||||
def do_f(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||
|
@ -483,68 +512,85 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
# fill (obsolete)
|
||||
do_F = do_f
|
||||
|
||||
# fill-even-odd
|
||||
def do_f_a(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke
|
||||
def do_B(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke-even-odd
|
||||
def do_B_a(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-fill-and-stroke
|
||||
def do_b(self):
|
||||
self.do_h()
|
||||
self.do_B()
|
||||
return
|
||||
|
||||
# close-fill-and-stroke-even-odd
|
||||
def do_b_a(self):
|
||||
self.do_h()
|
||||
self.do_B_a()
|
||||
return
|
||||
|
||||
# close-only
|
||||
def do_n(self):
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# clip
|
||||
def do_W(self): return
|
||||
def do_W(self):
|
||||
return
|
||||
|
||||
# clip-even-odd
|
||||
def do_W_a(self): return
|
||||
def do_W_a(self):
|
||||
return
|
||||
|
||||
# setcolorspace-stroking
|
||||
def do_CS(self, name):
|
||||
self.scs = self.csmap[literal_name(name)]
|
||||
return
|
||||
|
||||
# setcolorspace-non-stroking
|
||||
def do_cs(self, name):
|
||||
self.ncs = self.csmap[literal_name(name)]
|
||||
return
|
||||
|
||||
# setgray-stroking
|
||||
def do_G(self, gray):
|
||||
#self.do_CS(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setgray-non-stroking
|
||||
def do_g(self, gray):
|
||||
#self.do_cs(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setrgb-stroking
|
||||
def do_RG(self, r, g, b):
|
||||
#self.do_CS(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setrgb-non-stroking
|
||||
def do_rg(self, r, g, b):
|
||||
#self.do_cs(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setcmyk-stroking
|
||||
def do_K(self, c, m, y, k):
|
||||
#self.do_CS(LITERAL_DEVICE_CMYK)
|
||||
return
|
||||
|
||||
# setcmyk-non-stroking
|
||||
def do_k(self, c, m, y, k):
|
||||
#self.do_cs(LITERAL_DEVICE_CMYK)
|
||||
|
@ -560,6 +606,7 @@ class PDFPageInterpreter(object):
|
|||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
|
||||
def do_scn(self):
|
||||
if self.ncs:
|
||||
n = self.ncs.ncomponents
|
||||
|
@ -569,42 +616,53 @@ class PDFPageInterpreter(object):
|
|||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
|
||||
def do_SC(self):
|
||||
self.do_SCN()
|
||||
return
|
||||
|
||||
def do_sc(self):
|
||||
self.do_scn()
|
||||
return
|
||||
|
||||
# sharing-name
|
||||
def do_sh(self, name): return
|
||||
def do_sh(self, name):
|
||||
return
|
||||
|
||||
# begin-text
|
||||
def do_BT(self):
|
||||
self.textstate.reset()
|
||||
return
|
||||
|
||||
# end-text
|
||||
def do_ET(self):
|
||||
return
|
||||
|
||||
# begin-compat
|
||||
def do_BX(self): return
|
||||
def do_BX(self):
|
||||
return
|
||||
|
||||
# end-compat
|
||||
def do_EX(self): return
|
||||
def do_EX(self):
|
||||
return
|
||||
|
||||
# marked content operators
|
||||
def do_MP(self, tag):
|
||||
self.device.do_tag(tag)
|
||||
return
|
||||
|
||||
def do_DP(self, tag, props):
|
||||
self.device.do_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_BMC(self, tag):
|
||||
self.device.begin_tag(tag)
|
||||
return
|
||||
|
||||
def do_BDC(self, tag, props):
|
||||
self.device.begin_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_EMC(self):
|
||||
self.device.end_tag()
|
||||
return
|
||||
|
@ -613,18 +671,22 @@ class PDFPageInterpreter(object):
|
|||
def do_Tc(self, space):
|
||||
self.textstate.charspace = space
|
||||
return
|
||||
|
||||
# setwordspace
|
||||
def do_Tw(self, space):
|
||||
self.textstate.wordspace = space
|
||||
return
|
||||
|
||||
# textscale
|
||||
def do_Tz(self, scale):
|
||||
self.textstate.scaling = scale
|
||||
return
|
||||
|
||||
# setleading
|
||||
def do_TL(self, leading):
|
||||
self.textstate.leading = -leading
|
||||
return
|
||||
|
||||
# selectfont
|
||||
def do_Tf(self, fontid, fontsize):
|
||||
try:
|
||||
|
@ -635,10 +697,12 @@ class PDFPageInterpreter(object):
|
|||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||
self.textstate.fontsize = fontsize
|
||||
return
|
||||
|
||||
# setrendering
|
||||
def do_Tr(self, render):
|
||||
self.textstate.render = render
|
||||
return
|
||||
|
||||
# settextrise
|
||||
def do_Ts(self, rise):
|
||||
self.textstate.rise = rise
|
||||
|
@ -646,49 +710,55 @@ class PDFPageInterpreter(object):
|
|||
|
||||
# text-move
|
||||
def do_Td(self, tx, ty):
|
||||
(a,b,c,d,e,f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate)
|
||||
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# text-move
|
||||
def do_TD(self, tx, ty):
|
||||
(a,b,c,d,e,f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||
self.textstate.leading = ty
|
||||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate)
|
||||
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# textmatrix
|
||||
def do_Tm(self, a,b,c,d,e,f):
|
||||
self.textstate.matrix = (a,b,c,d,e,f)
|
||||
def do_Tm(self, a, b, c, d, e, f):
|
||||
self.textstate.matrix = (a, b, c, d, e, f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
# nextline
|
||||
def do_T_a(self):
|
||||
(a,b,c,d,e,f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f)
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
# show-pos
|
||||
def do_TJ(self, seq):
|
||||
#print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
|
||||
if self.textstate.font is None:
|
||||
if STRICT:
|
||||
raise PDFInterpreterError('No font specified!')
|
||||
return
|
||||
self.device.render_string(self.textstate, seq)
|
||||
return
|
||||
|
||||
# show
|
||||
def do_Tj(self, s):
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# quote
|
||||
def do__q(self, s):
|
||||
self.do_T_a()
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# doublequote
|
||||
def do__w(self, aw, ac, s):
|
||||
self.do_Tw(aw)
|
||||
|
@ -697,14 +767,16 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
# inline image
|
||||
def do_BI(self): # never called
|
||||
def do_BI(self): # never called
|
||||
return
|
||||
def do_ID(self): # never called
|
||||
|
||||
def do_ID(self): # never called
|
||||
return
|
||||
|
||||
def do_EI(self, obj):
|
||||
if 'W' in obj and 'H' in obj:
|
||||
iobjid = str(id(obj))
|
||||
self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY)
|
||||
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
self.device.render_image(iobjid, obj)
|
||||
self.device.end_figure(iobjid)
|
||||
return
|
||||
|
@ -733,7 +805,7 @@ class PDFPageInterpreter(object):
|
|||
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
||||
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
self.device.render_image(xobjid, xobj)
|
||||
self.device.end_figure(xobjid)
|
||||
else:
|
||||
|
@ -744,15 +816,15 @@ class PDFPageInterpreter(object):
|
|||
def process_page(self, page):
|
||||
if 1 <= self.debug:
|
||||
print >>sys.stderr, 'Processing page: %r' % page
|
||||
(x0,y0,x1,y1) = page.mediabox
|
||||
(x0, y0, x1, y1) = page.mediabox
|
||||
if page.rotate == 90:
|
||||
ctm = (0,-1,1,0, -y0,x1)
|
||||
ctm = (0, -1, 1, 0, -y0, x1)
|
||||
elif page.rotate == 180:
|
||||
ctm = (-1,0,0,-1, x1,y1)
|
||||
ctm = (-1, 0, 0, -1, x1, y1)
|
||||
elif page.rotate == 270:
|
||||
ctm = (0,1,-1,0, y1,-x0)
|
||||
ctm = (0, 1, -1, 0, y1, -x0)
|
||||
else:
|
||||
ctm = (1,0,0,1, -x0,-y0)
|
||||
ctm = (1, 0, 0, 1, -x0, -y0)
|
||||
self.device.begin_page(page, ctm)
|
||||
self.render_contents(page.resources, page.contents, ctm=ctm)
|
||||
self.device.end_page(page)
|
||||
|
@ -764,7 +836,7 @@ class PDFPageInterpreter(object):
|
|||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||
if 1 <= self.debug:
|
||||
print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
|
||||
(resources, streams, ctm))
|
||||
(resources, streams, ctm))
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(streams))
|
||||
|
@ -778,12 +850,12 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
while 1:
|
||||
try:
|
||||
(_,obj) = parser.nextobject()
|
||||
(_, obj) = parser.nextobject()
|
||||
except PSEOF:
|
||||
break
|
||||
if isinstance(obj, PSKeyword):
|
||||
name = keyword_name(obj)
|
||||
method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q')
|
||||
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
|
||||
if hasattr(self, method):
|
||||
func = getattr(self, method)
|
||||
nargs = func.func_code.co_argcount-1
|
||||
|
|
|
@ -63,7 +63,7 @@ class PDFPage(object):
|
|||
else:
|
||||
contents = []
|
||||
if not isinstance(contents, list):
|
||||
contents = [ contents ]
|
||||
contents = [contents]
|
||||
self.contents = contents
|
||||
return
|
||||
|
||||
|
@ -71,6 +71,7 @@ class PDFPage(object):
|
|||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||
|
||||
@classmethod
|
||||
def create_pages(klass, document, debug=0):
|
||||
def search(obj, parent):
|
||||
|
@ -80,7 +81,7 @@ class PDFPage(object):
|
|||
else:
|
||||
objid = obj.objid
|
||||
tree = dict_value(obj).copy()
|
||||
for (k,v) in parent.iteritems():
|
||||
for (k, v) in parent.iteritems():
|
||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||
|
@ -95,7 +96,7 @@ class PDFPage(object):
|
|||
yield (objid, tree)
|
||||
pages = False
|
||||
if 'Pages' in document.catalog:
|
||||
for (objid,tree) in search(document.catalog['Pages'], document.catalog):
|
||||
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
|
||||
yield klass(document, objid, tree)
|
||||
pages = True
|
||||
if not pages:
|
||||
|
@ -110,7 +111,8 @@ class PDFPage(object):
|
|||
pass
|
||||
return
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def get_pages(klass, fp,
|
||||
|
@ -127,8 +129,10 @@ class PDFPage(object):
|
|||
if check_extractable and not doc.is_extractable:
|
||||
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
# Process each page contained in the document.
|
||||
for (pageno,page) in enumerate(klass.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos):
|
||||
continue
|
||||
yield page
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
if maxpages and maxpages <= pageno+1:
|
||||
break
|
||||
return
|
||||
|
|
|
@ -15,7 +15,8 @@ from pdftypes import str_value, list_value, dict_value, stream_value
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFSyntaxError(PDFException): pass
|
||||
class PDFSyntaxError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFParser
|
||||
|
@ -55,6 +56,7 @@ class PDFParser(PSStackParser):
|
|||
KEYWORD_STREAM = KWD('stream')
|
||||
KEYWORD_XREF = KWD('xref')
|
||||
KEYWORD_STARTXREF = KWD('startxref')
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
"""Handles PDF-related keywords."""
|
||||
|
||||
|
@ -71,7 +73,7 @@ class PDFParser(PSStackParser):
|
|||
elif token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
((_,objid), (_,genno)) = self.pop(2)
|
||||
((_, objid), (_, genno)) = self.pop(2)
|
||||
(objid, genno) = (int(objid), int(genno))
|
||||
obj = PDFObjRef(self.doc, objid, genno)
|
||||
self.push((pos, obj))
|
||||
|
@ -80,7 +82,7 @@ class PDFParser(PSStackParser):
|
|||
|
||||
elif token is self.KEYWORD_STREAM:
|
||||
# stream object
|
||||
((_,dic),) = self.pop(1)
|
||||
((_, dic),) = self.pop(1)
|
||||
dic = dict_value(dic)
|
||||
objlen = 0
|
||||
if not self.fallback:
|
||||
|
@ -118,7 +120,7 @@ class PDFParser(PSStackParser):
|
|||
# XXX limit objlen not to exceed object boundary
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||
(pos, objlen, dic, data[:10])
|
||||
(pos, objlen, dic, data[:10])
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push((pos, obj))
|
||||
|
||||
|
@ -153,7 +155,7 @@ class PDFStreamParser(PDFParser):
|
|||
if token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
((_,objid), (_,genno)) = self.pop(2)
|
||||
((_, objid), (_, genno)) = self.pop(2)
|
||||
(objid, genno) = (int(objid), int(genno))
|
||||
obj = PDFObjRef(self.doc, objid, genno)
|
||||
self.push((pos, obj))
|
||||
|
|
|
@ -23,13 +23,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
|||
|
||||
## PDF Objects
|
||||
##
|
||||
class PDFObject(PSObject): pass
|
||||
class PDFObject(PSObject):
|
||||
pass
|
||||
|
||||
class PDFException(PSException): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFObjectNotFound(PDFException): pass
|
||||
class PDFNotImplementedError(PDFException): pass
|
||||
|
||||
class PDFException(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFTypeError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFValueError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFObjectNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNotImplementedError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
|
@ -66,6 +81,7 @@ def resolve1(x, default=None):
|
|||
x = x.resolve(default=default)
|
||||
return x
|
||||
|
||||
|
||||
def resolve_all(x, default=None):
|
||||
"""Recursively resolves the given object and all the internals.
|
||||
|
||||
|
@ -75,24 +91,26 @@ def resolve_all(x, default=None):
|
|||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve(default=default)
|
||||
if isinstance(x, list):
|
||||
x = [ resolve_all(v, default=default) for v in x ]
|
||||
x = [resolve_all(v, default=default) for v in x]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
for (k, v) in x.iteritems():
|
||||
x[k] = resolve_all(v, default=default)
|
||||
return x
|
||||
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
|
||||
"""Recursively deciphers the given object.
|
||||
"""
|
||||
if isinstance(x, str):
|
||||
return decipher(objid, genno, x)
|
||||
if isinstance(x, list):
|
||||
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
||||
x = [decipher_all(decipher, objid, genno, v) for v in x]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
for (k, v) in x.iteritems():
|
||||
x[k] = decipher_all(decipher, objid, genno, v)
|
||||
return x
|
||||
|
||||
|
||||
# Type checking
|
||||
def int_value(x):
|
||||
x = resolve1(x)
|
||||
|
@ -102,6 +120,7 @@ def int_value(x):
|
|||
return 0
|
||||
return x
|
||||
|
||||
|
||||
def float_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, float):
|
||||
|
@ -110,6 +129,7 @@ def float_value(x):
|
|||
return 0.0
|
||||
return x
|
||||
|
||||
|
||||
def num_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, int) or isinstance(x, float)):
|
||||
|
@ -118,6 +138,7 @@ def num_value(x):
|
|||
return 0
|
||||
return x
|
||||
|
||||
|
||||
def str_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, str):
|
||||
|
@ -126,6 +147,7 @@ def str_value(x):
|
|||
return ''
|
||||
return x
|
||||
|
||||
|
||||
def list_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||
|
@ -134,6 +156,7 @@ def list_value(x):
|
|||
return []
|
||||
return x
|
||||
|
||||
|
||||
def dict_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, dict):
|
||||
|
@ -142,6 +165,7 @@ def dict_value(x):
|
|||
return {}
|
||||
return x
|
||||
|
||||
|
||||
def stream_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, PDFStream):
|
||||
|
@ -195,12 +219,14 @@ class PDFStream(PDFObject):
|
|||
|
||||
def get_filters(self):
|
||||
filters = self.get_any(('F', 'Filter'))
|
||||
if not filters: return []
|
||||
if isinstance(filters, list): return filters
|
||||
return [ filters ]
|
||||
if not filters:
|
||||
return []
|
||||
if isinstance(filters, list):
|
||||
return filters
|
||||
return [filters]
|
||||
|
||||
def decode(self):
|
||||
assert self.data is None and self.rawdata != None
|
||||
assert self.data is None and self.rawdata is not None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
|
|
|
@ -8,11 +8,24 @@ STRICT = 0
|
|||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception): pass
|
||||
class PSEOF(PSException): pass
|
||||
class PSSyntaxError(PSException): pass
|
||||
class PSTypeError(PSException): pass
|
||||
class PSValueError(PSException): pass
|
||||
class PSException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class PSEOF(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSSyntaxError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSTypeError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSValueError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
## Basic PostScript Types
|
||||
|
@ -114,6 +127,7 @@ def literal_name(x):
|
|||
return str(x)
|
||||
return x.name
|
||||
|
||||
|
||||
def keyword_name(x):
|
||||
if not isinstance(x, PSKeyword):
|
||||
if STRICT:
|
||||
|
@ -136,7 +150,9 @@ END_NUMBER = re.compile(r'[^0-9]')
|
|||
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||
END_STRING = re.compile(r'[()\134]')
|
||||
OCT_STRING = re.compile(r'[0-7]')
|
||||
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
||||
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
|
||||
|
||||
|
||||
class PSBaseParser(object):
|
||||
|
||||
"""Most basic PostScript parser that performs only tokenization.
|
||||
|
@ -190,7 +206,8 @@ class PSBaseParser(object):
|
|||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
if self.charpos < len(self.buf):
|
||||
return
|
||||
# fetch next chunk.
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
|
@ -242,7 +259,8 @@ class PSBaseParser(object):
|
|||
pos = max(0, pos-self.BUFSIZ)
|
||||
self.fp.seek(pos)
|
||||
s = self.fp.read(prevpos-pos)
|
||||
if not s: break
|
||||
if not s:
|
||||
break
|
||||
while 1:
|
||||
n = max(s.rfind('\r'), s.rfind('\n'))
|
||||
if n == -1:
|
||||
|
@ -407,7 +425,7 @@ class PSBaseParser(object):
|
|||
return j+1
|
||||
if c == ')':
|
||||
self.paren -= 1
|
||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||
self._curtoken += c
|
||||
return j+1
|
||||
self._add_token(self._curtoken)
|
||||
|
@ -520,7 +538,7 @@ class PSStackParser(PSBaseParser):
|
|||
def end_type(self, type):
|
||||
if self.curtype != type:
|
||||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||
objs = [ obj for (_,obj) in self.curstack ]
|
||||
objs = [obj for (_, obj) in self.curstack]
|
||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
|
||||
|
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
|
|||
try:
|
||||
self.push(self.end_type('a'))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_DICT_BEGIN:
|
||||
# begin dictionary
|
||||
self.start_type(pos, 'd')
|
||||
|
@ -564,10 +583,11 @@ class PSStackParser(PSBaseParser):
|
|||
if len(objs) % 2 != 0:
|
||||
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
|
||||
# construct a Python dictionary.
|
||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
|
||||
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
|
||||
self.push((pos, d))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_PROC_BEGIN:
|
||||
# begin proc
|
||||
self.start_type(pos, 'p')
|
||||
|
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
|
|||
try:
|
||||
self.push(self.end_type('p'))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
else:
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||
|
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
|
|||
return obj
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
## Simplistic Test cases
|
||||
##
|
||||
import unittest
|
||||
class TestPSBaseParser(unittest.TestCase):
|
||||
|
||||
TESTDATA = r'''%!PS
|
||||
|
@ -630,7 +653,7 @@ func/a/b{(c)do*}def
|
|||
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')),
|
||||
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'),
|
||||
(272, KWD('>>'))
|
||||
]
|
||||
]
|
||||
|
||||
OBJS = [
|
||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
|
@ -641,10 +664,11 @@ func/a/b{(c)do*}def
|
|||
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
|
||||
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
|
||||
(258, {'foo': 'bar'}),
|
||||
]
|
||||
]
|
||||
|
||||
def get_tokens(self, s):
|
||||
import StringIO
|
||||
|
||||
class MyParser(PSBaseParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
|
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
|
|||
|
||||
def get_objects(self, s):
|
||||
import StringIO
|
||||
|
||||
class MyParser(PSStackParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
|
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
|
|||
self.assertEqual(objs, self.OBJS)
|
||||
return
|
||||
|
||||
if __name__ == '__main__': unittest.main()
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
1326
pdfminer/rijndael.py
1326
pdfminer/rijndael.py
File diff suppressed because it is too large
Load Diff
|
@ -8,6 +8,7 @@
|
|||
|
||||
import sys
|
||||
|
||||
|
||||
def rldecode(data):
|
||||
"""
|
||||
RunLength decoder (Adobe version) implementation based on PDF Reference
|
||||
|
@ -26,7 +27,7 @@ def rldecode(data):
|
|||
'1234567777777abcde'
|
||||
"""
|
||||
decoded = []
|
||||
i=0
|
||||
i = 0
|
||||
while i < len(data):
|
||||
#print "data[%d]=:%d:" % (i,ord(data[i]))
|
||||
length = ord(data[i])
|
||||
|
|
|
@ -32,13 +32,13 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
|||
buf += chr(c)
|
||||
elif pred == '\x02':
|
||||
# PNG up
|
||||
for (a,b) in zip(line0,line1):
|
||||
for (a, b) in zip(line0, line1):
|
||||
c = (ord(a)+ord(b)) & 255
|
||||
buf += chr(c)
|
||||
elif pred == '\x03':
|
||||
# PNG average (UNTESTED)
|
||||
c = 0
|
||||
for (a,b) in zip(line0,line1):
|
||||
for (a, b) in zip(line0, line1):
|
||||
c = ((c+ord(a)+ord(b))/2) & 255
|
||||
buf += chr(c)
|
||||
else:
|
||||
|
@ -52,21 +52,25 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
|||
##
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||
|
||||
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
|
||||
"""Returns the multiplication of two matrices."""
|
||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||
|
||||
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
||||
"""Translates a matrix by (x,y)."""
|
||||
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
||||
|
||||
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
||||
def translate_matrix((a, b, c, d, e, f), (x, y)):
|
||||
"""Translates a matrix by (x, y)."""
|
||||
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
|
||||
|
||||
|
||||
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
|
||||
"""Applies a matrix to a point."""
|
||||
return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||
|
||||
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
|
||||
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
||||
return (a*p+c*q, b*p+d*q)
|
||||
|
||||
|
@ -79,17 +83,20 @@ def uniq(objs):
|
|||
"""Eliminates duplicated elements."""
|
||||
done = set()
|
||||
for obj in objs:
|
||||
if obj in done: continue
|
||||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
yield obj
|
||||
return
|
||||
|
||||
|
||||
# csort
|
||||
def csort(objs, key=lambda x:x):
|
||||
def csort(objs, key=lambda x: x):
|
||||
"""Order-preserving sorting function."""
|
||||
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
|
||||
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
||||
|
||||
|
||||
# fsplit
|
||||
def fsplit(pred, objs):
|
||||
"""Split a list into two classes according to the predicate."""
|
||||
|
@ -100,7 +107,8 @@ def fsplit(pred, objs):
|
|||
t.append(obj)
|
||||
else:
|
||||
f.append(obj)
|
||||
return (t,f)
|
||||
return (t, f)
|
||||
|
||||
|
||||
# drange
|
||||
def drange(v0, v1, d):
|
||||
|
@ -108,16 +116,18 @@ def drange(v0, v1, d):
|
|||
assert v0 < v1
|
||||
return xrange(int(v0)/d, int(v1+d)/d)
|
||||
|
||||
|
||||
# get_bound
|
||||
def get_bound(pts):
|
||||
"""Compute a minimal rectangle that covers all the points."""
|
||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||
for (x,y) in pts:
|
||||
for (x, y) in pts:
|
||||
x0 = min(x0, x)
|
||||
y0 = min(y0, y)
|
||||
x1 = max(x1, x)
|
||||
y1 = max(y1, y)
|
||||
return (x0,y0,x1,y1)
|
||||
return (x0, y0, x1, y1)
|
||||
|
||||
|
||||
# pick
|
||||
def pick(seq, func, maxobj=None):
|
||||
|
@ -126,9 +136,10 @@ def pick(seq, func, maxobj=None):
|
|||
for obj in seq:
|
||||
score = func(obj)
|
||||
if maxscore is None or maxscore < score:
|
||||
(maxscore,maxobj) = (score,obj)
|
||||
(maxscore, maxobj) = (score, obj)
|
||||
return maxobj
|
||||
|
||||
|
||||
# choplist
|
||||
def choplist(n, seq):
|
||||
"""Groups every n elements of the list."""
|
||||
|
@ -140,6 +151,7 @@ def choplist(n, seq):
|
|||
r = []
|
||||
return
|
||||
|
||||
|
||||
# nunpack
|
||||
def nunpack(s, default=0):
|
||||
"""Unpacks 1 to 4 byte integers (big endian)."""
|
||||
|
@ -157,59 +169,65 @@ def nunpack(s, default=0):
|
|||
else:
|
||||
raise TypeError('invalid length: %d' % l)
|
||||
|
||||
|
||||
# decode_text
|
||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
||||
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
||||
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
||||
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
||||
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
||||
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
||||
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
||||
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
||||
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
||||
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
||||
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
||||
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
||||
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
||||
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
||||
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
||||
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
||||
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
||||
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
||||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
PDFDocEncoding = ''.join(unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
||||
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
||||
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
||||
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
||||
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
||||
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
||||
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
||||
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
||||
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
||||
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
||||
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
||||
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
||||
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
||||
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
||||
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
||||
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
||||
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
||||
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
||||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
))
|
||||
|
||||
|
||||
def decode_text(s):
|
||||
"""Decodes a PDFDocEncoding string to Unicode."""
|
||||
if s.startswith('\xfe\xff'):
|
||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||
else:
|
||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||
return ''.join(PDFDocEncoding[ord(c)] for c in s)
|
||||
|
||||
|
||||
# enc
|
||||
def enc(x, codec='ascii'):
|
||||
"""Encodes a string for SGML/XML/HTML"""
|
||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
|
||||
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
|
||||
return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
||||
def bbox2str((x0,y0,x1,y1)):
|
||||
|
||||
def bbox2str((x0, y0, x1, y1)):
|
||||
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||
|
||||
def matrix2str((a,b,c,d,e,f)):
|
||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f)
|
||||
|
||||
def matrix2str((a, b, c, d, e, f)):
|
||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
||||
|
||||
|
||||
## Plane
|
||||
|
@ -240,14 +258,14 @@ class Plane(object):
|
|||
def __contains__(self, obj):
|
||||
return obj in self._objs
|
||||
|
||||
def _getrange(self, (x0,y0,x1,y1)):
|
||||
def _getrange(self, (x0, y0, x1, y1)):
|
||||
x0 = max(self.x0, x0)
|
||||
y0 = max(self.y0, y0)
|
||||
x1 = min(self.x1, x1)
|
||||
y1 = min(self.y1, y1)
|
||||
for y in drange(y0, y1, self.gridsize):
|
||||
for x in drange(x0, x1, self.gridsize):
|
||||
yield (x,y)
|
||||
yield (x, y)
|
||||
return
|
||||
|
||||
# extend(objs)
|
||||
|
@ -279,14 +297,17 @@ class Plane(object):
|
|||
return
|
||||
|
||||
# find(): finds objects that are in a certain area.
|
||||
def find(self, (x0,y0,x1,y1)):
|
||||
def find(self, (x0, y0, x1, y1)):
|
||||
done = set()
|
||||
for k in self._getrange((x0,y0,x1,y1)):
|
||||
if k not in self._grid: continue
|
||||
for k in self._getrange((x0, y0, x1, y1)):
|
||||
if k not in self._grid:
|
||||
continue
|
||||
for obj in self._grid[k]:
|
||||
if obj in done: continue
|
||||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||
obj.y1 <= y0 or y1 <= obj.y0):
|
||||
continue
|
||||
yield obj
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue