PEP8: Whitespace changes to match pep8

pull/1/head
Matthew Duggan 2013-11-07 17:35:04 +09:00
parent c1da8b835c
commit 2caa5edc25
22 changed files with 1395 additions and 1125 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
__version__ = '20131022'
if __name__ == '__main__': print __version__
if __name__ == '__main__':
print __version__

View File

@ -6,6 +6,7 @@ This code is in the public domain.
"""
## Arcfour
##
class Arcfour(object):

View File

@ -9,6 +9,7 @@ This code is in the public domain.
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
"""
@ -35,7 +36,7 @@ def ascii85decode(data):
n += 1
b = b*85+(ord(c)-33)
if n == 5:
out += struct.pack('>L',b)
out += struct.pack('>L', b)
n = b = 0
elif c == 'z':
assert n == 0
@ -44,13 +45,15 @@ def ascii85decode(data):
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack('>L',b)[:n-1]
out += struct.pack('>L', b)[:n-1]
break
return out
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1

View File

@ -25,10 +25,11 @@ class BitParser(object):
@classmethod
def add(klass, root, v, bits):
p = root
b = None
for i in xrange(len(bits)):
if 0 < i:
if p[b] is None:
p[b] = [None,None]
p[b] = [None, None]
p = p[b]
if bits[i] == '1':
b = 1
@ -40,7 +41,7 @@ class BitParser(object):
def feedbytes(self, data):
for c in data:
b = ord(c)
for m in (128,64,32,16,8,4,2,1):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
return
@ -61,7 +62,7 @@ class BitParser(object):
##
class CCITTG4Parser(BitParser):
MODE = [None,None]
MODE = [None, None]
BitParser.add(MODE, 0, '1')
BitParser.add(MODE, +1, '011')
BitParser.add(MODE, -1, '010')
@ -81,7 +82,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(MODE, 'x7', '0000001110')
BitParser.add(MODE, 'e', '000000000001000000000001')
WHITE = [None,None]
WHITE = [None, None]
BitParser.add(WHITE, 0 , '00110101')
BitParser.add(WHITE, 1 , '000111')
BitParser.add(WHITE, 2 , '0111')
@ -187,7 +188,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(WHITE, 2496, '000000011110')
BitParser.add(WHITE, 2560, '000000011111')
BLACK = [None,None]
BLACK = [None, None]
BitParser.add(BLACK, 0 , '0000110111')
BitParser.add(BLACK, 1 , '010')
BitParser.add(BLACK, 2 , '11')
@ -293,25 +294,30 @@ class CCITTG4Parser(BitParser):
BitParser.add(BLACK, 2496, '000000011110')
BitParser.add(BLACK, 2560, '000000011111')
UNCOMPRESSED = [None,None]
BitParser.add(UNCOMPRESSED, '1' , '1')
BitParser.add(UNCOMPRESSED, '01' , '01')
BitParser.add(UNCOMPRESSED, '001' , '001')
BitParser.add(UNCOMPRESSED, '0001' , '0001')
BitParser.add(UNCOMPRESSED, '00001' , '00001')
BitParser.add(UNCOMPRESSED, '00000' , '000001')
BitParser.add(UNCOMPRESSED, 'T00' , '00000011')
BitParser.add(UNCOMPRESSED, 'T10' , '00000010')
BitParser.add(UNCOMPRESSED, 'T000' , '000000011')
BitParser.add(UNCOMPRESSED, 'T100' , '000000010')
BitParser.add(UNCOMPRESSED, 'T0000' , '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010')
UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1', '1')
BitParser.add(UNCOMPRESSED, '01', '01')
BitParser.add(UNCOMPRESSED, '001', '001')
BitParser.add(UNCOMPRESSED, '0001', '0001')
BitParser.add(UNCOMPRESSED, '00001', '00001')
BitParser.add(UNCOMPRESSED, '00000', '000001')
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
class EOFB(Exception): pass
class InvalidData(Exception): pass
class ByteSkip(Exception): pass
class EOFB(Exception):
pass
class InvalidData(Exception):
pass
class ByteSkip(Exception):
pass
def __init__(self, width, bytealign=False):
BitParser.__init__(self)
@ -324,7 +330,7 @@ class CCITTG4Parser(BitParser):
for c in data:
b = ord(c)
try:
for m in (128,64,32,16,8,4,2,1):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
except self.ByteSkip:
self._accept = self._parse_mode
@ -358,7 +364,8 @@ class CCITTG4Parser(BitParser):
raise self.InvalidData(mode)
def _parse_horiz1(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n1 += n
if n < 64:
self._n2 = 0
@ -370,7 +377,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_horiz2(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n2 += n
if n < 64:
self._color = 1-self._color
@ -384,7 +392,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_uncompressed(self, bits):
if not bits: raise self.InvalidData
if not bits:
raise self.InvalidData
if bits.startswith('T'):
self._accept = self._parse_mode
self._color = int(bits[1])
@ -395,17 +404,17 @@ class CCITTG4Parser(BitParser):
return self.UNCOMPRESSED
def _get_bits(self):
return ''.join( str(b) for b in self._curline[:self._curpos] )
return ''.join(str(b) for b in self._curline[:self._curpos])
def _get_refline(self, i):
if i < 0:
return '[]'+''.join( str(b) for b in self._refline )
return '[]'+''.join(str(b) for b in self._refline)
elif len(self._refline) <= i:
return ''.join( str(b) for b in self._refline )+'[]'
return ''.join(str(b) for b in self._refline)+'[]'
else:
return (''.join( str(b) for b in self._refline[:i] )+
'['+str(self._refline[i])+']'+
''.join( str(b) for b in self._refline[i+1:] ))
return (''.join(str(b) for b in self._refline[:i]) +
'['+str(self._refline[i])+']' +
''.join(str(b) for b in self._refline[i+1:]))
def reset(self):
self._y = 0
@ -416,7 +425,7 @@ class CCITTG4Parser(BitParser):
return
def output_line(self, y, bits):
print y, ''.join( str(b) for b in bits )
print y, ''.join(str(b) for b in bits)
return
def _reset_line(self):
@ -441,12 +450,13 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
x1 += dx
x0 = max(0, self._curpos)
@ -467,21 +477,23 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
while 1:
if x1 == 0:
if (self._color == 0 and
self._refline[x1] == self._color): break
if (self._color == 0 and self._refline[x1] == self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color): break
self._refline[x1] == self._color):
break
x1 += 1
for x in xrange(self._curpos, x1):
self._curline[x] = self._color
@ -494,11 +506,13 @@ class CCITTG4Parser(BitParser):
self._curpos = 0
x = self._curpos
for _ in xrange(n1):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = self._color
x += 1
for _ in xrange(n2):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = 1-self._color
x += 1
self._curpos = x
@ -512,15 +526,16 @@ class CCITTG4Parser(BitParser):
self._flush_line()
return
import unittest
## Test cases
##
import unittest
class TestCCITTG4Parser(unittest.TestCase):
def get_parser(self, bits):
parser = CCITTG4Parser(len(bits))
parser._curline = [ int(c) for c in bits ]
parser._curline = [int(c) for c in bits]
parser._reset_line()
return parser
@ -655,7 +670,7 @@ class TestCCITTG4Parser(unittest.TestCase):
parser._do_vertical(-1)
parser._do_vertical(-1)
parser._do_vertical(1)
parser._do_horizontal(1,1)
parser._do_horizontal(1, 1)
self.assertEqual(parser._get_bits(), '011101')
return
@ -685,10 +700,10 @@ class CCITTFaxDecoder(CCITTG4Parser):
def output_line(self, y, bits):
bytes = array.array('B', [0]*((len(bits)+7)/8))
if self.reversed:
bits = [ 1-b for b in bits ]
for (i,b) in enumerate(bits):
bits = [1-b for b in bits]
for (i, b) in enumerate(bits):
if b:
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8]
bytes[i/8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
self._buf += bytes.tostring()
return
@ -711,28 +726,32 @@ def main(argv):
import pygame
if not argv[1:]:
return unittest.main()
class Parser(CCITTG4Parser):
def __init__(self, width, bytealign=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width,1000))
self.img = pygame.Surface((self.width, 1000))
return
def output_line(self, y, bits):
for (x,b) in enumerate(bits):
for (x, b) in enumerate(bits):
if b:
self.img.set_at((x,y), (255,255,255))
self.img.set_at((x, y), (255, 255, 255))
else:
self.img.set_at((x,y), (0,0,0))
self.img.set_at((x, y), (0, 0, 0))
return
def close(self):
pygame.image.save(self.img, 'out.bmp')
return
for path in argv[1:]:
fp = file(path,'rb')
(_,_,k,w,h,_) = path.split('.')
fp = file(path, 'rb')
(_, _, k, w, h, _) = path.split('.')
parser = Parser(int(w))
parser.feedbytes(fp.read())
parser.close()
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -26,7 +26,8 @@ from encodingdb import name2unicode
from utils import choplist, nunpack
class CMapError(Exception): pass
class CMapError(Exception):
pass
## CMap
@ -44,8 +45,9 @@ class CMap(object):
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k,v) in src.iteritems():
for (k, v) in src.iteritems():
if isinstance(v, dict):
d = {}
dst[k] = d
@ -74,10 +76,10 @@ class CMap(object):
if code2cid is None:
code2cid = self.code2cid
code = ()
for (k,v) in sorted(code2cid.iteritems()):
for (k, v) in sorted(code2cid.iteritems()):
c = code+(k,)
if isinstance(v, int):
out.write('code %r = cid %d\n' % (c,v))
out.write('code %r = cid %d\n' % (c, v))
else:
self.dump(out=out, code2cid=v, code=c)
return
@ -102,7 +104,6 @@ class IdentityCMap(object):
return ()
## UnicodeMap
##
class UnicodeMap(object):
@ -119,8 +120,8 @@ class UnicodeMap(object):
return self.cid2unichr[cid]
def dump(self, out=sys.stdout):
for (k,v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k,v))
for (k, v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k, v))
return
@ -153,7 +154,7 @@ class FileCMap(CMap):
else:
t = {}
d[c] = t
d =t
d = t
c = ord(code[-1])
d[c] = cid
return
@ -232,17 +233,16 @@ class CMapDB(object):
_cmap_cache = {}
_umap_cache = {}
class CMapNotFound(CMapError): pass
class CMapNotFound(CMapError):
pass
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
cmap_paths = (
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),
)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths:
path = os.path.join(directory, filename)
if os.path.exists(path):
@ -306,11 +306,12 @@ class CMapParser(PSStackParser):
elif name == 'endcmap':
self._in_cmap = False
return
if not self._in_cmap: return
if not self._in_cmap:
return
#
if name == 'def':
try:
((_,k),(_,v)) = self.pop(2)
((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError:
pass
@ -318,7 +319,7 @@ class CMapParser(PSStackParser):
if name == 'usecmap':
try:
((_,cmapname),) = self.pop(1)
((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
@ -337,13 +338,15 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
if sprefix != eprefix:
continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
@ -351,7 +354,7 @@ class CMapParser(PSStackParser):
vlen = len(svar)
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+struct.pack('>L',s1+i)[-vlen:]
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
return
@ -359,8 +362,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
return
@ -369,10 +372,11 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
len(s) != len(e)):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
@ -385,7 +389,7 @@ class CMapParser(PSStackParser):
prefix = code[:-4]
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+struct.pack('>L',base+i)[-vlen:]
x = prefix+struct.pack('>L', base+i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x)
return
@ -393,8 +397,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code)
return
@ -409,6 +413,7 @@ class CMapParser(PSStackParser):
self.push((pos, token))
return
# test
def main(argv):
args = argv[1:]
@ -421,4 +426,5 @@ def main(argv):
cmap.dump()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -21,9 +21,9 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return
def begin_page(self, page, ctm):
(x0,y0,x1,y1) = page.mediabox
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
(x0, y0, x1, y1) = page.mediabox
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
self.cur_item = LTPage(self.pageno, mediabox)
return
@ -61,26 +61,26 @@ class PDFLayoutAnalyzer(PDFTextDevice):
shape = ''.join(x[0] for x in path)
if shape == 'ml':
# horizontal/vertical line
(_,x0,y0) = path[0]
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(_, x0, y0) = path[0]
(_, x1, y1) = path[1]
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1:
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
return
if shape == 'mlllh':
# rectangle
(_,x0,y0) = path[0]
(_,x1,y1) = path[1]
(_,x2,y2) = path[2]
(_,x3,y3) = path[3]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
(_, x0, y0) = path[0]
(_, x1, y1) = path[1]
(_, x2, y2) = path[2]
(_, x3, y3) = path[3]
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
return
# other shapes
pts = []
@ -176,7 +176,8 @@ class TextConverter(PDFConverter):
# is text. This stops all the image and drawing ouput from being
# recorded and taking up RAM.
def render_image(self, name, stream):
if self.imagewriter is None: return
if self.imagewriter is None:
return
PDFConverter.render_image(self, name, stream)
return
@ -206,8 +207,8 @@ class HTMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'},
text_colors={'char':'black'}):
rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char': 'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.scale = scale
self.fontscale = fontscale
@ -238,7 +239,7 @@ class HTMLConverter(PDFConverter):
def write_footer(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
self.write('</body></html>\n')
return
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
for child in item:
show_group(child)
return
def render(item):
if isinstance(item, LTPage):
self._yoffset += item.y1
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
show_group(child)
self.outfp.write('</textgroup>\n')
return
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %

View File

@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
from latin_enc import ENCODING
STRIP_NAME = re.compile(r'[0-9]+')
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode:
return glyphname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
if not m:
raise KeyError(name)
return unichr(int(m.group(0)))
@ -26,12 +29,16 @@ class EncodingDB(object):
mac2unicode = {}
win2unicode = {}
pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING:
for (name, std, mac, win, pdf) in ENCODING:
c = name2unicode(name)
if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c
if std:
std2unicode[std] = c
if mac:
mac2unicode[mac] = c
if win:
win2unicode[win] = c
if pdf:
pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,

View File

@ -7,9 +7,11 @@ import os, os.path
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
def align32(x):
return ((x+3)/4)*4
## BMPWriter
##
class BMPWriter(object):
@ -38,12 +40,12 @@ class BMPWriter(object):
self.fp.write(info)
if ncols == 2:
# B&W color table
for i in (0,255):
self.fp.write(struct.pack('BBBx', i,i,i))
for i in (0, 255):
self.fp.write(struct.pack('BBBx', i, i, i))
elif ncols == 256:
# grayscale color table
for i in xrange(256):
self.fp.write(struct.pack('BBBx', i,i,i))
self.fp.write(struct.pack('BBBx', i, i, i))
self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize
return

View File

@ -82,7 +82,7 @@ class LTComponent(LTItem):
return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))
def set_bbox(self, (x0,y0,x1,y1)):
def set_bbox(self, (x0, y0, x1, y1)):
self.x0 = x0
self.y0 = y0
self.x1 = x1
@ -143,7 +143,7 @@ class LTCurve(LTComponent):
return
def get_pts(self):
return ','.join( '%.3f,%.3f' % p for p in self.pts )
return ','.join('%.3f,%.3f' % p for p in self.pts)
## LTLine
@ -159,8 +159,8 @@ class LTLine(LTCurve):
##
class LTRect(LTCurve):
def __init__(self, linewidth, (x0,y0,x1,y1)):
LTCurve.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
def __init__(self, linewidth, (x0, y0, x1, y1)):
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
return
@ -213,7 +213,7 @@ class LTChar(LTComponent, LTText):
if font.is_vertical():
# vertical
width = font.get_width() * fontsize
(vx,vy) = textdisp
(vx, vy) = textdisp
if vx is None:
vx = width/2
else:
@ -230,15 +230,15 @@ class LTChar(LTComponent, LTText):
ty = descent + rise
bll = (0, ty)
bur = (self.adv, ty+height)
(a,b,c,d,e,f) = self.matrix
(a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
(x0,y0) = apply_matrix_pt(self.matrix, bll)
(x1,y1) = apply_matrix_pt(self.matrix, bur)
(x0, y0) = apply_matrix_pt(self.matrix, bll)
(x1, y1) = apply_matrix_pt(self.matrix, bur)
if x1 < x0:
(x0,x1) = (x1,x0)
(x0, x1) = (x1, x0)
if y1 < y0:
(y0,y1) = (y1,y0)
LTComponent.__init__(self, (x0,y0,x1,y1))
(y0, y1) = (y1, y0)
LTComponent.__init__(self, (x0, y0, x1, y1))
if font.is_vertical():
self.size = self.width
else:
@ -294,7 +294,7 @@ class LTContainer(LTComponent):
class LTExpandableContainer(LTContainer):
def __init__(self):
LTContainer.__init__(self, (+INF,+INF,-INF,-INF))
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
return
def add(self, obj):
@ -314,7 +314,7 @@ class LTTextContainer(LTExpandableContainer, LTText):
return
def get_text(self):
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
## LTTextLine
@ -339,6 +339,7 @@ class LTTextLine(LTTextContainer):
def find_neighbors(self, plane, ratio):
raise NotImplementedError
class LTTextLineHorizontal(LTTextLine):
def __init__(self, word_margin):
@ -358,11 +359,12 @@ class LTTextLineHorizontal(LTTextLine):
def find_neighbors(self, plane, ratio):
d = ratio*self.height
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
return [ obj for obj in objs
return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d)) ]
abs(obj.x1-self.x1) < d))]
class LTTextLineVertical(LTTextLine):
@ -383,11 +385,11 @@ class LTTextLineVertical(LTTextLine):
def find_neighbors(self, plane, ratio):
d = ratio*self.width
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
return [ obj for obj in objs
return [obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and
abs(obj.width-self.width) < d and
(abs(obj.y0-self.y0) < d or
abs(obj.y1-self.y1) < d)) ]
abs(obj.y1-self.y1) < d))]
## LTTextBox
@ -407,6 +409,7 @@ class LTTextBox(LTTextContainer):
(self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text()))
class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams):
@ -417,6 +420,7 @@ class LTTextBoxHorizontal(LTTextBox):
def get_writing_mode(self):
return 'lr-tb'
class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams):
@ -437,6 +441,7 @@ class LTTextGroup(LTTextContainer):
self.extend(objs)
return
class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams):
@ -447,6 +452,7 @@ class LTTextGroupLRTB(LTTextGroup):
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
return
class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams):
@ -454,7 +460,7 @@ class LTTextGroupTBRL(LTTextGroup):
# reorder the objects from top-right to bottom-left.
self._objs = csort(self._objs, key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
-(1-laparams.boxes_flow)*(obj.y1))
- (1-laparams.boxes_flow)*(obj.y1))
return
@ -506,8 +512,8 @@ class LTLayoutContainer(LTContainer):
# |<-->|
# (line_overlap)
k |= 2
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical)) ):
if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical))):
line.add(obj1)
elif line is not None:
yield line
@ -555,7 +561,8 @@ class LTLayoutContainer(LTContainer):
done = set()
for line in lines:
box = boxes[line]
if box in done: continue
if box in done:
continue
done.add(box)
if not box.is_empty():
yield box
@ -563,32 +570,34 @@ class LTLayoutContainer(LTContainer):
def group_textboxes(self, laparams, boxes):
assert boxes
def dist(obj1, obj2):
"""A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2.
Return its area less the areas of obj1 and obj2,
shown as 'www' below. This value may be negative.
+------+..........+ (x1,y1)
+------+..........+ (x1, y1)
| obj1 |wwwwwwwwww:
+------+www+------+
:wwwwwwwwww| obj2 |
(x0,y0) +..........+------+
(x0, y0) +..........+------+
"""
x0 = min(obj1.x0,obj2.x0)
y0 = min(obj1.y0,obj2.y0)
x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1)
x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2.
"""
x0 = min(obj1.x0,obj2.x0)
y0 = min(obj1.y0,obj2.y0)
x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1)
objs = set(plane.find((x0,y0,x1,y1)))
return objs.difference((obj1,obj2))
x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))
# XXX this still takes O(n^2) :(
dists = []
for i in xrange(len(boxes)):
@ -600,23 +609,23 @@ class LTLayoutContainer(LTContainer):
plane = Plane(self.bbox)
plane.extend(boxes)
while dists:
(c,d,obj1,obj2) = dists.pop(0)
(c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2):
dists.append((1,d,obj1,obj2))
dists.append((1, d, obj1, obj2))
continue
if (isinstance(obj1, LTTextBoxVertical) or
isinstance(obj1, LTTextGroupTBRL) or
isinstance(obj2, LTTextBoxVertical) or
isinstance(obj2, LTTextGroupTBRL)):
group = LTTextGroupTBRL([obj1,obj2])
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1,obj2])
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
# this line is optimized -- don't change without profiling
dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ]
dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
for other in plane:
dists.append((0, dist(group,other), group, other))
dists.append((0, dist(group, other), group, other))
dists.sort()
plane.add(group)
assert len(plane) == 1
@ -628,21 +637,22 @@ class LTLayoutContainer(LTContainer):
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
for obj in otherobjs:
obj.analyze(laparams)
if not textobjs: return
if not textobjs:
return
textlines = list(self.get_textlines(laparams, textobjs))
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.get_textboxes(laparams, textlines))
assert len(textlines) == sum( len(box._objs) for box in textboxes )
assert len(textlines) == sum(len(box._objs) for box in textboxes)
if textboxes:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box:box.index)
textboxes.sort(key=lambda box: box.index)
self._objs = textboxes + otherobjs + empties
return
@ -654,9 +664,9 @@ class LTFigure(LTLayoutContainer):
def __init__(self, name, bbox, matrix):
self.name = name
self.matrix = matrix
(x,y,w,h) = bbox
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
(x, y, w, h) = bbox
bbox = get_bound(apply_matrix_pt(matrix, (p, q))
for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
LTLayoutContainer.__init__(self, bbox)
return
@ -666,7 +676,8 @@ class LTFigure(LTLayoutContainer):
bbox2str(self.bbox), matrix2str(self.matrix)))
def analyze(self, laparams):
if not laparams.all_texts: return
if not laparams.all_texts:
return
LTLayoutContainer.analyze(self, laparams)
return

View File

@ -34,17 +34,18 @@ class LZWDecoder(object):
# |-----8-bits-----|
# |-bpos-|-bits-| |
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
self.bpos += bits
break
else:
# |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
v = (v << r) | (self.buff & ((1 << r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
if not x:
raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
@ -52,7 +53,7 @@ class LZWDecoder(object):
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table = [chr(c) for c in xrange(256)] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
@ -97,6 +98,7 @@ class LZWDecoder(object):
(self.nbits, code, x, self.table[258:]))
return
# lzwdecode
def lzwdecode(data):
"""

View File

@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
class PDFColorSpace(object):
def __init__(self, name, ncomponents):
@ -20,7 +21,7 @@ class PDFColorSpace(object):
PREDEFINED_COLORSPACE = dict(
(name, PDFColorSpace(name,n)) for (name,n) in {
(name, PDFColorSpace(name, n)) for (name, n) in {
'CalRGB': 3,
'CalGray': 1,
'Lab': 3,

View File

@ -28,24 +28,31 @@ class PDFDevice(object):
def begin_tag(self, tag, props=None):
return
def end_tag(self):
return
def do_tag(self, tag, props=None):
return
def begin_page(self, page, ctm):
return
def end_page(self, page):
return
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_image(self, name, stream):
return
def render_string(self, textstate, seq):
return
@ -75,7 +82,7 @@ class PDFTextDevice(PDFDevice):
scaling, charspace, wordspace, rise, dxscale)
return
def render_string_horizontal(self, seq, matrix, (x,y),
def render_string_horizontal(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
@ -86,14 +93,14 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj):
if needcharspace:
x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)),
x += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
return (x, y)
def render_string_vertical(self, seq, matrix, (x,y),
def render_string_vertical(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
@ -104,7 +111,7 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj):
if needcharspace:
y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)),
y += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
y += wordspace
@ -132,7 +139,8 @@ class TagExtractor(PDFDevice):
font = textstate.font
text = ''
for obj in seq:
if not isinstance(obj, str): continue
if not isinstance(obj, str):
continue
chars = font.decode(obj)
for cid in chars:
try:
@ -156,8 +164,8 @@ class TagExtractor(PDFDevice):
def begin_tag(self, tag, props=None):
s = ''
if isinstance(props, dict):
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) )
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
in sorted(props.iteritems()))
self.outfp.write('<%s%s>' % (enc(tag.name), s))
self._stack.append(tag)
return

View File

@ -23,11 +23,24 @@ from utils import decode_text
## Exceptions
##
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFNoValidXRef(PDFSyntaxError):
pass
class PDFNoOutlines(PDFException):
pass
class PDFDestinationNotFound(PDFException):
pass
class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
while 1:
try:
(pos, line) = parser.nextline()
if not line.strip(): continue
if not line.strip():
continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
if use != 'n':
continue
self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets
@ -100,16 +115,17 @@ class PDFXRef(PDFBaseXRef):
return
KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser):
try:
(_,kwd) = parser.nexttoken()
(_, kwd) = parser.nexttoken()
assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject()
(_, dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0]
(_, dic) = x[0]
self.trailer.update(dict_value(dic))
return
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser, debug=0):
parser.seek(0)
while 1:
@ -148,14 +165,15 @@ class PDFXRefFallback(PDFXRef):
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
if not m:
continue
(objid, genno) = m.groups()
objid = int(objid)
genno = int(genno)
self.offsets[objid] = (None, pos, genno)
# expand ObjStm.
parser.seek(pos)
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
stream = stream_value(obj)
try:
@ -168,7 +186,7 @@ class PDFXRefFallback(PDFXRef):
objs = []
try:
while 1:
(_,obj) = parser1.nextobject()
(_, obj) = parser1.nextobject()
objs.append(obj)
except PSEOF:
pass
@ -193,14 +211,14 @@ class PDFXRefStream(PDFBaseXRef):
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
(_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (1,size))
index_array = stream.get('Index', (1, size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.ranges.extend(choplist(2, index_array))
@ -218,14 +236,14 @@ class PDFXRefStream(PDFBaseXRef):
return self.trailer
def get_objids(self):
for (start,nobjs) in self.ranges:
for (start, nobjs) in self.ranges:
for i in xrange(nobjs):
yield start+i
return
def get_pos(self, objid):
index = 0
for (start,nobjs) in self.ranges:
for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs:
index += objid - start
else:
@ -292,7 +310,8 @@ class PDFDocument(object):
self.xrefs.append(xref)
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer: continue
if not trailer:
continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
@ -316,6 +335,7 @@ class PDFDocument(object):
# This step is mandatory even if there's no password associated
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
@ -358,8 +378,8 @@ class PDFDocument(object):
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1):
k = ''.join( chr(ord(c) ^ i) for c in key )
for i in xrange(1, 19+1):
k = ''.join(chr(ord(c) ^ i) for c in key)
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
if R == 2:
@ -373,18 +393,18 @@ class PDFDocument(object):
return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key),16)]
key = hash.digest()[:min(len(key), 16)]
return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs:
(objs,n) = self._parsed_objs[stream.objid]
(objs, n) = self._parsed_objs[stream.objid]
else:
(objs,n) = self._get_objects(stream)
(objs, n) = self._get_objects(stream)
if self.caching:
self._parsed_objs[stream.objid] = (objs,n)
self._parsed_objs[stream.objid] = (objs, n)
i = n*2+index
try:
obj = objs[i]
@ -407,23 +427,24 @@ class PDFDocument(object):
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
return (objs, n)
KEYWORD_OBJ = KWD('obj')
def _getobj_parse(self, pos, objid):
self._parser.seek(pos)
(_,objid1) = self._parser.nexttoken() # objid
(_, objid1) = self._parser.nexttoken() # objid
if objid1 != objid:
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
(_, genno) = self._parser.nexttoken() # genno
(_, kwd) = self._parser.nexttoken()
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
(_,obj) = self._parser.nextobject()
(_, obj) = self._parser.nextobject()
return obj
# can raise PDFObjectNotFound
@ -465,6 +486,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
@ -487,13 +509,15 @@ class PDFDocument(object):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat,key))
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1,k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None
(k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key:
return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
@ -501,8 +525,9 @@ class PDFDocument(object):
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v: return v
raise KeyError((cat,key))
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
def get_dest(self, name):
@ -528,7 +553,8 @@ class PDFDocument(object):
line = line.strip()
if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line == 'startxref':
break
if line:
prev = line
else:

View File

@ -25,13 +25,13 @@ def get_widths(seq):
if isinstance(v, list):
if r:
char1 = r[-1]
for (i,w) in enumerate(v):
for (i, w) in enumerate(v):
widths[char1+i] = w
r = []
elif isinstance(v, int):
r.append(v)
if len(r) == 3:
(char1,char2,w) = r
(char1, char2, w) = r
for i in xrange(char1, char2+1):
widths[i] = w
r = []
@ -40,6 +40,7 @@ def get_widths(seq):
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
def get_widths2(seq):
widths = {}
r = []
@ -47,20 +48,20 @@ def get_widths2(seq):
if isinstance(v, list):
if r:
char1 = r[-1]
for (i,(w,vx,vy)) in enumerate(choplist(3,v)):
widths[char1+i] = (w,(vx,vy))
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
widths[char1+i] = (w, (vx, vy))
r = []
elif isinstance(v, int):
r.append(v)
if len(r) == 5:
(char1,char2,w,vx,vy) = r
(char1, char2, w, vx, vy) = r
for i in xrange(char1, char2+1):
widths[i] = (w,(vx,vy))
widths[i] = (w, (vx, vy))
r = []
return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
## FontMetricsDB
@ -94,7 +95,7 @@ class Type1FontHeaderParser(PSStackParser):
def get_encoding(self):
while 1:
try:
(cid,name) = self.nextobject()
(cid, name) = self.nextobject()
except PSEOF:
break
try:
@ -105,25 +106,28 @@ class Type1FontHeaderParser(PSStackParser):
def do_keyword(self, pos, token):
if token is self.KEYWORD_PUT:
((_,key),(_,value)) = self.pop(2)
((_, key), (_, value)) = self.pop(2)
if (isinstance(key, int) and
isinstance(value, PSLiteral)):
self.add_results((key, literal_name(value)))
return
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
def getdict(data):
d = {}
fp = StringIO(data)
stack = []
while 1:
c = fp.read(1)
if not c: break
if not c:
break
b0 = ord(c)
if b0 <= 21:
d[b0] = stack
@ -145,19 +149,21 @@ def getdict(data):
else:
b1 = ord(fp.read(1))
if 247 <= b0 and b0 <= 250:
value = ((b0-247)<<8)+b1+108
value = ((b0-247) << 8)+b1+108
elif 251 <= b0 and b0 <= 254:
value = -((b0-251)<<8)-b1-108
value = -((b0-251) << 8)-b1-108
else:
b2 = ord(fp.read(1))
if 128 <= b1: b1 -= 256
if 128 <= b1:
b1 -= 256
if b0 == 28:
value = b1<<8 | b2
value = b1 << 8 | b2
else:
value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
stack.append(value)
return d
class CFFFont(object):
STANDARD_STRINGS = (
@ -264,13 +270,13 @@ class CFFFont(object):
return self.fp.read(self.offsets[i+1]-self.offsets[i])
def __iter__(self):
return iter( self[i] for i in xrange(len(self)) )
return iter(self[i] for i in xrange(len(self)))
def __init__(self, name, fp):
self.name = name
self.fp = fp
# Header
(_major,_minor,hdrsize,offsize) = struct.unpack('BBBB', self.fp.read(4))
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
self.fp.read(hdrsize-4)
# Name INDEX
self.name_index = self.INDEX(self.fp)
@ -297,7 +303,7 @@ class CFFFont(object):
if format == '\x00':
# Format 0
(n,) = struct.unpack('B', self.fp.read(1))
for (code,gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
self.code2gid[code] = gid
self.gid2code[gid] = code
elif format == '\x01':
@ -305,8 +311,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1))
code = 0
for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1):
(first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1):
self.code2gid[code] = gid
self.gid2code[gid] = code
code += 1
@ -320,7 +326,7 @@ class CFFFont(object):
if format == '\x00':
# Format 0
n = self.nglyphs-1
for (gid,sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
gid += 1
name = self.getstr(sid)
self.name2gid[name] = gid
@ -330,8 +336,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1))
sid = 0
for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1):
(first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1):
name = self.getstr(sid)
self.name2gid[name] = gid
self.gid2name[gid] = name
@ -356,7 +362,8 @@ class CFFFont(object):
##
class TrueTypeFont(object):
class CMapNotFound(Exception): pass
class CMapNotFound(Exception):
pass
def __init__(self, name, fp):
self.name = name
@ -389,15 +396,16 @@ class TrueTypeFont(object):
elif fmttype == 2:
subheaderkeys = struct.unpack('>256H', fp.read(512))
firstbytes = [0]*8192
for (i,k) in enumerate(subheaderkeys):
for (i, k) in enumerate(subheaderkeys):
firstbytes[k/8] = i
nhdrs = max(subheaderkeys)/8 + 1
hdrs = []
for i in xrange(nhdrs):
(firstcode,entcount,delta,offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
for (i,firstcode,entcount,delta,pos) in hdrs:
if not entcount: continue
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
for (i, firstcode, entcount, delta, pos) in hdrs:
if not entcount:
continue
first = firstcode + (firstbytes[i] << 8)
fp.seek(pos)
for c in xrange(entcount):
@ -414,7 +422,7 @@ class TrueTypeFont(object):
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
pos = fp.tell()
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
if idr:
fp.seek(pos+idr)
for c in xrange(sc, ec+1):
@ -426,16 +434,19 @@ class TrueTypeFont(object):
assert 0
# create unicode map
unicode_map = FileUnicodeMap()
for (char,gid) in char2gid.iteritems():
for (char, gid) in char2gid.iteritems():
unicode_map.add_cid2unichr(gid, char)
return unicode_map
## Fonts
##
class PDFFontError(PDFException):
pass
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass
class PDFUnicodeNotDefined(PDFFontError):
pass
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
@ -456,7 +467,7 @@ class PDFFont(object):
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
self.hscale = self.vscale = .001
return
@ -474,6 +485,7 @@ class PDFFont(object):
def get_ascent(self):
return self.ascent * self.vscale
def get_descent(self):
return self.descent * self.vscale
@ -482,6 +494,7 @@ class PDFFont(object):
if w == 0:
w = -self.default_width
return w * self.hscale
def get_height(self):
h = self.bbox[3]-self.bbox[1]
if h == 0:
@ -501,7 +514,7 @@ class PDFFont(object):
return 0
def string_width(self, s):
return sum( self.char_width(cid) for cid in self.decode(s) )
return sum(self.char_width(cid) for cid in self.decode(s))
# PDFSimpleFont
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
except KeyError:
raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
@ -557,7 +571,7 @@ class PDFType1Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 255))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
PDFSimpleFont.__init__(self, descriptor, widths, spec)
if 'Encoding' not in spec and 'FontFile' in descriptor:
# try to recover the missing encoding info from the font file.
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
@ -584,16 +600,16 @@ class PDFType3Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
descriptor = {'Ascent': 0, 'Descent': 0,
'FontBBox': spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec)
self.matrix = tuple(list_value(spec.get('FontMatrix')))
(_,self.descent,_,self.ascent) = self.bbox
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
(_, self.descent, _, self.ascent) = self.bbox
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
return
def __repr__(self):
@ -657,10 +673,10 @@ class PDFCIDFont(PDFFont):
if self.vertical:
# writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', [])))
self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() )
(vy,w) = spec.get('DW2', [880, -1000])
self.default_disp = (None,vy)
widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() )
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
(vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None, vy)
widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
default_width = w
else:
# writing mode: horizontal
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
def to_unichr(self, cid):
try:
if not self.unicode_map: raise KeyError(cid)
if not self.unicode_map:
raise KeyError(cid)
return self.unicode_map.get_unichr(cid)
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
@ -705,4 +722,5 @@ def main(argv):
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -30,8 +30,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
## Exceptions
##
class PDFResourceError(PDFException): pass
class PDFInterpreterError(PDFException): pass
class PDFResourceError(PDFException):
pass
class PDFInterpreterError(PDFException):
pass
## Constants
@ -120,6 +124,7 @@ class PDFGraphicState(object):
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness))
## Resource Manager
##
class PDFResourceManager(object):
@ -152,7 +157,8 @@ class PDFResourceManager(object):
try:
return CMapDB.get_cmap(cmapname)
except CMapDB.CMapNotFound:
if strict: raise
if strict:
raise
return CMap()
def get_font(self, objid, spec):
@ -227,12 +233,14 @@ class PDFContentParser(PSStackParser):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break
if self.buf:
break
self.fp = None
self.charpos = 0
return
@ -274,6 +282,7 @@ class PDFContentParser(PSStackParser):
KEYWORD_BI = KWD('BI')
KEYWORD_ID = KWD('ID')
KEYWORD_EI = KWD('EI')
def do_keyword(self, pos, token):
if token is self.KEYWORD_BI:
# inline image within a content stream
@ -283,13 +292,14 @@ class PDFContentParser(PSStackParser):
(_, objs) = self.end_type('inline')
if len(objs) % 2 != 0:
raise PSTypeError('Invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
(pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
self.push((pos, token))
return
@ -316,7 +326,9 @@ class PDFPageInterpreter(object):
self.fontmap = {}
self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy()
if not resources: return
if not resources:
return
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
@ -328,23 +340,23 @@ class PDFPageInterpreter(object):
return PDFColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE.get(name)
for (k,v) in dict_value(resources).iteritems():
for (k, v) in dict_value(resources).iteritems():
if 2 <= self.debug:
print >>sys.stderr, 'Resource: %r: %r' % (k,v)
print >>sys.stderr, 'Resource: %r: %r' % (k, v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
for (fontid, spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
for (csid, spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
for (xobjid, xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
return
@ -371,7 +383,8 @@ class PDFPageInterpreter(object):
return
def pop(self, n):
if n == 0: return []
if n == 0:
return []
x = self.argstack[-n:]
self.argstack = self.argstack[:-n]
return x
@ -388,6 +401,7 @@ class PDFPageInterpreter(object):
def do_q(self):
self.gstack.append(self.get_current_state())
return
# grestore
def do_Q(self):
if self.gstack:
@ -396,7 +410,7 @@ class PDFPageInterpreter(object):
# concat-matrix
def do_cm(self, a1, b1, c1, d1, e1, f1):
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
self.device.set_ctm(self.ctm)
return
@ -404,30 +418,37 @@ class PDFPageInterpreter(object):
def do_w(self, linewidth):
self.graphicstate.linewidth = linewidth
return
# setlinecap
def do_J(self, linecap):
self.graphicstate.linecap = linecap
return
# setlinejoin
def do_j(self, linejoin):
self.graphicstate.linejoin = linejoin
return
# setmiterlimit
def do_M(self, miterlimit):
self.graphicstate.miterlimit = miterlimit
return
# setdash
def do_d(self, dash, phase):
self.graphicstate.dash = (dash, phase)
return
# setintent
def do_ri(self, intent):
self.graphicstate.intent = intent
return
# setflatness
def do_i(self, flatness):
self.graphicstate.flatness = flatness
return
# load-gstate
def do_gs(self, name):
#XXX
@ -435,34 +456,40 @@ class PDFPageInterpreter(object):
# moveto
def do_m(self, x, y):
self.curpath.append(('m',x,y))
self.curpath.append(('m', x, y))
return
# lineto
def do_l(self, x, y):
self.curpath.append(('l',x,y))
self.curpath.append(('l', x, y))
return
# curveto
def do_c(self, x1, y1, x2, y2, x3, y3):
self.curpath.append(('c',x1,y1,x2,y2,x3,y3))
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
return
# urveto
def do_v(self, x2, y2, x3, y3):
self.curpath.append(('v',x2,y2,x3,y3))
self.curpath.append(('v', x2, y2, x3, y3))
return
# rveto
def do_y(self, x1, y1, x3, y3):
self.curpath.append(('y',x1,y1,x3,y3))
self.curpath.append(('y', x1, y1, x3, y3))
return
# closepath
def do_h(self):
self.curpath.append(('h',))
return
# rectangle
def do_re(self, x, y, w, h):
self.curpath.append(('m',x,y))
self.curpath.append(('l',x+w,y))
self.curpath.append(('l',x+w,y+h))
self.curpath.append(('l',x,y+h))
self.curpath.append(('m', x, y))
self.curpath.append(('l', x+w, y))
self.curpath.append(('l', x+w, y+h))
self.curpath.append(('l', x, y+h))
self.curpath.append(('h',))
return
@ -471,11 +498,13 @@ class PDFPageInterpreter(object):
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
return
# close-and-stroke
def do_s(self):
self.do_h()
self.do_S()
return
# fill
def do_f(self):
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
@ -483,68 +512,85 @@ class PDFPageInterpreter(object):
return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = []
return
# fill-and-stroke
def do_B(self):
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = []
return
# fill-and-stroke-even-odd
def do_B_a(self):
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
return
# close-fill-and-stroke
def do_b(self):
self.do_h()
self.do_B()
return
# close-fill-and-stroke-even-odd
def do_b_a(self):
self.do_h()
self.do_B_a()
return
# close-only
def do_n(self):
self.curpath = []
return
# clip
def do_W(self): return
def do_W(self):
return
# clip-even-odd
def do_W_a(self): return
def do_W_a(self):
return
# setcolorspace-stroking
def do_CS(self, name):
self.scs = self.csmap[literal_name(name)]
return
# setcolorspace-non-strokine
def do_cs(self, name):
self.ncs = self.csmap[literal_name(name)]
return
# setgray-stroking
def do_G(self, gray):
#self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
#self.do_cs(LITERAL_DEVICE_GRAY)
return
# setrgb-stroking
def do_RG(self, r, g, b):
#self.do_CS(LITERAL_DEVICE_RGB)
return
# setrgb-non-stroking
def do_rg(self, r, g, b):
#self.do_cs(LITERAL_DEVICE_RGB)
return
# setcmyk-stroking
def do_K(self, c, m, y, k):
#self.do_CS(LITERAL_DEVICE_CMYK)
return
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
#self.do_cs(LITERAL_DEVICE_CMYK)
@ -560,6 +606,7 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_scn(self):
if self.ncs:
n = self.ncs.ncomponents
@ -569,42 +616,53 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_SC(self):
self.do_SCN()
return
def do_sc(self):
self.do_scn()
return
# sharing-name
def do_sh(self, name): return
def do_sh(self, name):
return
# begin-text
def do_BT(self):
self.textstate.reset()
return
# end-text
def do_ET(self):
return
# begin-compat
def do_BX(self): return
def do_BX(self):
return
# end-compat
def do_EX(self): return
def do_EX(self):
return
# marked content operators
def do_MP(self, tag):
self.device.do_tag(tag)
return
def do_DP(self, tag, props):
self.device.do_tag(tag, props)
return
def do_BMC(self, tag):
self.device.begin_tag(tag)
return
def do_BDC(self, tag, props):
self.device.begin_tag(tag, props)
return
def do_EMC(self):
self.device.end_tag()
return
@ -613,18 +671,22 @@ class PDFPageInterpreter(object):
def do_Tc(self, space):
self.textstate.charspace = space
return
# setwordspace
def do_Tw(self, space):
self.textstate.wordspace = space
return
# textscale
def do_Tz(self, scale):
self.textstate.scaling = scale
return
# setleading
def do_TL(self, leading):
self.textstate.leading = -leading
return
# selectfont
def do_Tf(self, fontid, fontsize):
try:
@ -635,10 +697,12 @@ class PDFPageInterpreter(object):
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize
return
# setrendering
def do_Tr(self, render):
self.textstate.render = render
return
# settextrise
def do_Ts(self, rise):
self.textstate.rise = rise
@ -646,49 +710,55 @@ class PDFPageInterpreter(object):
# text-move
def do_Td(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
return
# text-move
def do_TD(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.leading = ty
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
return
# textmatrix
def do_Tm(self, a,b,c,d,e,f):
self.textstate.matrix = (a,b,c,d,e,f)
def do_Tm(self, a, b, c, d, e, f):
self.textstate.matrix = (a, b, c, d, e, f)
self.textstate.linematrix = (0, 0)
return
# nextline
def do_T_a(self):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
self.textstate.linematrix = (0, 0)
return
# show-pos
def do_TJ(self, seq):
#print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate)
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
if self.textstate.font is None:
if STRICT:
raise PDFInterpreterError('No font specified!')
return
self.device.render_string(self.textstate, seq)
return
# show
def do_Tj(self, s):
self.do_TJ([s])
return
# quote
def do__q(self, s):
self.do_T_a()
self.do_TJ([s])
return
# doublequote
def do__w(self, aw, ac, s):
self.do_Tw(aw)
@ -699,12 +769,14 @@ class PDFPageInterpreter(object):
# inline image
def do_BI(self): # never called
return
def do_ID(self): # never called
return
def do_EI(self, obj):
if 'W' in obj and 'H' in obj:
iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY)
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj)
self.device.end_figure(iobjid)
return
@ -733,7 +805,7 @@ class PDFPageInterpreter(object):
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid)
else:
@ -744,15 +816,15 @@ class PDFPageInterpreter(object):
def process_page(self, page):
if 1 <= self.debug:
print >>sys.stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
(x0, y0, x1, y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
ctm = (0, -1, 1, 0, -y0, x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
ctm = (-1, 0, 0, -1, x1, y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
ctm = (0, 1, -1, 0, y1, -x0)
else:
ctm = (1,0,0,1, -x0,-y0)
ctm = (1, 0, 0, 1, -x0, -y0)
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
@ -778,12 +850,12 @@ class PDFPageInterpreter(object):
return
while 1:
try:
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
except PSEOF:
break
if isinstance(obj, PSKeyword):
name = keyword_name(obj)
method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q')
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
if hasattr(self, method):
func = getattr(self, method)
nargs = func.func_code.co_argcount-1

View File

@ -63,7 +63,7 @@ class PDFPage(object):
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
contents = [contents]
self.contents = contents
return
@ -71,6 +71,7 @@ class PDFPage(object):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod
def create_pages(klass, document, debug=0):
def search(obj, parent):
@ -80,7 +81,7 @@ class PDFPage(object):
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
for (k, v) in parent.iteritems():
if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
@ -95,7 +96,7 @@ class PDFPage(object):
yield (objid, tree)
pages = False
if 'Pages' in document.catalog:
for (objid,tree) in search(document.catalog['Pages'], document.catalog):
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
yield klass(document, objid, tree)
pages = True
if not pages:
@ -110,7 +111,8 @@ class PDFPage(object):
pass
return
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
@classmethod
def get_pages(klass, fp,
@ -127,8 +129,10 @@ class PDFPage(object):
if check_extractable and not doc.is_extractable:
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document.
for (pageno,page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos): continue
for (pageno, page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos):
continue
yield page
if maxpages and maxpages <= pageno+1: break
if maxpages and maxpages <= pageno+1:
break
return

View File

@ -15,7 +15,8 @@ from pdftypes import str_value, list_value, dict_value, stream_value
## Exceptions
##
class PDFSyntaxError(PDFException): pass
class PDFSyntaxError(PDFException):
pass
## PDFParser
@ -55,6 +56,7 @@ class PDFParser(PSStackParser):
KEYWORD_STREAM = KWD('stream')
KEYWORD_XREF = KWD('xref')
KEYWORD_STARTXREF = KWD('startxref')
def do_keyword(self, pos, token):
"""Handles PDF-related keywords."""
@ -71,7 +73,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_R:
# reference to indirect object
try:
((_,objid), (_,genno)) = self.pop(2)
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
@ -80,7 +82,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_STREAM:
# stream object
((_,dic),) = self.pop(1)
((_, dic),) = self.pop(1)
dic = dict_value(dic)
objlen = 0
if not self.fallback:
@ -153,7 +155,7 @@ class PDFStreamParser(PDFParser):
if token is self.KEYWORD_R:
# reference to indirect object
try:
((_,objid), (_,genno)) = self.pop(2)
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))

View File

@ -23,13 +23,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects
##
class PDFObject(PSObject): pass
class PDFObject(PSObject):
pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
class PDFException(PSException):
pass
class PDFTypeError(PDFException):
pass
class PDFValueError(PDFException):
pass
class PDFObjectNotFound(PDFException):
pass
class PDFNotImplementedError(PDFException):
pass
## PDFObjRef
@ -66,6 +81,7 @@ def resolve1(x, default=None):
x = x.resolve(default=default)
return x
def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals.
@ -75,24 +91,26 @@ def resolve_all(x, default=None):
while isinstance(x, PDFObjRef):
x = x.resolve(default=default)
if isinstance(x, list):
x = [ resolve_all(v, default=default) for v in x ]
x = [resolve_all(v, default=default) for v in x]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
for (k, v) in x.iteritems():
x[k] = resolve_all(v, default=default)
return x
def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object.
"""
if isinstance(x, str):
return decipher(objid, genno, x)
if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
x = [decipher_all(decipher, objid, genno, v) for v in x]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
for (k, v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v)
return x
# Type cheking
def int_value(x):
x = resolve1(x)
@ -102,6 +120,7 @@ def int_value(x):
return 0
return x
def float_value(x):
x = resolve1(x)
if not isinstance(x, float):
@ -110,6 +129,7 @@ def float_value(x):
return 0.0
return x
def num_value(x):
x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)):
@ -118,6 +138,7 @@ def num_value(x):
return 0
return x
def str_value(x):
x = resolve1(x)
if not isinstance(x, str):
@ -126,6 +147,7 @@ def str_value(x):
return ''
return x
def list_value(x):
x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)):
@ -134,6 +156,7 @@ def list_value(x):
return []
return x
def dict_value(x):
x = resolve1(x)
if not isinstance(x, dict):
@ -142,6 +165,7 @@ def dict_value(x):
return {}
return x
def stream_value(x):
x = resolve1(x)
if not isinstance(x, PDFStream):
@ -195,12 +219,14 @@ class PDFStream(PDFObject):
def get_filters(self):
filters = self.get_any(('F', 'Filter'))
if not filters: return []
if isinstance(filters, list): return filters
return [ filters ]
if not filters:
return []
if isinstance(filters, list):
return filters
return [filters]
def decode(self):
assert self.data is None and self.rawdata != None
assert self.data is None and self.rawdata is not None
data = self.rawdata
if self.decipher:
# Handle encryption

View File

@ -8,11 +8,24 @@ STRICT = 0
## PS Exceptions
##
class PSException(Exception): pass
class PSEOF(PSException): pass
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass
class PSException(Exception):
pass
class PSEOF(PSException):
pass
class PSSyntaxError(PSException):
pass
class PSTypeError(PSException):
pass
class PSValueError(PSException):
pass
## Basic PostScript Types
@ -114,6 +127,7 @@ def literal_name(x):
return str(x)
return x.name
def keyword_name(x):
if not isinstance(x, PSKeyword):
if STRICT:
@ -136,7 +150,9 @@ END_NUMBER = re.compile(r'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
class PSBaseParser(object):
"""Most basic PostScript parser that performs only tokenization.
@ -190,7 +206,8 @@ class PSBaseParser(object):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
# fetch next chunk.
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
@ -242,7 +259,8 @@ class PSBaseParser(object):
pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos)
s = self.fp.read(prevpos-pos)
if not s: break
if not s:
break
while 1:
n = max(s.rfind('\r'), s.rfind('\n'))
if n == -1:
@ -520,7 +538,7 @@ class PSStackParser(PSBaseParser):
def end_type(self, type):
if self.curtype != type:
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [ obj for (_,obj) in self.curstack ]
objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop()
if 2 <= self.debug:
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('a'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_DICT_BEGIN:
# begin dictionary
self.start_type(pos, 'd')
@ -564,10 +583,11 @@ class PSStackParser(PSBaseParser):
if len(objs) % 2 != 0:
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
# construct a Python dictionary.
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
self.push((pos, d))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_PROC_BEGIN:
# begin proc
self.start_type(pos, 'p')
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('p'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
if 2 <= self.debug:
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
return obj
import unittest
## Simplistic Test cases
##
import unittest
class TestPSBaseParser(unittest.TestCase):
TESTDATA = r'''%!PS
@ -645,6 +668,7 @@ func/a/b{(c)do*}def
def get_tokens(self, s):
import StringIO
class MyParser(PSBaseParser):
def flush(self):
self.add_results(*self.popall())
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
def get_objects(self, s):
import StringIO
class MyParser(PSStackParser):
def flush(self):
self.add_results(*self.popall())
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
self.assertEqual(objs, self.OBJS)
return
if __name__ == '__main__': unittest.main()
if __name__ == '__main__':
unittest.main()

View File

@ -13,9 +13,17 @@ by Philip J. Erdelsky:
import sys
import struct
def KEYLENGTH(keybits): return (keybits)/8
def RKLENGTH(keybits): return (keybits)/8+28
def NROUNDS(keybits): return (keybits)/32+6
def KEYLENGTH(keybits):
return (keybits)/8
def RKLENGTH(keybits):
return (keybits)/8+28
def NROUNDS(keybits):
return (keybits)/32+6
Te0 = [
0xc66363a5L, 0xf87c7c84L, 0xee777799L, 0xf67b7b8dL,
@ -82,7 +90,7 @@ Te0 = [
0x65bfbfdaL, 0xd7e6e631L, 0x844242c6L, 0xd06868b8L,
0x824141c3L, 0x299999b0L, 0x5a2d2d77L, 0x1e0f0f11L,
0x7bb0b0cbL, 0xa85454fcL, 0x6dbbbbd6L, 0x2c16163aL,
]
]
Te1 = [
0xa5c66363L, 0x84f87c7cL, 0x99ee7777L, 0x8df67b7bL,
@ -149,7 +157,7 @@ Te1 = [
0xda65bfbfL, 0x31d7e6e6L, 0xc6844242L, 0xb8d06868L,
0xc3824141L, 0xb0299999L, 0x775a2d2dL, 0x111e0f0fL,
0xcb7bb0b0L, 0xfca85454L, 0xd66dbbbbL, 0x3a2c1616L,
]
]
Te2 = [
0x63a5c663L, 0x7c84f87cL, 0x7799ee77L, 0x7b8df67bL,
@ -216,7 +224,7 @@ Te2 = [
0xbfda65bfL, 0xe631d7e6L, 0x42c68442L, 0x68b8d068L,
0x41c38241L, 0x99b02999L, 0x2d775a2dL, 0x0f111e0fL,
0xb0cb7bb0L, 0x54fca854L, 0xbbd66dbbL, 0x163a2c16L,
]
]
Te3 = [
0x6363a5c6L, 0x7c7c84f8L, 0x777799eeL, 0x7b7b8df6L,
@ -283,7 +291,7 @@ Te3 = [
0xbfbfda65L, 0xe6e631d7L, 0x4242c684L, 0x6868b8d0L,
0x4141c382L, 0x9999b029L, 0x2d2d775aL, 0x0f0f111eL,
0xb0b0cb7bL, 0x5454fca8L, 0xbbbbd66dL, 0x16163a2cL,
]
]
Te4 = [
0x63636363L, 0x7c7c7c7cL, 0x77777777L, 0x7b7b7b7bL,
@ -350,7 +358,7 @@ Te4 = [
0xbfbfbfbfL, 0xe6e6e6e6L, 0x42424242L, 0x68686868L,
0x41414141L, 0x99999999L, 0x2d2d2d2dL, 0x0f0f0f0fL,
0xb0b0b0b0L, 0x54545454L, 0xbbbbbbbbL, 0x16161616L,
]
]
Td0 = [
0x51f4a750L, 0x7e416553L, 0x1a17a4c3L, 0x3a275e96L,
@ -417,7 +425,7 @@ Td0 = [
0x161dc372L, 0xbce2250cL, 0x283c498bL, 0xff0d9541L,
0x39a80171L, 0x080cb3deL, 0xd8b4e49cL, 0x6456c190L,
0x7bcb8461L, 0xd532b670L, 0x486c5c74L, 0xd0b85742L,
]
]
Td1 = [
0x5051f4a7L, 0x537e4165L, 0xc31a17a4L, 0x963a275eL,
@ -484,7 +492,7 @@ Td1 = [
0x72161dc3L, 0x0cbce225L, 0x8b283c49L, 0x41ff0d95L,
0x7139a801L, 0xde080cb3L, 0x9cd8b4e4L, 0x906456c1L,
0x617bcb84L, 0x70d532b6L, 0x74486c5cL, 0x42d0b857L,
]
]
Td2 = [
0xa75051f4L, 0x65537e41L, 0xa4c31a17L, 0x5e963a27L,
@ -551,7 +559,7 @@ Td2 = [
0xc372161dL, 0x250cbce2L, 0x498b283cL, 0x9541ff0dL,
0x017139a8L, 0xb3de080cL, 0xe49cd8b4L, 0xc1906456L,
0x84617bcbL, 0xb670d532L, 0x5c74486cL, 0x5742d0b8L,
]
]
Td3 = [
0xf4a75051L, 0x4165537eL, 0x17a4c31aL, 0x275e963aL,
@ -618,7 +626,7 @@ Td3 = [
0x1dc37216L, 0xe2250cbcL, 0x3c498b28L, 0x0d9541ffL,
0xa8017139L, 0x0cb3de08L, 0xb4e49cd8L, 0x56c19064L,
0xcb84617bL, 0x32b670d5L, 0x6c5c7448L, 0xb85742d0L,
]
]
Td4 = [
0x52525252L, 0x09090909L, 0x6a6a6a6aL, 0xd5d5d5d5L,
@ -685,14 +693,14 @@ Td4 = [
0xbabababaL, 0x77777777L, 0xd6d6d6d6L, 0x26262626L,
0xe1e1e1e1L, 0x69696969L, 0x14141414L, 0x63636363L,
0x55555555L, 0x21212121L, 0x0c0c0c0cL, 0x7d7d7d7dL,
]
]
rcon = [
0x01000000, 0x02000000, 0x04000000, 0x08000000,
0x10000000, 0x20000000, 0x40000000, 0x80000000,
0x1B000000, 0x36000000,
# 128-bit blocks, Rijndael never uses more than 10 rcon values
]
]
if len(struct.pack('L',0)) == 4:
# 32bit
@ -703,6 +711,7 @@ else:
def GETU32(x): return struct.unpack('>I', x)[0]
def PUTU32(x): return struct.pack('>I', x)
# Expand the cipher key into the encryption key schedule.
#
# @return the number of rounds for the given cipher key size.
@ -1051,6 +1060,7 @@ class RijndaelDecryptor(object):
assert len(ciphertext) == 16
return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
# encrypt(key, fin, fout, keybits=256)
class RijndaelEncryptor(object):

View File

@ -8,6 +8,7 @@
import sys
def rldecode(data):
"""
RunLength decoder (Adobe version) implementation based on PDF Reference
@ -26,7 +27,7 @@ def rldecode(data):
'1234567777777abcde'
"""
decoded = []
i=0
i = 0
while i < len(data):
#print "data[%d]=:%d:" % (i,ord(data[i]))
length = ord(data[i])

View File

@ -32,13 +32,13 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
buf += chr(c)
elif pred == '\x02':
# PNG up
for (a,b) in zip(line0,line1):
for (a, b) in zip(line0, line1):
c = (ord(a)+ord(b)) & 255
buf += chr(c)
elif pred == '\x03':
# PNG average (UNTESTED)
c = 0
for (a,b) in zip(line0,line1):
for (a, b) in zip(line0, line1):
c = ((c+ord(a)+ord(b))/2) & 255
buf += chr(c)
else:
@ -52,21 +52,25 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
"""Returns the multiplication of two matrices."""
return (a0*a1+c0*b1, b0*a1+d0*b1,
a0*c1+c0*d1, b0*c1+d0*d1,
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix((a,b,c,d,e,f), (x,y)):
"""Translates a matrix by (x,y)."""
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
def translate_matrix((a, b, c, d, e, f), (x, y)):
"""Translates a matrix by (x, y)."""
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
"""Applies a matrix to a point."""
return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
return (a*p+c*q, b*p+d*q)
@ -79,17 +83,20 @@ def uniq(objs):
"""Eliminates duplicated elements."""
done = set()
for obj in objs:
if obj in done: continue
if obj in done:
continue
done.add(obj)
yield obj
return
# csort
def csort(objs, key=lambda x:x):
def csort(objs, key=lambda x: x):
"""Order-preserving sorting function."""
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
# fsplit
def fsplit(pred, objs):
"""Split a list into two classes according to the predicate."""
@ -100,7 +107,8 @@ def fsplit(pred, objs):
t.append(obj)
else:
f.append(obj)
return (t,f)
return (t, f)
# drange
def drange(v0, v1, d):
@ -108,16 +116,18 @@ def drange(v0, v1, d):
assert v0 < v1
return xrange(int(v0)/d, int(v1+d)/d)
# get_bound
def get_bound(pts):
"""Compute a minimal rectangle that covers all the points."""
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
for (x,y) in pts:
for (x, y) in pts:
x0 = min(x0, x)
y0 = min(y0, y)
x1 = max(x1, x)
y1 = max(y1, y)
return (x0,y0,x1,y1)
return (x0, y0, x1, y1)
# pick
def pick(seq, func, maxobj=None):
@ -126,9 +136,10 @@ def pick(seq, func, maxobj=None):
for obj in seq:
score = func(obj)
if maxscore is None or maxscore < score:
(maxscore,maxobj) = (score,obj)
(maxscore, maxobj) = (score, obj)
return maxobj
# choplist
def choplist(n, seq):
"""Groups every n elements of the list."""
@ -140,6 +151,7 @@ def choplist(n, seq):
r = []
return
# nunpack
def nunpack(s, default=0):
"""Unpacks 1 to 4 byte integers (big endian)."""
@ -157,8 +169,9 @@ def nunpack(s, default=0):
else:
raise TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
PDFDocEncoding = ''.join(unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
@ -192,24 +205,29 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode."""
if s.startswith('\xfe\xff'):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
return ''.join(PDFDocEncoding[ord(c)] for c in s)
# enc
def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML"""
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
return x.encode(codec, 'xmlcharrefreplace')
def bbox2str((x0,y0,x1,y1)):
def bbox2str((x0, y0, x1, y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str((a,b,c,d,e,f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f)
def matrix2str((a, b, c, d, e, f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
## Plane
@ -240,14 +258,14 @@ class Plane(object):
def __contains__(self, obj):
return obj in self._objs
def _getrange(self, (x0,y0,x1,y1)):
def _getrange(self, (x0, y0, x1, y1)):
x0 = max(self.x0, x0)
y0 = max(self.y0, y0)
x1 = min(self.x1, x1)
y1 = min(self.y1, y1)
for y in drange(y0, y1, self.gridsize):
for x in drange(x0, x1, self.gridsize):
yield (x,y)
yield (x, y)
return
# extend(objs)
@ -279,14 +297,17 @@ class Plane(object):
return
# find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)):
def find(self, (x0, y0, x1, y1)):
done = set()
for k in self._getrange((x0,y0,x1,y1)):
if k not in self._grid: continue
for k in self._getrange((x0, y0, x1, y1)):
if k not in self._grid:
continue
for obj in self._grid[k]:
if obj in done: continue
if obj in done:
continue
done.add(obj)
if (obj.x1 <= x0 or x1 <= obj.x0 or
obj.y1 <= y0 or y1 <= obj.y0): continue
obj.y1 <= y0 or y1 <= obj.y0):
continue
yield obj
return