2008-08-30 12:47:21 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
import sys
|
|
|
|
# Module-level aliases for the interpreter's standard streams.
# `stderr` is where this module writes its debug diagnostics.
(stdout, stderr) = (sys.stdout, sys.stderr)
|
2009-02-23 14:00:38 +00:00
|
|
|
from pdffont import PDFUnicodeNotDefined
|
2009-04-18 17:15:49 +00:00
|
|
|
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
|
|
|
|
matrix2str, rect2str, point2str
|
2008-08-30 12:47:21 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
## PDFDevice
|
|
|
|
##
|
|
|
|
class PDFDevice(object):
    """Base class for rendering devices.

    The constructor stores the resource object it is given and starts
    with no current transformation matrix (``ctm``).  Every callback
    defined here is a no-op returning None; subclasses override the
    ones they need.
    """

    # Subclasses test this for truthiness before emitting diagnostics.
    debug = 0

    def __init__(self, rsrc):
        """Store *rsrc* unchanged and reset ``ctm`` to None."""
        self.ctm = None
        self.rsrc = rsrc

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        """No-op in the base class."""

    def set_ctm(self, ctm):
        """Replace the stored transformation matrix with *ctm*."""
        self.ctm = ctm

    # -- tag callbacks (no-ops here) ---------------------------------

    def begin_tag(self, tag, props=None):
        """No-op hook; subclasses may override."""

    def end_tag(self):
        """No-op hook; subclasses may override."""

    def do_tag(self, tag, props=None):
        """No-op hook; subclasses may override."""

    # -- page / figure callbacks (no-ops here) -----------------------

    def begin_page(self, page):
        """No-op hook; subclasses may override."""

    def end_page(self, page):
        """No-op hook; subclasses may override."""

    def begin_figure(self, name, bbox):
        """No-op hook; subclasses may override."""

    def end_figure(self, name):
        """No-op hook; subclasses may override."""

    # -- drawing callbacks (no-ops here) -----------------------------

    def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
        """No-op hook; subclasses may override."""

    def render_string(self, textstate, textmatrix, seq):
        """No-op hook; subclasses may override."""

    def render_image(self, stream, size, matrix):
        """No-op hook; subclasses may override."""
|
2009-01-10 09:14:46 +00:00
|
|
|
|
|
|
|
|
2008-08-30 12:47:21 +00:00
|
|
|
## PageItem
|
|
|
|
##
|
|
|
|
class PageItem(object):
    """Container for the objects collected on one page.

    Attributes:
      id     -- identifier supplied by the creator, stored unchanged.
      bbox   -- (x0, y0, x1, y1) tuple built from the second argument.
      rotate -- rotation value, stored unchanged (default 0).
      objs   -- list of objects added via add(), in insertion order.
    """

    def __init__(self, id, bbox, rotate=0):
        """Create an empty item.

        bbox must be an iterable of exactly four values; anything else
        raises the usual unpacking error.  It is unpacked here rather
        than in the signature because tuple parameters were removed
        from the language (PEP 3113); positional callers see no change.
        """
        (x0, y0, x1, y1) = bbox
        self.id = id
        # Always store a real tuple, whatever sequence type was passed.
        self.bbox = (x0, y0, x1, y1)
        self.rotate = rotate
        self.objs = []

    def __repr__(self):
        return ('<page id=%r bbox=%r rotate=%r>' %
                (self.id, self.bbox, self.rotate))

    def add(self, obj):
        """Append obj to this item's list of children."""
        self.objs.append(obj)
|
|
|
|
|
|
|
|
|
|
|
|
## FigureItem
|
|
|
|
##
|
|
|
|
class FigureItem(PageItem):
    """A PageItem used for figures; only the repr differs."""

    def __repr__(self):
        shown = (self.id, self.bbox)
        return '<figure id=%r bbox=%r>' % shown
|
|
|
|
|
|
|
|
|
|
|
|
## TextItem
|
|
|
|
##
|
|
|
|
class TextItem(object):
    """A run of characters rendered with a single font and matrix.

    Attributes set by the constructor:
      matrix    -- the matrix passed in, stored unchanged.
      font      -- the font object passed in.
      direction -- 1 when font.is_vertical() is false (horizontal text),
                   2 when it is true (vertical text).
      text      -- concatenation of the character strings in `chars`.
      adv       -- (dx, 0) for horizontal text, (0, dy) for vertical.
      bbox      -- 4-tuple derived from the matrix translation and the
                   transformed advance/height (see __init__).
      fontsize  -- larger component of
                   apply_matrix_norm(matrix, (size, size)).
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Measure `chars` and compute the item's geometry.

        chars is a sequence of (char, cid) pairs.  It must be non-empty
        when the font is vertical, because the first cid is looked up.
        """
        self.matrix = matrix
        self.font = font
        self.direction = 0
        # The last two matrix entries are the starting point.
        (_, _, _, _, tx, ty) = matrix

        self.text = ''.join(char for (char, _) in chars)
        widths = sum(font.char_width(cid) for (_, cid) in chars)
        # scaling is multiplied by .01 -- presumably a percentage.
        advance = (widths * fontsize + len(chars) * charspace) * scaling * .01
        size = (font.get_ascent() - font.get_descent()) * fontsize

        if self.font.is_vertical():
            # vertical text
            self.direction = 2
            first_cid = chars[0][1]
            offset = (1000 - font.char_disp(first_cid)) * fontsize * .001
            (_, disp) = apply_matrix_norm(matrix, (0, offset))
            (dx, dy) = apply_matrix_norm(matrix, (size, advance))
            left = tx - dx/2
            top = ty + disp
            self.adv = (0, dy)
            self.bbox = (left, top + dy, left + dx, top)
        else:
            # horizontal text
            self.direction = 1
            (dx, dy) = apply_matrix_norm(matrix, (advance, size))
            drop = font.get_descent() * fontsize
            (_, descent) = apply_matrix_norm(matrix, (0, drop))
            bottom = ty + descent
            self.adv = (dx, 0)
            self.bbox = (tx, bottom, tx + dx, bottom + dy)

        self.fontsize = max(apply_matrix_norm(matrix, (size, size)))

    def __repr__(self):
        fields = (
            matrix2str(self.matrix),
            self.font,
            self.fontsize,
            rect2str(self.bbox),
            self.text,
            point2str(self.adv),
        )
        return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>'
                % fields)
|
2008-08-30 12:47:21 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:25:03 +00:00
|
|
|
## PDFPageAggregator
|
2008-08-30 12:47:21 +00:00
|
|
|
##
|
2009-01-10 09:25:03 +00:00
|
|
|
class PDFPageAggregator(PDFDevice):
    """Device that gathers rendered text into PageItem/FigureItem trees.

    begin_page() opens a PageItem, begin_figure()/end_figure() nest
    FigureItems inside the current item, and render_string() adds
    TextItems to whichever item is current.  end_page() returns the
    finished PageItem and advances the page counter.
    """

    def __init__(self, rsrc, pageno=1):
        """Create an aggregator whose first page is numbered *pageno*."""
        PDFDevice.__init__(self, rsrc)
        self.pageno = pageno
        # Items suspended while a nested figure is being collected.
        self.stack = []
        # Set by begin_page(); defined here so the attribute always exists.
        self.cur_item = None

    def begin_page(self, page):
        """Start a PageItem from page.mediabox and page.rotate."""
        self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)

    def end_page(self, _):
        """Return the completed PageItem and bump the page number.

        Asserts that every figure was closed and that the current item
        is a PageItem.
        """
        assert not self.stack
        assert isinstance(self.cur_item, PageItem)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox):
        """Suspend the current item and start a FigureItem."""
        self.stack.append(self.cur_item)
        self.cur_item = FigureItem(name, bbox)

    def end_figure(self, _):
        """Close the current figure and add it to its parent item."""
        fig = self.cur_item
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)

    def handle_undefined_char(self, cidcoding, cid):
        """Return the placeholder '?' for a cid with no Unicode mapping.

        When self.debug is truthy, a diagnostic line is written to the
        module-level stderr first.
        """
        if self.debug:
            # write() rather than the print statement, so the line is
            # valid syntax on both Python 2 and Python 3.
            stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return '?'

    def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
        """Classify simple paths; currently nothing is recorded.

        The shape string is built from the first element of each path
        segment.  The branches below recognise single lines and
        rectangles but only contain `pass`, so the method always
        returns None.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':  # single line
            if path[0][1] == path[1][1]:
                # vertical
                pass
            elif path[0][2] == path[1][2]:
                # horizontal
                pass
        elif shape == 'mlllh':  # rectangle
            if ((path[0][1] == path[1][1] and path[1][2] == path[2][2] and
                 path[2][1] == path[3][1] and path[3][2] == path[0][2]) or
                (path[0][2] == path[1][2] and path[1][1] == path[2][1] and
                 path[2][2] == path[3][2] and path[3][1] == path[0][1])):
                pass

    def render_chars(self, textmatrix, textstate, chars):
        """Add one TextItem for *chars* and return its advance.

        Returns (0, 0) without creating anything when chars is empty.
        """
        if not chars:
            return (0, 0)
        item = TextItem(textmatrix, textstate.font, textstate.fontsize,
                        textstate.charspace, textstate.scaling, chars)
        self.cur_item.add(item)
        return item.adv

    def render_string(self, textstate, textmatrix, seq):
        """Render a sequence of strings and numeric adjustments.

        seq mixes int/float adjustments with encoded strings.  The
        pending characters are flushed into a TextItem at every number
        and after every cid 32 of a non-multibyte font; the text
        matrix is translated by the flushed advance each time.
        """
        font = textstate.font
        textmatrix = mult_matrix(textmatrix, self.ctm)
        chars = []
        for x in seq:
            if isinstance(x, (int, float)):
                # Numeric adjustment: flush, then shift horizontally.
                (dx, dy) = self.render_chars(textmatrix, textstate, chars)
                # NOTE(review): the PDF text model scales this value by
                # 1/1000 * font size * horizontal scaling; there is no
                # fontsize term here -- confirm that is intended.
                dx -= x * textstate.scaling * .0001
                textmatrix = translate_matrix(textmatrix, (dx, dy))
                chars = []
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined as e:
                        # The exception carries (cidcoding, cid).
                        (cidcoding, cid) = e.args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    if cid == 32 and not font.is_multibyte():
                        # cid 32 ends a word: flush and add word spacing.
                        (dx, dy) = self.render_chars(textmatrix, textstate,
                                                     chars)
                        dx += textstate.wordspace * textstate.scaling * .01
                        textmatrix = translate_matrix(textmatrix, (dx, dy))
                        chars = []
        # Flush whatever is left after the last element.
        self.render_chars(textmatrix, textstate, chars)
|