2008-08-30 12:47:21 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
import sys
|
|
|
|
stdout = sys.stdout
|
|
|
|
stderr = sys.stderr
|
2009-02-23 14:00:38 +00:00
|
|
|
from pdffont import PDFUnicodeNotDefined
|
|
|
|
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
2008-08-30 12:47:21 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:14:46 +00:00
|
|
|
## PDFDevice
|
|
|
|
##
|
|
|
|
class PDFDevice(object):
    """Base class for rendering devices.

    Every hook defined here is a no-op that returns None, except
    render_string() and render_image(), which raise NotImplementedError
    and therefore have to be supplied by a subclass that wants to
    receive text or images.
    """

    # Debug level; 0 means quiet.  Subclasses in this module test it
    # for truthiness before writing diagnostics.
    debug = 0

    def __init__(self, rsrc):
        """Remember the resource object and start with no CTM.

        rsrc is stored as-is in self.rsrc; this class never inspects it.
        """
        self.rsrc = rsrc
        self.ctm = None  # replaced later through set_ctm()

    def __repr__(self):
        """Return the fixed debugging name of the device."""
        return '<PDFDevice>'

    def close(self):
        """Hook called when the device is no longer needed; no-op here."""

    def set_ctm(self, ctm):
        """Store ctm in self.ctm (presumably the current transformation
        matrix -- this class only keeps the reference)."""
        self.ctm = ctm

    def begin_tag(self, tag, props=None):
        """Hook for the start of a tagged section; no-op here."""

    def end_tag(self):
        """Hook for the end of a tagged section; no-op here."""

    def do_tag(self, tag, props=None):
        """Hook for a stand-alone tag; no-op here."""

    def begin_page(self, page):
        """Hook for the start of a page; no-op here."""

    def end_page(self, page):
        """Hook for the end of a page; no-op here."""

    def begin_figure(self, name, bbox):
        """Hook for the start of a figure; no-op here."""

    def end_figure(self, name):
        """Hook for the end of a figure; no-op here."""

    def render_string(self, textstate, textmatrix, seq):
        """Render a text sequence.  Must be overridden.

        Raises NotImplementedError unconditionally.
        """
        raise NotImplementedError

    def render_image(self, stream, size, matrix):
        """Render an image.  Must be overridden.

        Raises NotImplementedError unconditionally.
        """
        raise NotImplementedError
|
|
|
|
|
|
|
|
|
2008-08-30 12:47:21 +00:00
|
|
|
## PageItem
|
|
|
|
##
|
|
|
|
class PageItem(object):
    """Container for the objects collected on one page.

    Attributes:
      id      -- identifier passed to the constructor.
      bbox    -- 4-tuple (x0, y0, x1, y1) built from the bbox argument.
      rotate  -- rotation value passed to the constructor (default 0).
      objs    -- list of objects appended through add(), in call order.
    """

    def __init__(self, id, bbox, rotate=0):
        """Create an empty item.

        id      -- identifier stored in self.id.
        bbox    -- any iterable of exactly four values (x0, y0, x1, y1).
        rotate  -- stored in self.rotate.

        Raises ValueError (or TypeError for a non-iterable) when bbox
        does not unpack into exactly four values.
        """
        # The bounding box used to be unpacked in the signature itself
        # ("def __init__(self, id, (x0,y0,x1,y1), rotate=0)").  Tuple
        # parameters were removed from the language by PEP 3113, so the
        # unpacking is done here instead; positional callers are
        # unaffected and the four-element check is preserved.
        (x0, y0, x1, y1) = bbox
        self.id = id
        self.bbox = (x0, y0, x1, y1)
        self.rotate = rotate
        self.objs = []

    def __repr__(self):
        """Return a short debugging representation of the page."""
        return ('<page id=%r bbox=%r rotate=%r>' %
                (self.id, self.bbox, self.rotate))

    def add(self, obj):
        """Append obj to self.objs.  Returns None."""
        self.objs.append(obj)
|
|
|
|
|
|
|
|
|
|
|
|
## FigureItem
|
|
|
|
##
|
|
|
|
class FigureItem(PageItem):
    """A PageItem that represents a figure.

    Construction and add() are inherited unchanged from PageItem; only
    the debugging representation differs.
    """

    def __repr__(self):
        """Return a short debugging representation of the figure."""
        fields = (self.id, self.bbox)
        return '<figure id=%r bbox=%r>' % fields
|
|
|
|
|
|
|
|
|
|
|
|
## TextItem
|
|
|
|
##
|
|
|
|
class TextItem(object):
    """A run of text drawn with a single font and text matrix.

    Attributes set by the constructor:
      matrix     -- the matrix argument, unchanged.
      font       -- the font argument, unchanged.
      origin     -- (tx, ty): the last two elements of matrix.
      direction  -- 1 for horizontal text, 2 for vertical text.
      text       -- the collected characters (see __init__).
      adv        -- advance of the run: (w, 0) horizontal, (0, h) vertical.
      bbox       -- 4-tuple computed from origin, advance and font height.
      fontsize   -- larger component of the transformed (size, size)
                    vector, where size = (ascent - descent) * fontsize.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Measure chars and derive text, advance and bounding box.

        matrix     -- 6-element matrix; elements 5 and 6 are the origin.
        font       -- object providing get_ascent(), get_descent(),
                      is_vertical(), space_width() and char_width(cid).
        fontsize   -- multiplier applied to all font metrics.
        charspace  -- extra advance added after every glyph.
        scaling    -- scaling in percent (it is multiplied by .01 here).
        chars      -- sequence of (char, cid, t) triples.  A triple with a
                      false char carries no glyph; t is then an
                      adjustment in thousandths (horizontal text only).
                      For a glyph in vertical text, t is its
                      displacement in thousandths.
        """
        self.matrix = matrix
        self.font = font
        (_, _, _, _, tx, ty) = self.matrix
        self.origin = (tx, ty)
        self.direction = 0
        self.text = ''
        scaling *= .01
        size = (font.get_ascent() - font.get_descent()) * fontsize
        pieces = []  # joined into self.text once measuring is done
        if not self.font.is_vertical():
            # horizontal text
            spwidth = font.space_width()
            self.direction = 1
            w = 0
            dx = 0  # adjustment accumulated since the previous glyph
            prev = ' '
            for (char, cid, t) in chars:
                if char:
                    # A gap wider than a space between two glyphs is
                    # reported as a space, unless one was just emitted.
                    if prev != ' ' and spwidth < dx:
                        pieces.append(' ')
                    prev = char
                    pieces.append(char)
                    dx = 0
                    w += (font.char_width(cid) * fontsize + charspace) * scaling
                else:
                    t *= .001
                    dx -= t
                    w -= t * fontsize * scaling
            (_, descent) = apply_matrix_norm(
                self.matrix, (0, font.get_descent() * fontsize))
            ty += descent
            (w, h) = apply_matrix_norm(self.matrix, (w, size))
            self.adv = (w, 0)
            self.bbox = (tx, ty, tx+w, ty+h)
        else:
            # vertical text
            self.direction = 2
            h = 0
            # Only triples that carry a glyph matter here.  Filtering
            # first fixes a double count: the old code took the first
            # glyph from wherever it was and then re-scanned chars[1:],
            # so a glyph preceded by an adjustment was added twice.
            glyphs = [entry for entry in chars if entry[0]]
            # The displacement of the first glyph positions the run; with
            # no glyph at all it is 0 (previously the raw, untransformed
            # third element of the last triple leaked through).
            disp = 0
            if glyphs:
                (_, disp) = apply_matrix_norm(
                    self.matrix, (0, (1000-glyphs[0][2])*fontsize*.001))
            for (char, cid, _) in glyphs:
                # Use the character supplied by the caller, as the
                # horizontal branch does, rather than asking the font
                # again (which may raise for substituted characters).
                pieces.append(char)
                h += (font.char_width(cid) * fontsize + charspace) * scaling
            (w, h) = apply_matrix_norm(self.matrix, (size, h))
            tx -= w/2
            ty += disp
            self.adv = (0, h)
            self.bbox = (tx, ty+h, tx+w, ty)
        self.text = ''.join(pieces)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size, size)))

    def __repr__(self):
        """Return a debugging representation listing the main attributes."""
        return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
                (self.matrix, self.font, self.fontsize, self.bbox,
                 self.text, self.adv))
|
2008-08-30 12:47:21 +00:00
|
|
|
|
|
|
|
|
2009-01-10 09:25:03 +00:00
|
|
|
## PDFPageAggregator
|
2008-08-30 12:47:21 +00:00
|
|
|
##
|
2009-01-10 09:25:03 +00:00
|
|
|
class PDFPageAggregator(PDFDevice):
    """Device that collects the text of each page into a PageItem.

    begin_page() starts a new PageItem, render_string() adds TextItem
    objects to the item currently being built, begin_figure() and
    end_figure() nest a FigureItem inside it, and end_page() returns
    the finished PageItem.  Images are ignored.
    """

    def __init__(self, rsrc, pageno=1, splitwords=False):
        """Set up an aggregator.

        rsrc        -- passed on to PDFDevice.__init__.
        pageno      -- id given to the first page; incremented by
                       end_page().
        splitwords  -- when true, render_string() starts a new TextItem
                       after every character whose cid is 32, provided
                       the font is not multibyte.
        """
        PDFDevice.__init__(self, rsrc)
        self.pageno = pageno
        self.splitwords = splitwords
        self.stack = []  # enclosing items while inside figures
        # Defined up front so that a call made before begin_page() fails
        # on a missing item instead of on a missing attribute.
        self.cur_item = None

    def begin_page(self, page):
        """Start collecting into a new PageItem built from
        page.mediabox and page.rotate."""
        self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)

    def end_page(self, _):
        """Finish the current page and return its PageItem.

        The argument is ignored.  All figures must have been closed.
        """
        assert not self.stack
        assert isinstance(self.cur_item, PageItem)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox):
        """Start a FigureItem; the current item is saved on the stack."""
        self.stack.append(self.cur_item)
        self.cur_item = FigureItem(name, bbox)

    def end_figure(self, _):
        """Close the current figure and add it to the enclosing item.

        The argument is ignored.
        """
        fig = self.cur_item
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)

    def render_image(self, stream, size, matrix):
        """Ignore images."""
        return

    def handle_undefined_char(self, cidcoding, cid):
        """Called for a cid the font cannot map to Unicode.

        Writes a diagnostic to stderr when self.debug is true and
        returns None, which makes render_string() record the cid
        without a character.  Subclasses may return a replacement.
        """
        if self.debug:
            # stderr.write() instead of "print >>stderr": same output,
            # but valid syntax on Python 3 as well.
            stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return None

    def _add_text(self, textstate, textmatrix, word):
        # Build a TextItem for `word`, add it to the current item and
        # return it.
        item = TextItem(textmatrix, textstate.font, textstate.fontsize,
                        textstate.charspace, textstate.scaling, word)
        self.cur_item.add(item)
        return item

    def render_string(self, textstate, textmatrix, seq):
        """Convert a text sequence into TextItem objects.

        textstate   -- supplies font, fontsize, charspace, scaling and
                       wordspace.
        textmatrix  -- text matrix; multiplied by self.ctm before use.
        seq         -- sequence whose int/float elements are positioning
                       adjustments and whose other elements are decoded
                       with font.decode() into cids.
        """
        font = textstate.font
        chars = []
        for x in seq:
            if isinstance(x, (int, float)):
                # Adjustment: no character, no cid.
                chars.append((None, None, x))
                continue
            for cid in font.decode(x):
                try:
                    char = font.to_unicode(cid)
                except PDFUnicodeNotDefined:
                    # sys.exc_info() is used instead of "except X, e",
                    # which is a syntax error on Python 3.
                    (cidcoding, cid) = sys.exc_info()[1].args
                    char = self.handle_undefined_char(cidcoding, cid)
                chars.append((char, cid, font.char_disp(cid)))
        textmatrix = mult_matrix(textmatrix, self.ctm)
        # Word splitting only applies to single-byte fonts.
        split = self.splitwords and not font.is_multibyte()
        word = []
        for entry in chars:
            word.append(entry)
            if split and entry[1] == 32:
                # cid 32 ends the word.  Emit it, then move the matrix
                # by the word's advance plus the scaled word spacing so
                # that the next word starts where this one ended.
                item = self._add_text(textstate, textmatrix, word)
                (dx, dy) = item.adv
                dx += textstate.wordspace * textstate.scaling * .01
                textmatrix = translate_matrix(textmatrix, (dx, dy))
                word = []
        if word:
            self._add_text(textstate, textmatrix, word)
        return
|