#!/usr/bin/env python
import sys
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix

# NOTE(review): the element/attribute names inside the output templates in
# this module were restored so that every template has exactly as many
# conversion specifiers as the arguments it is formatted with. Confirm the
# exact markup against a known-good reference output.


##  TagExtractor
##
class TagExtractor(PDFDevice):
    """Device that writes decoded text and tag markup to ``outfp``.

    ``pageno`` starts at 0 and is incremented by ``end_page``; ``tag`` holds
    the tag most recently opened by ``begin_tag`` (or None).
    """

    def __init__(self, rsrc, outfp, codec='utf-8'):
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        self.tag = None

    def render_string(self, textstate, seq):
        """Decode the ``str`` items of ``seq`` with the current font and write them.

        Items of ``seq`` that are not ``str`` are skipped. Character ids for
        which the font raises PDFUnicodeNotDefined are silently dropped.
        """
        font = textstate.font
        # Collect pieces and join once instead of repeated string +=.
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    char = font.to_unichr(cid)
                except PDFUnicodeNotDefined:
                    continue
                pieces.append(char)
        self.outfp.write(enc(''.join(pieces), self.codec))

    def begin_page(self, page, ctm):
        """Write the opening page element (id, mediabox, rotation)."""
        (x0, y0, x1, y1) = page.mediabox
        bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, bbox, page.rotate))

    def end_page(self, page):
        """Close the page element and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1

    def begin_tag(self, tag, props=None):
        """Write an opening tag named ``tag.name``.

        ``props``, when given and non-empty, is a mapping whose items are
        written as attributes in sorted key order.
        """
        attrs = ''
        if props:
            attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                            for (k, v) in sorted(props.items()))
        self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
        self.tag = tag

    def end_tag(self):
        """Write the closing tag for the tag opened by ``begin_tag``."""
        assert self.tag
        self.outfp.write('</%s>' % enc(self.tag.name))
        self.tag = None

    def do_tag(self, tag, props=None):
        """Write a tag via ``begin_tag`` and clear ``self.tag`` without closing it."""
        self.begin_tag(tag, props)
        self.tag = None


##  PDFPageAggregator
##
class PDFPageAggregator(PDFTextDevice):
    """Device that collects the items of each page into an LTPage tree.

    ``cur_item`` is the container currently being filled; ``stack`` holds the
    enclosing containers while a figure is open.
    """

    def __init__(self, rsrc, pageno=1, laparams=None):
        PDFTextDevice.__init__(self, rsrc)
        self.laparams = laparams
        self.pageno = pageno
        self.stack = []

    def begin_page(self, page, ctm):
        """Start a new LTPage sized from the CTM-transformed mediabox."""
        (x0, y0, x1, y1) = page.mediabox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        # The page box is anchored at the origin; only its extent is kept.
        mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
        self.cur_item = LTPage(self.pageno, mediabox)

    def end_page(self, _):
        """Finish the current page and return it.

        Layout analysis is run only when ``laparams`` is set.
        """
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        if self.laparams:
            self.cur_item.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Push the current container and start collecting into a new LTFigure."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _):
        """Finish the open figure and add it to the container it was opened in."""
        fig = self.cur_item
        self.cur_item.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record axis-aligned lines and rectangles found in ``path``.

        Only two path shapes are recognised, identified by the first element
        of each path segment: 'ml' (a single line) and 'mlllh' (a closed
        four-point outline). Lines that are neither horizontal nor vertical,
        outlines that are not axis-aligned, and every other shape are ignored.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0, y0, x1, y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0, y0, x1, y1)))
        elif shape == 'mlllh':
            # rectangle
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (_, x2, y2) = path[2]
            (_, x3, y3) = path[3]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
            (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
            # Accept either winding: first edge vertical, or first edge horizontal.
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                # Opposite corners (point 0 and point 2) define the box.
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Add a text item for ``chars`` and return its ``adv`` value.

        Returns ``(0, 0)`` without adding anything when ``chars`` is empty.
        """
        if not chars:
            return (0, 0)
        item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
        self.cur_item.add(item)
        return item.adv


##  PDFConverter
##
class PDFConverter(PDFPageAggregator):
    """Base class for converters that write an aggregated page to ``outfp``."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
        self.outfp = outfp
        self.codec = codec

    def write(self, text):
        """Write ``text`` to ``outfp`` after passing it through ``enc`` with the codec."""
        self.outfp.write(enc(text, self.codec))


##  XMLConverter
##
class XMLConverter(PDFConverter):
    """Converter that writes each page's layout tree as XML elements.

    The XML declaration and the opening root element are written on
    construction; ``close`` writes the closing root element.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec,
                              pageno=pageno, laparams=laparams)
        self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
        self.outfp.write('<pages>\n')

    def end_page(self, page):
        """Finish the page and write its whole item tree as XML."""
        def render(item):
            if isinstance(item, LTPage):
                self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                                 (item.id, item.get_bbox(), item.rotate))
                for child in item:
                    render(child)
                self.outfp.write('</page>\n')
            elif isinstance(item, LTLine):
                self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' %
                                 (item.linewidth, item.direction, item.get_bbox()))
            elif isinstance(item, LTRect):
                self.outfp.write('<rect linewidth="%d" bbox="%s" />' %
                                 (item.linewidth, item.get_bbox()))
            elif isinstance(item, LTFigure):
                self.outfp.write('<figure id="%s" bbox="%s">\n' %
                                 (item.id, item.get_bbox()))
                for child in item:
                    render(child)
                self.outfp.write('</figure>\n')
            elif isinstance(item, LTTextLine):
                self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
                for child in item:
                    render(child)
                self.outfp.write('</textline>\n')
            elif isinstance(item, LTTextBox):
                self.outfp.write('<textbox id="%s" bbox="%s">\n' %
                                 (item.id, item.get_bbox()))
                for child in item:
                    render(child)
                self.outfp.write('</textbox>\n')
            elif isinstance(item, LTTextItem):
                # Must be tested before the plain LTText branch below.
                self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
                                 (enc(item.font.fontname), item.is_vertical(),
                                  item.get_bbox(), item.fontsize))
                self.write(item.text)
                self.outfp.write('</text>\n')
            elif isinstance(item, LTText):
                # Routed through self.write, like the LTTextItem branch, so the
                # text goes through the same enc()/codec handling.
                self.outfp.write('<text>')
                self.write(item.text)
                self.outfp.write('</text>\n')
            else:
                assert 0, item
        page = PDFConverter.end_page(self, page)
        render(page)

    def close(self):
        """Write the closing root element."""
        self.outfp.write('</pages>\n')


##  HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Converter that writes pages as absolutely positioned HTML elements.

    Pages are stacked vertically: ``yoffset`` is the running vertical
    position, advanced by each page's ``y1`` plus ``pagepad``. All
    coordinates are multiplied by ``scale`` on output.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 scale=1, showpageno=True, pagepad=50):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec,
                              pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.pagepad = pagepad
        self.scale = scale
        self.outfp.write('<html><head>\n')
        self.outfp.write('<meta http-equiv="Content-Type" '
                         'content="text/html; charset=%s">\n' % self.codec)
        self.outfp.write('</head><body>\n')
        self.yoffset = self.pagepad

    def write_rect(self, color, width, x, y, w, h):
        """Write an empty bordered box at (x, y) with size (w, h), scaled."""
        self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         (color, width, x*self.scale, y*self.scale,
                          w*self.scale, h*self.scale))

    def end_page(self, page):
        """Finish the page and write its item tree as positioned HTML."""
        def render(item):
            if isinstance(item, LTPage):
                self.yoffset += item.y1
                self.write_rect('gray', 1, item.x0, self.yoffset-item.y1,
                                item.width, item.height)
                if self.showpageno:
                    self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                                     ((self.yoffset-item.y1)*self.scale))
                    # Uses the item being rendered rather than the enclosing
                    # (rebound) `page` name; both refer to the same LTPage.
                    self.outfp.write('<a name="%s">Page %s</a></div>\n' %
                                     (item.id, item.id))
                for child in item:
                    render(child)
            elif isinstance(item, LTTextItem):
                if item.vertical:
                    wmode = 'tb-rl'
                else:
                    wmode = 'lr-tb'
                self.outfp.write('<span style="position:absolute; writing-mode:%s;'
                                 ' left:%dpx; top:%dpx; font-size:%dpx;">' %
                                 (wmode, item.x0*self.scale,
                                  (self.yoffset-item.y1)*self.scale,
                                  item.fontsize*self.scale))
                self.write(item.text)
                self.outfp.write('</span>\n')
                # NOTE(review): `debug` is not set in this class; presumably
                # it is provided by a base class - confirm.
                if self.debug:
                    self.write_rect('red', 1, item.x0, self.yoffset-item.y1,
                                    item.width, item.height)
            elif isinstance(item, LTLine) or isinstance(item, LTRect):
                self.write_rect('black', 1, item.x0, self.yoffset-item.y1,
                                item.width, item.height)
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
            elif isinstance(item, LTTextBox):
                self.write_rect('blue', 1, item.x0, self.yoffset-item.y1,
                                item.width, item.height)
                for child in item:
                    render(child)
            elif isinstance(item, LTFigure):
                self.write_rect('green', 1, item.x0, self.yoffset-item.y1,
                                item.width, item.height)
                for child in item:
                    render(child)
        page = PDFConverter.end_page(self, page)
        render(page)
        self.yoffset += self.pagepad

    def close(self):
        """Write the page index (links for pages 1..pageno-1) and close the document."""
        links = ', '.join('<a href="#%s">%s</a>' % (i, i)
                          for i in range(1, self.pageno))
        self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                         links)
        self.outfp.write('</body></html>\n')


##  TextConverter
##
class TextConverter(PDFConverter):
    """Converter that writes only the text of each page."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec,
                              pageno=pageno, laparams=laparams)
        self.showpageno = showpageno

    def write(self, text):
        """Write ``text`` encoded with the codec, ignoring unencodable characters."""
        self.outfp.write(text.encode(self.codec, 'ignore'))

    def end_page(self, page):
        """Finish the page and write its text, followed by a form feed.

        A newline is written after each LTTextBox; when ``showpageno`` is set
        a ``Page N`` line precedes the page text.
        """
        def render(item):
            if isinstance(item, LTText):
                self.write(item.text)
            elif isinstance(item, LayoutContainer):
                for child in item:
                    render(child)
            if isinstance(item, LTTextBox):
                self.write('\n')
        page = PDFConverter.end_page(self, page)
        if self.showpageno:
            self.write('Page %d\n' % page.id)
        render(page)
        self.write('\f')