diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 217d6a6..48442f3 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -189,6 +189,20 @@ class TextConverter(PDFConverter): ## class HTMLConverter(PDFConverter): + RECT_COLORS = { + 'char': 'green', + 'figure': 'yellow', + 'textline': 'magenta', + 'polygon': 'black', + 'textbox': 'cyan', + 'textgroup': 'red', + 'page': 'gray', + } + TEXT_COLORS = { + 'char': 'black', + 'textbox': 'black', + } + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, scale=1, showpageno=True, pagepad=50, outdir=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) @@ -204,25 +218,29 @@ class HTMLConverter(PDFConverter): return def write_rect(self, color, width, x, y, w, h): - self.outfp.write('\n' % - (color, width, - x*self.scale, (self.yoffset-y)*self.scale, - w*self.scale, h*self.scale)) + color = self.RECT_COLORS.get(color) + if color is not None: + self.outfp.write('\n' % + (color, width, + x*self.scale, (self.yoffset-y)*self.scale, + w*self.scale, h*self.scale)) return - def write_text(self, text, x, y, size): - self.outfp.write('' % - (x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) - self.write(text) - self.outfp.write('\n') + def write_text(self, color, text, x, y, size): + color = self.TEXT_COLORS.get(color) + if color is not None: + self.outfp.write('' % + (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) + self.write(text) + self.outfp.write('\n') return def receive_layout(self, ltpage): def render(item): if isinstance(item, LTPage): self.yoffset += item.y1 - self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('page', 1, item.x0, item.y1, item.width, item.height) if self.showpageno: self.outfp.write('
' % ((self.yoffset-item.y1)*self.scale)) @@ -230,23 +248,21 @@ class HTMLConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTChar): - self.write_text(item.text, item.x0, item.y1, item.get_size()) - if self.debug: - self.write_rect('green', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('char', 1, item.x0, item.y1, item.width, item.height) + self.write_text('char', item.text, item.x0, item.y1, item.size) elif isinstance(item, LTPolygon): - self.write_rect('black', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('polygon', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTTextLine): - self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('textline', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTTextBox): - self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('textbox', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) - if self.debug: - self.write_text(str(item.index+1), item.x0, item.y1, 20) + self.write_text('textbox', str(item.index+1), item.x0, item.y1, 20) elif isinstance(item, LTFigure): - self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('figure', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTImage): @@ -262,7 +278,7 @@ class HTMLConverter(PDFConverter): if self.debug and ltpage.layout: def show_layout(item): if isinstance(item, LTTextGroup): - self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('textgroup', 1, item.x0, item.y1, item.width, item.height) for child in item: show_layout(child) return @@ -326,7 +342,7 @@ class XMLConverter(PDFConverter): self.outfp.write('\n') elif isinstance(item, LTChar): self.outfp.write('' % - (enc(item.font.fontname), bbox2str(item.bbox), item.get_size())) + (enc(item.font.fontname), bbox2str(item.bbox), item.size)) self.write(item.text) self.outfp.write('\n') elif isinstance(item, LTText): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index a8aae96..ac3ac55 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -225,6 +225,10 @@ class LTChar(LTItem, LTText): bbox = (apply_matrix_pt(self.matrix, bll) + apply_matrix_pt(self.matrix, bur)) LTItem.__init__(self, bbox) + if self.font.is_vertical(): + self.size = self.width + else: + self.size = self.height return def __repr__(self): @@ -236,9 +240,6 @@ class LTChar(LTItem, LTText): else: return '' % self.text - def get_size(self): - return max(self.width, self.height) - def is_compatible(self, obj): return True