diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 9480b22..23091be 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -91,10 +91,22 @@ class PDFLayoutAnalyzer(PDFTextDevice): return def render_char(self, matrix, font, fontsize, scaling, rise, cid): - item = LTChar(matrix, font, fontsize, scaling, rise, cid) + try: + text = font.to_unichr(cid) + assert isinstance(text, unicode), text + except PDFUnicodeNotDefined: + text = self.handle_undefined_char(font, cid) + textwidth = font.char_width(cid) + textdisp = font.char_disp(cid) + item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp) self.cur_item.add(item) return item.adv + def handle_undefined_char(self, font, cid): + if self.debug: + print >>sys.stderr, 'undefined: %r, %r' % (font, cid) + return '?' + def receive_layout(self, ltpage): return @@ -126,10 +138,6 @@ class PDFConverter(PDFLayoutAnalyzer): self.codec = codec return - def write(self, text): - self.outfp.write(enc(text, self.codec)) - return - def write_image(self, image): stream = image.stream filters = stream.get_filters() @@ -163,7 +171,7 @@ class TextConverter(PDFConverter): self.showpageno = showpageno return - def write(self, text): + def write_text(self, text): self.outfp.write(text.encode(self.codec, 'ignore')) return @@ -173,13 +181,13 @@ class TextConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTText): - self.write(item.text) + self.write_text(item.text) if isinstance(item, LTTextBox): - self.write('\n') + self.write_text('\n') if self.showpageno: - self.write('Page %s\n' % ltpage.pageid) + self.write_text('Page %s\n' % ltpage.pageid) render(ltpage) - self.write('\f') + self.write_text('\f') return @@ -191,92 +199,152 @@ class HTMLConverter(PDFConverter): #'char': 'green', #'figure': 'yellow', #'textline': 'magenta', - 'textbox': 'cyan', - 'textgroup': 'red', + #'textbox': 'cyan', + #'textgroup': 'red', 'polygon': 'black', 'page': 'gray', } TEXT_COLORS = { - 'textbox': 'blue', + #'textbox': 'blue', 'char': 'black', } def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, showpageno=True, pagepad=50, outdir=None): + scale=1, exact=False, showpageno=True, pagepad=50, outdir=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) + self.exact = exact self.showpageno = showpageno self.pagepad = pagepad self.outdir = outdir self.scale = scale - self.outfp.write('\n') - self.outfp.write('\n' % - self.codec) - self.outfp.write('\n') + self.fontscale = 0.7 + self.write('\n') + self.write('\n' % self.codec) + self.write('\n') self.yoffset = self.pagepad + self._font = None + self._fontstack = [] return - def write_rect(self, color, width, x, y, w, h): + def write(self, text): + self.outfp.write(text) + return + + def write_text(self, text): + self.write(enc(text, self.codec)) + return + + def place_rect(self, color, borderwidth, x, y, w, h): color = self.RECT_COLORS.get(color) if color is not None: - self.outfp.write('\n' % - (color, width, - x*self.scale, (self.yoffset-y)*self.scale, - w*self.scale, h*self.scale)) + self.write('\n' % + (color, borderwidth, + x*self.scale, (self.yoffset-y)*self.scale, + w*self.scale, h*self.scale)) return - def write_text(self, color, text, x, y, size): + def place_image(self, item, borderwidth, x, y, w, h): + if self.outdir is not None: + name = self.write_image(item) + self.write('\n' % + (enc(name), borderwidth, + x*self.scale, (self.yoffset-y)*self.scale, + w*self.scale, h*self.scale)) + return + + def place_text(self, color, text, x, y, size): color = self.TEXT_COLORS.get(color) if color is not None: - self.outfp.write('' % - (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) - self.write(text) - self.outfp.write('\n') + self.write('' % + (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) + self.write_text(text) + self.write('\n') + return + + def begin_textbox(self, color, borderwidth, x, y, w, h, writing_mode): + self._fontstack.append(self._font) + self._font = None + self.write('
' % + (color, borderwidth, writing_mode, + x*self.scale, (self.yoffset-y)*self.scale, + w*self.scale, h*self.scale)) + return + + def put_text(self, text, fontname, fontsize): + font = (fontname, fontsize) + if font != self._font: + if self._font is not None: + self.write('') + self.write('' % + (fontsize * self.scale * self.fontscale)) + self._font = font + self.write_text(text) + return + + def end_textbox(self, color): + if self._font is not None: + self.write('') + self._font = self._fontstack.pop() + self.write('
') return def receive_layout(self, ltpage): def render(item): if isinstance(item, LTPage): self.yoffset += item.y1 - self.write_rect('page', 1, item.x0, item.y1, item.width, item.height) + self.place_rect('page', 1, item.x0, item.y1, item.width, item.height) if self.showpageno: - self.outfp.write('
' % - ((self.yoffset-item.y1)*self.scale)) - self.outfp.write('Page %s
\n' % (item.pageid, item.pageid)) + self.write('
' % + ((self.yoffset-item.y1)*self.scale)) + self.write('Page %s
\n' % (item.pageid, item.pageid)) for child in item: render(child) - elif isinstance(item, LTChar): - self.write_rect('char', 1, item.x0, item.y1, item.width, item.height) - self.write_text('char', item.text, item.x0, item.y1, item.size) elif isinstance(item, LTPolygon): - self.write_rect('polygon', 1, item.x0, item.y1, item.width, item.height) - elif isinstance(item, LTTextLine): - self.write_rect('textline', 1, item.x0, item.y1, item.width, item.height) - for child in item: - render(child) - elif isinstance(item, LTTextBox): - self.write_rect('textbox', 1, item.x0, item.y1, item.width, item.height) - for child in item: - render(child) - self.write_text('textbox', str(item.index+1), item.x0, item.y1, 20) + self.place_rect('polygon', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTFigure): - self.write_rect('figure', 1, item.x0, item.y1, item.width, item.height) + self.place_rect('figure', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTImage): - if self.outdir: - name = self.write_image(item) - self.outfp.write('\n' % - (enc(name), - item.x0*self.scale, (self.yoffset-item.y1)*self.scale, - item.width*self.scale, item.height*self.scale)) + self.place_image(item, 1, item.x0, item.y1, item.width, item.height) + else: + if self.exact: + if isinstance(item, LTTextLine): + self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height) + for child in item: + render(child) + elif isinstance(item, LTTextBox): + self.place_rect('textbox', 1, item.x0, item.y1, item.width, item.height) + self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20) + for child in item: + render(child) + elif isinstance(item, LTChar): + self.place_rect('char', 1, item.x0, item.y1, item.width, item.height) + self.place_text('char', item.text, item.x0, item.y1, item.size) + else: + if isinstance(item, LTTextLine): + for child in item: + render(child) + self.write('
') + elif isinstance(item, LTTextBox): + self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height, + item.get_writing_mode()) + for child in item: + render(child) + self.end_textbox('textbox') + elif isinstance(item, LTChar): + self.put_text(item.text, item.fontname, item.size) + elif isinstance(item, LTText): + self.write_text(item.text) return render(ltpage) if ltpage.layout: def show_layout(item): if isinstance(item, LTTextGroup): - self.write_rect('textgroup', 1, item.x0, item.y1, item.width, item.height) + self.place_rect('textgroup', 1, item.x0, item.y1, item.width, item.height) for child in item: show_layout(child) return @@ -285,9 +353,9 @@ class HTMLConverter(PDFConverter): return def close(self): - self.outfp.write('
Page: %s
\n' % - ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) - self.outfp.write('\n') + self.write('
Page: %s
\n' % + ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) + self.write('\n') return @@ -302,6 +370,10 @@ class XMLConverter(PDFConverter): self.outfp.write('\n') return + def write_text(self, text): + self.outfp.write(enc(text, self.codec)) + return + def receive_layout(self, ltpage): def render(item): if isinstance(item, LTPage): @@ -341,7 +413,7 @@ class XMLConverter(PDFConverter): elif isinstance(item, LTChar): self.outfp.write('' % (enc(item.font.fontname), bbox2str(item.bbox), item.size)) - self.write(item.text) + self.write_text(item.text) self.outfp.write('\n') elif isinstance(item, LTText): self.outfp.write('%s\n' % item.text) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index ee6f869..6daa592 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -183,22 +183,16 @@ class LTChar(LTItem, LTText): debug = 0 - def __init__(self, matrix, font, fontsize, scaling, rise, cid): - self.matrix = matrix - self.font = font - self.fontsize = fontsize - self.adv = font.char_width(cid) * fontsize * scaling - try: - text = font.to_unichr(cid) - assert isinstance(text, unicode), text - except PDFUnicodeNotDefined: - text = '?' + def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp): LTText.__init__(self, text) + self.matrix = matrix + self.fontname = font.fontname + self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. - if self.font.is_vertical(): + if font.is_vertical(): # vertical width = font.get_width() * fontsize - (vx,vy) = font.char_disp(cid) + (vx,vy) = textdisp if vx is None: vx = width/2 else: @@ -224,7 +218,7 @@ class LTChar(LTItem, LTText): if y1 < y0: (y0,y1) = (y1,y0) LTItem.__init__(self, (x0,y0,x1,y1)) - if self.font.is_vertical(): + if font.is_vertical(): self.size = self.width else: self.size = self.height @@ -232,9 +226,9 @@ class LTChar(LTItem, LTText): def __repr__(self): if self.debug: - return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' % + return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % (self.__class__.__name__, bbox2str(self.bbox), - matrix2str(self.matrix), self.font, self.fontsize, + matrix2str(self.matrix), self.fontname, self.adv, self.text)) else: return '' % self.text @@ -378,12 +372,18 @@ class LTTextBoxHorizontal(LTTextBox): self._objs = csort(self._objs, key=lambda obj: -obj.y1) return LTTextBox.finish(self) + def get_writing_mode(self): + return 'lr-tb' + class LTTextBoxVertical(LTTextBox): def finish(self): self._objs = csort(self._objs, key=lambda obj: -obj.x1) return LTTextBox.finish(self) + def get_writing_mode(self): + return 'tb-rl' + ## LTTextGroup ##