diff --git a/pdfminer/converter.py b/pdfminer/converter.py index e93b055..4a530ff 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -243,12 +243,17 @@ class HTMLConverter(PDFConverter): return def write(self, text): + if self.codec: + text = text.encode(self.codec) self.outfp.write(text) return def write_header(self): self.write('\n') - self.write('\n' % self.codec) + if self.codec: + self.write('\n' % self.codec) + else: + self.write('\n') self.write('\n') return @@ -259,7 +264,7 @@ class HTMLConverter(PDFConverter): return def write_text(self, text): - self.write(enc(text, self.codec)) + self.write(enc(text, None)) return def place_rect(self, color, borderwidth, x, y, w, h): @@ -281,7 +286,7 @@ class HTMLConverter(PDFConverter): name = self.imagewriter.export_image(item) self.write('\n' % - (enc(name), borderwidth, + (enc(name, None), borderwidth, x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return @@ -411,88 +416,97 @@ class XMLConverter(PDFConverter): self.write_header() return + def write(self, text): + if self.codec: + text = text.encode(self.codec) + self.outfp.write(text) + return + def write_header(self): - self.outfp.write('\n' % self.codec) - self.outfp.write('\n') + if self.codec: + self.write('\n' % self.codec) + else: + self.write('\n') + self.write('\n') return def write_footer(self): - self.outfp.write('\n') + self.write('\n') return def write_text(self, text): if self.stripcontrol: text = self.CONTROL.sub(u'', text) - self.outfp.write(enc(text, self.codec)) + self.write(enc(text, None)) return def receive_layout(self, ltpage): def show_group(item): if isinstance(item, LTTextBox): - self.outfp.write('\n' % + self.write('\n' % (item.index, bbox2str(item.bbox))) elif isinstance(item, LTTextGroup): - self.outfp.write('\n' % bbox2str(item.bbox)) + self.write('\n' % bbox2str(item.bbox)) for child in item: show_group(child) - self.outfp.write('\n') + self.write('\n') return def render(item): if isinstance(item, LTPage): - self.outfp.write('\n' % + self.write('\n' % (item.pageid, bbox2str(item.bbox), item.rotate)) for child in item: render(child) if item.groups is not None: - self.outfp.write('\n') + self.write('\n') for group in item.groups: show_group(group) - self.outfp.write('\n') - self.outfp.write('\n') + self.write('\n') + self.write('\n') elif isinstance(item, LTLine): - self.outfp.write('\n' % + self.write('\n' % (item.linewidth, bbox2str(item.bbox))) elif isinstance(item, LTRect): - self.outfp.write('\n' % + self.write('\n' % (item.linewidth, bbox2str(item.bbox))) elif isinstance(item, LTCurve): - self.outfp.write('\n' % + self.write('\n' % (item.linewidth, bbox2str(item.bbox), item.get_pts())) elif isinstance(item, LTFigure): - self.outfp.write('
\n' % + self.write('
\n' % (item.name, bbox2str(item.bbox))) for child in item: render(child) - self.outfp.write('
\n') + self.write('
\n') elif isinstance(item, LTTextLine): - self.outfp.write('\n' % bbox2str(item.bbox)) + self.write('\n' % bbox2str(item.bbox)) for child in item: render(child) - self.outfp.write('\n') + self.write('\n') elif isinstance(item, LTTextBox): wmode = '' if isinstance(item, LTTextBoxVertical): wmode = ' wmode="vertical"' - self.outfp.write('\n' % + self.write('\n' % (item.index, bbox2str(item.bbox), wmode)) for child in item: render(child) - self.outfp.write('\n') + self.write('\n') elif isinstance(item, LTChar): - self.outfp.write('' % - (enc(item.fontname), bbox2str(item.bbox), item.size)) + self.write('' % + (enc(item.fontname, None), bbox2str(item.bbox), item.size)) self.write_text(item.get_text()) - self.outfp.write('\n') + self.write('\n') elif isinstance(item, LTText): - self.outfp.write('%s\n' % item.get_text()) + self.write('%s\n' % item.get_text()) elif isinstance(item, LTImage): if self.imagewriter is not None: name = self.imagewriter.export_image(item) - self.outfp.write('\n' % - (enc(name), item.width, item.height)) + self.write('\n' % + (enc(name, None), item.width, item.height)) else: - self.outfp.write('\n' % + self.write('\n' % (item.width, item.height)) else: assert 0, item diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 0ec01cf..44c01bd 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -233,7 +233,9 @@ def decode_text(s): def enc(x, codec='ascii'): """Encodes a string for SGML/XML/HTML""" x = x.replace('&', '&').replace('>', '>').replace('<', '<').replace('"', '"') - return x.encode(codec, 'xmlcharrefreplace') + if codec: + x = x.encode(codec, 'xmlcharrefreplace') + return x def bbox2str(bbox):