diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index e93b055..4a530ff 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -243,12 +243,17 @@ class HTMLConverter(PDFConverter):
return
def write(self, text):
+ if self.codec:
+ text = text.encode(self.codec)
self.outfp.write(text)
return
def write_header(self):
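self.write('<html><head>\n')
- self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
+ if self.codec:
+ self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
+ else:
+ self.write('<meta http-equiv="Content-Type" content="text/html">\n')
self.write('</head><body>\n')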
return
@@ -259,7 +264,7 @@ class HTMLConverter(PDFConverter):
return
def write_text(self, text):
- self.write(enc(text, self.codec))
+ self.write(enc(text, None))
return
def place_rect(self, color, borderwidth, x, y, w, h):
@@ -281,7 +286,7 @@ class HTMLConverter(PDFConverter):
name = self.imagewriter.export_image(item)
self.write('\n' %
- (enc(name), borderwidth,
+ (enc(name, None), borderwidth,
x*self.scale, (self._yoffset-y)*self.scale,
w*self.scale, h*self.scale))
return
@@ -411,88 +416,97 @@ class XMLConverter(PDFConverter):
self.write_header()
return
+ def write(self, text):
+ if self.codec:
+ text = text.encode(self.codec)
+ self.outfp.write(text)
+ return
+
def write_header(self):
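- self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
- self.outfp.write('<pages>\n')
+ if self.codec:
+ self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
+ else:
+ self.write('<?xml version="1.0" ?>\n')
+ self.write('<pages>\n')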
return
def write_footer(self):
- self.outfp.write('</pages>\n')
+ self.write('</pages>\n')
return
def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub(u'', text)
- self.outfp.write(enc(text, self.codec))
+ self.write(enc(text, None))
return
def receive_layout(self, ltpage):
def show_group(item):
if isinstance(item, LTTextBox):
- self.outfp.write('\n' %
+ self.write('\n' %
(item.index, bbox2str(item.bbox)))
elif isinstance(item, LTTextGroup):
- self.outfp.write('\n' % bbox2str(item.bbox))
+ self.write('\n' % bbox2str(item.bbox))
for child in item:
show_group(child)
- self.outfp.write('\n')
+ self.write('\n')
return
def render(item):
if isinstance(item, LTPage):
- self.outfp.write('\n' %
+ self.write('\n' %
(item.pageid, bbox2str(item.bbox), item.rotate))
for child in item:
render(child)
if item.groups is not None:
- self.outfp.write('\n')
+ self.write('\n')
for group in item.groups:
show_group(group)
- self.outfp.write('\n')
- self.outfp.write('\n')
+ self.write('\n')
+ self.write('\n')
elif isinstance(item, LTLine):
- self.outfp.write('\n' %
+ self.write('\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTRect):
- self.outfp.write('\n' %
+ self.write('\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTCurve):
- self.outfp.write('\n' %
+ self.write('\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
- self.outfp.write('\n')
elif isinstance(item, LTTextLine):
- self.outfp.write('\n' % bbox2str(item.bbox))
+ self.write('\n' % bbox2str(item.bbox))
for child in item:
render(child)
- self.outfp.write('\n')
+ self.write('\n')
elif isinstance(item, LTTextBox):
wmode = ''
if isinstance(item, LTTextBoxVertical):
wmode = ' wmode="vertical"'
- self.outfp.write('\n' %
+ self.write('\n' %
(item.index, bbox2str(item.bbox), wmode))
for child in item:
render(child)
- self.outfp.write('\n')
+ self.write('\n')
elif isinstance(item, LTChar):
- self.outfp.write('' %
- (enc(item.fontname), bbox2str(item.bbox), item.size))
+ self.write('' %
+ (enc(item.fontname, None), bbox2str(item.bbox), item.size))
self.write_text(item.get_text())
- self.outfp.write('\n')
+ self.write('\n')
elif isinstance(item, LTText):
- self.outfp.write('%s\n' % item.get_text())
+ self.write('%s\n' % item.get_text())
elif isinstance(item, LTImage):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
- self.outfp.write('\n' %
- (enc(name), item.width, item.height))
+ self.write('\n' %
+ (enc(name, None), item.width, item.height))
else:
- self.outfp.write('\n' %
+ self.write('\n' %
(item.width, item.height))
else:
assert 0, item
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 0ec01cf..44c01bd 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -233,7 +233,9 @@ def decode_text(s):
def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML"""
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
- return x.encode(codec, 'xmlcharrefreplace')
+ if codec:
+ x = x.encode(codec, 'xmlcharrefreplace')
+ return x
def bbox2str(bbox):