diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65c85a1..763cfab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- Fix font name in HTML output so that it is recognized by browsers ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 1dc9583..3aa2e2a 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -292,7 +292,7 @@ class HTMLConverter(PDFConverter):
return
def write_text(self, text):
- self.write(enc(text, None))
+ self.write(enc(text))
return
def place_rect(self, color, borderwidth, x, y, w, h):
@@ -317,7 +317,7 @@ class HTMLConverter(PDFConverter):
name = self.imagewriter.export_image(item)
s = '\n' % \
- (enc(name, None), borderwidth, x * self.scale,
+ (enc(name), borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(s)
@@ -358,8 +358,11 @@ class HTMLConverter(PDFConverter):
if font != self._font:
if self._font is not None:
self.write('')
+ # Remove subset tag from fontname, see PDF Reference 5.5.3
+ fontname_without_subset_tag = fontname.split('+')[-1]
self.write('' %
- (enc(fontname), fontsize * self.scale * self.fontscale))
+ (fontname_without_subset_tag,
+ fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
return
@@ -479,7 +482,7 @@ class XMLConverter(PDFConverter):
def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub('', text)
- self.write(enc(text, None))
+ self.write(enc(text))
return
def receive_layout(self, ltpage):
@@ -544,7 +547,7 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTChar):
s = '' % \
- (enc(item.fontname, None), bbox2str(item.bbox),
+ (enc(item.fontname), bbox2str(item.bbox),
item.ncs.name, item.graphicstate.ncolor, item.size)
self.write(s)
self.write_text(item.get_text())
@@ -555,7 +558,7 @@ class XMLConverter(PDFConverter):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('\n' %
- (enc(name, None), item.width, item.height))
+ (enc(name), item.width, item.height))
else:
self.write('\n' %
(item.width, item.height))
diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py
index e8b3dba..1f66a31 100644
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@@ -156,7 +156,7 @@ class TagExtractor(PDFDevice):
except PDFUnicodeNotDefined:
print(chars)
pass
- self.outfp.write(utils.enc(text, self.codec))
+ self.outfp.write(utils.enc(text))
return
def begin_page(self, page, ctm):
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index feaa8a3..a16d280 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -2,6 +2,8 @@
Miscellaneous Routines.
"""
import struct
+from html import escape
+
import chardet # For str encoding detection
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
@@ -250,15 +252,11 @@ def decode_text(s):
return ''.join(PDFDocEncoding[c] for c in s)
-def enc(x, codec='ascii'):
+def enc(x):
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
return ''
- x = x.replace('&', '&').replace('>', '>').replace('<', '<') \
- .replace('"', '"')
- if codec:
- x = x.encode(codec, 'xmlcharrefreplace')
- return x
+ return escape(x)
def bbox2str(bbox):