diff --git a/CHANGELOG.md b/CHANGELOG.md index 65c85a1..763cfab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352)) +- Fix font name in html output such that it is recognized by browser ([#357](https://github.com/pdfminer/pdfminer.six/pull/357)) - Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348)) - KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 1dc9583..3aa2e2a 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -292,7 +292,7 @@ class HTMLConverter(PDFConverter): return def write_text(self, text): - self.write(enc(text, None)) + self.write(enc(text)) return def place_rect(self, color, borderwidth, x, y, w, h): @@ -317,7 +317,7 @@ class HTMLConverter(PDFConverter): name = self.imagewriter.export_image(item) s = '\n' % \ - (enc(name, None), borderwidth, x * self.scale, + (enc(name), borderwidth, x * self.scale, (self._yoffset - y) * self.scale, w * self.scale, h * self.scale) self.write(s) @@ -358,8 +358,11 @@ class HTMLConverter(PDFConverter): if font != self._font: if self._font is not None: self.write('') + # Remove subset tag from fontname, see PDF Reference 5.5.3 + fontname_without_subset_tag = fontname.split('+')[-1] self.write('' % - (enc(fontname), fontsize * self.scale * self.fontscale)) + (enc(fontname_without_subset_tag), + fontsize * self.scale * self.fontscale)) self._font = font self.write_text(text) return @@ -479,7 +482,7 @@ class XMLConverter(PDFConverter): def write_text(self, text): if self.stripcontrol: text = self.CONTROL.sub('', text) - self.write(enc(text, None)) + 
self.write(enc(text)) return def receive_layout(self, ltpage): @@ -544,7 +547,7 @@ class XMLConverter(PDFConverter): elif isinstance(item, LTChar): s = '' % \ - (enc(item.fontname, None), bbox2str(item.bbox), + (enc(item.fontname), bbox2str(item.bbox), item.ncs.name, item.graphicstate.ncolor, item.size) self.write(s) self.write_text(item.get_text()) @@ -555,7 +558,7 @@ class XMLConverter(PDFConverter): if self.imagewriter is not None: name = self.imagewriter.export_image(item) self.write('\n' % - (enc(name, None), item.width, item.height)) + (enc(name), item.width, item.height)) else: self.write('\n' % (item.width, item.height)) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index e8b3dba..1f66a31 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -156,7 +156,7 @@ class TagExtractor(PDFDevice): except PDFUnicodeNotDefined: print(chars) pass - self.outfp.write(utils.enc(text, self.codec)) + self.outfp.write(utils.enc(text)) return def begin_page(self, page, ctm): diff --git a/pdfminer/utils.py b/pdfminer/utils.py index feaa8a3..a16d280 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -2,6 +2,8 @@ Miscellaneous Routines. """ import struct +from html import escape + import chardet # For str encoding detection # from sys import maxint as INF doesn't work anymore under Python3, but PDF @@ -250,15 +252,11 @@ def decode_text(s): return ''.join(PDFDocEncoding[c] for c in s) -def enc(x, codec='ascii'): +def enc(x): """Encodes a string for SGML/XML/HTML""" if isinstance(x, bytes): return '' - x = x.replace('&', '&').replace('>', '>').replace('<', '<') \ - .replace('"', '"') - if codec: - x = x.encode(codec, 'xmlcharrefreplace') - return x + return escape(x) def bbox2str(bbox):