Fix value for font-family in html by removing the subset tag from the PDF font-name (#357)
* Fix font name by removing subset tag
* Added line to CHANGELOG.md
* Add documentation and clear variable name
* Use `html.escape()` to encode strings for html and always return `str` instead of `bytes`
pull/364/head
parent
fff3ac2ba6
commit
410d7ecac3
|
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
||||||
|
- Fix font name in html output such that it is recognized by browser ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
|
||||||
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
|
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
|
||||||
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
||||||
|
|
||||||
|
|
|
@ -292,7 +292,7 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
self.write(enc(text, None))
|
self.write(enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_rect(self, color, borderwidth, x, y, w, h):
|
def place_rect(self, color, borderwidth, x, y, w, h):
|
||||||
|
@ -317,7 +317,7 @@ class HTMLConverter(PDFConverter):
|
||||||
name = self.imagewriter.export_image(item)
|
name = self.imagewriter.export_image(item)
|
||||||
s = '<img src="%s" border="%d" style="position:absolute; ' \
|
s = '<img src="%s" border="%d" style="position:absolute; ' \
|
||||||
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
|
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
|
||||||
(enc(name, None), borderwidth, x * self.scale,
|
(enc(name), borderwidth, x * self.scale,
|
||||||
(self._yoffset - y) * self.scale, w * self.scale,
|
(self._yoffset - y) * self.scale, w * self.scale,
|
||||||
h * self.scale)
|
h * self.scale)
|
||||||
self.write(s)
|
self.write(s)
|
||||||
|
@ -358,8 +358,11 @@ class HTMLConverter(PDFConverter):
|
||||||
if font != self._font:
|
if font != self._font:
|
||||||
if self._font is not None:
|
if self._font is not None:
|
||||||
self.write('</span>')
|
self.write('</span>')
|
||||||
|
# Remove subset tag from fontname, see PDF Reference 5.5.3
|
||||||
|
fontname_without_subset_tag = fontname.split('+')[-1]
|
||||||
self.write('<span style="font-family: %s; font-size:%dpx">' %
|
self.write('<span style="font-family: %s; font-size:%dpx">' %
|
||||||
(enc(fontname), fontsize * self.scale * self.fontscale))
|
(fontname_without_subset_tag,
|
||||||
|
fontsize * self.scale * self.fontscale))
|
||||||
self._font = font
|
self._font = font
|
||||||
self.write_text(text)
|
self.write_text(text)
|
||||||
return
|
return
|
||||||
|
@ -479,7 +482,7 @@ class XMLConverter(PDFConverter):
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
if self.stripcontrol:
|
if self.stripcontrol:
|
||||||
text = self.CONTROL.sub('', text)
|
text = self.CONTROL.sub('', text)
|
||||||
self.write(enc(text, None))
|
self.write(enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage):
|
||||||
|
@ -544,7 +547,7 @@ class XMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
|
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
|
||||||
'ncolour="%s" size="%.3f">' % \
|
'ncolour="%s" size="%.3f">' % \
|
||||||
(enc(item.fontname, None), bbox2str(item.bbox),
|
(enc(item.fontname), bbox2str(item.bbox),
|
||||||
item.ncs.name, item.graphicstate.ncolor, item.size)
|
item.ncs.name, item.graphicstate.ncolor, item.size)
|
||||||
self.write(s)
|
self.write(s)
|
||||||
self.write_text(item.get_text())
|
self.write_text(item.get_text())
|
||||||
|
@ -555,7 +558,7 @@ class XMLConverter(PDFConverter):
|
||||||
if self.imagewriter is not None:
|
if self.imagewriter is not None:
|
||||||
name = self.imagewriter.export_image(item)
|
name = self.imagewriter.export_image(item)
|
||||||
self.write('<image src="%s" width="%d" height="%d" />\n' %
|
self.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||||
(enc(name, None), item.width, item.height))
|
(enc(name), item.width, item.height))
|
||||||
else:
|
else:
|
||||||
self.write('<image width="%d" height="%d" />\n' %
|
self.write('<image width="%d" height="%d" />\n' %
|
||||||
(item.width, item.height))
|
(item.width, item.height))
|
||||||
|
|
|
@ -156,7 +156,7 @@ class TagExtractor(PDFDevice):
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
print(chars)
|
print(chars)
|
||||||
pass
|
pass
|
||||||
self.outfp.write(utils.enc(text, self.codec))
|
self.outfp.write(utils.enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
Miscellaneous Routines.
|
Miscellaneous Routines.
|
||||||
"""
|
"""
|
||||||
import struct
|
import struct
|
||||||
|
from html import escape
|
||||||
|
|
||||||
import chardet # For str encoding detection
|
import chardet # For str encoding detection
|
||||||
|
|
||||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
||||||
|
@ -250,15 +252,11 @@ def decode_text(s):
|
||||||
return ''.join(PDFDocEncoding[c] for c in s)
|
return ''.join(PDFDocEncoding[c] for c in s)
|
||||||
|
|
||||||
|
|
||||||
def enc(x, codec='ascii'):
|
def enc(x):
|
||||||
"""Encodes a string for SGML/XML/HTML"""
|
"""Encodes a string for SGML/XML/HTML"""
|
||||||
if isinstance(x, bytes):
|
if isinstance(x, bytes):
|
||||||
return ''
|
return ''
|
||||||
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;') \
|
return escape(x)
|
||||||
.replace('"', '&quot;')
|
|
||||||
if codec:
|
|
||||||
x = x.encode(codec, 'xmlcharrefreplace')
|
|
||||||
return x
|
|
||||||
|
|
||||||
|
|
||||||
def bbox2str(bbox):
|
def bbox2str(bbox):
|
||||||
|
|
Loading…
Reference in New Issue