From e27cd54aff17673727ca5ad5ed3013e5fd581396 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Mon, 21 Mar 2022 19:20:42 +0100 Subject: [PATCH] Convert fontname to str if it is bytes in HTMLConverter (#734) * Convert fontname to str if it is bytes * Add CHANGELOG.md --- CHANGELOG.md | 11 ++++++++++- pdfminer/converter.py | 5 +++-- pdfminer/utils.py | 5 ++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22b6c56..a8e7d9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,26 @@ # Changelog -All notable changes in pdfminer.six will be documented in this file. + +All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [Unreleased] + +### Fixed + +- `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734)) + ## [20220319] ### Added + - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679)) - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626)) - Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680)) - Installation of Pillow as an optional extra dependency ([#714](https://github.com/pdfminer/pdfminer.six/pull/714)) ### Fixed + - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Regression (since 20191107) in `LTLayoutContainer.group_textboxes` that returned some text lines out of order ([#659](https://github.com/pdfminer/pdfminer.six/pull/659)) - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 3516c78..a414799 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -40,7 +40,7 @@ from .pdffont import 
PDFUnicodeNotDefined from .pdfinterp import PDFGraphicState, PDFResourceManager from .pdfpage import PDFPage from .pdftypes import PDFStream -from .utils import AnyIO, Point, Matrix, Rect, PathSegment +from .utils import AnyIO, Point, Matrix, Rect, PathSegment, make_compat_str from .utils import apply_matrix_pt from .utils import bbox2str from .utils import enc @@ -633,7 +633,8 @@ class HTMLConverter(PDFConverter[AnyIO]): render(child) self.end_div("textbox") elif isinstance(item, LTChar): - self.put_text(item.get_text(), item.fontname, item.size) + fontname = make_compat_str(item.fontname) + self.put_text(item.get_text(), fontname, item.size) elif isinstance(item, LTText): self.write_text(item.get_text()) return diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 6a35d34..3f6cca5 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -76,7 +76,10 @@ def make_compat_str(o: object) -> str: """Converts everything to string, if bytes guessing the encoding.""" if isinstance(o, bytes): enc = chardet.detect(o) - return o.decode(enc["encoding"]) + try: + return o.decode(enc["encoding"]) + except UnicodeDecodeError: + return str(o) else: return str(o)