From ecc4d056754a528706f7bf43f8754c448ea2dd4b Mon Sep 17 00:00:00 2001
From: speedplane
Date: Tue, 11 Nov 2014 23:34:33 -0500
Subject: [PATCH] Fix a unicode conversion bug. See https://github.com/euske/pdfminer/issues/75

---
 pdfminer/psparser.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index c1ebe93..be715af 100644
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -343,7 +343,15 @@ class PSBaseParser(object):
             self.hex = b''
             self._parse1 = self._parse_literal_hex
             return j+1
-        self._add_token(LIT(unicode(self._curtoken)))
+
+        try:
+            # Try to interpret the token as a utf-8 string
+            utoken = self._curtoken.decode('utf-8')
+        except UnicodeDecodeError:
+            # We failed, there is possibly a corrupt PDF here.
+            if STRICT: raise
+            utoken = ""
+        self._add_token(LIT(utoken))
         self._parse1 = self._parse_main
         return j
