From 9439a3a31a347836aad1c1226168156125d9505f Mon Sep 17 00:00:00 2001 From: Andrew Baumann <0xabu@users.noreply.github.com> Date: Mon, 6 Feb 2017 05:57:01 -0800 Subject: [PATCH] Miscellaneous bug fixes (#47) * utils.decode_text: fix "TypeError: ord() expected string of length 1, but int found" fixes https://github.com/goulu/pdfminer/issues/24 * pdfinterp.execute: don't assume that every keyword name can be decoded as utf-8 fixes "'str' does not support the buffer interface", https://github.com/goulu/pdfminer/issues/23 * default settings.STRICT to False, for compatibility with the original pdfminer * PDFCIDFont: handle font registry/orderings that may be PDFObjRefs * utils.nunpack: handle 8-byte integers --- pdfminer/pdffont.py | 4 ++-- pdfminer/psparser.py | 5 +---- pdfminer/settings.py | 2 +- pdfminer/utils.py | 6 ++++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 00361fa..ae18f0b 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont): raise PDFFontError('BaseFont is missing') self.basefont = 'unknown' self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) - self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"), - self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1")) + self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), + resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) try: name = literal_name(spec['Encoding']) except KeyError: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 644a8c0..6ebd583 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -160,10 +160,7 @@ def keyword_name(x): else: name=x.name if six.PY3: - try: - name = str(name,'utf-8') - except: - pass + name = str(name,'utf-8','ignore') return name diff --git a/pdfminer/settings.py b/pdfminer/settings.py index 344d066..2dd99c0 100644 --- a/pdfminer/settings.py +++ b/pdfminer/settings.py @@ -1,4 +1,4 @@ -STRICT = True +STRICT = False try: from django.conf import settings diff --git a/pdfminer/utils.py b/pdfminer/utils.py index a6ccabe..50e6447 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -211,7 +211,7 @@ def choplist(n, seq): # nunpack def nunpack(s, default=0): - """Unpacks 1 to 4 byte integers (big endian).""" + """Unpacks 1 to 4 or 8 byte integers (big endian).""" l = len(s) if not l: return default @@ -223,6 +223,8 @@ def nunpack(s, default=0): return struct.unpack('>L', b'\x00'+s)[0] elif l == 4: return struct.unpack('>L', s)[0] + elif l == 8: + return struct.unpack('>Q', s)[0] else: raise TypeError('invalid length: %d' % l) @@ -269,7 +271,7 @@ def decode_text(s): if s.startswith(b'\xfe\xff'): return six.text_type(s[2:], 'utf-16be', 'ignore') else: - return ''.join(PDFDocEncoding[ord(c)] for c in s) + return ''.join(PDFDocEncoding[c] for c in s) # enc