Miscellaneous bug fixes (#47)

* utils.decode_text: fix "TypeError: ord() expected string of length 1, but int found"

fixes https://github.com/goulu/pdfminer/issues/24

* pdfinterp.execute: don't assume that every keyword name can be decoded as utf-8

fixes "'str' does not support the buffer interface", https://github.com/goulu/pdfminer/issues/23

* default settings.STRICT to False, for compatibility with the original pdfminer

* PDFCIDFont: handle font registry/orderings that may be PDFObjRefs

* utils.nunpack: handle 8-byte integers
pull/55/head
Andrew Baumann 2017-02-06 05:57:01 -08:00 committed by Goulu
parent fd63dbf62e
commit 9439a3a31a
4 changed files with 8 additions and 9 deletions

View File

@@ -640,8 +640,8 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('BaseFont is missing')
self.basefont = 'unknown'
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', b'unknown').decode("latin1"),
self.cidsysteminfo.get('Ordering', b'unknown').decode("latin1"))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:

View File

@@ -160,10 +160,7 @@ def keyword_name(x):
else:
name=x.name
if six.PY3:
try:
name = str(name,'utf-8')
except:
pass
name = str(name,'utf-8','ignore')
return name

View File

@@ -1,4 +1,4 @@
STRICT = True
STRICT = False
try:
from django.conf import settings

View File

@@ -211,7 +211,7 @@ def choplist(n, seq):
# nunpack
def nunpack(s, default=0):
"""Unpacks 1 to 4 byte integers (big endian)."""
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
l = len(s)
if not l:
return default
@@ -223,6 +223,8 @@ def nunpack(s, default=0):
return struct.unpack('>L', b'\x00'+s)[0]
elif l == 4:
return struct.unpack('>L', s)[0]
elif l == 8:
return struct.unpack('>Q', s)[0]
else:
raise TypeError('invalid length: %d' % l)
@@ -269,7 +271,7 @@ def decode_text(s):
if s.startswith(b'\xfe\xff'):
return six.text_type(s[2:], 'utf-16be', 'ignore')
else:
return ''.join(PDFDocEncoding[ord(c)] for c in s)
return ''.join(PDFDocEncoding[c] for c in s)
# enc