diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index 6d0eaf1..5a1a668 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -6,6 +6,10 @@ except ImportError: from StringIO import StringIO +class CorruptDataError(Exception): + pass + + ## LZWDecoder ## class LZWDecoder(object): @@ -46,12 +50,12 @@ class LZWDecoder(object): return v def feed(self, code): - x = '' + x = b'' if code == 256: - self.table = [ chr(c) for c in xrange(256) ] # 0-255 + self.table = [bytes([i]) for i in range(256)] # 0-255 self.table.append(None) # 256 self.table.append(None) # 257 - self.prevbuf = '' + self.prevbuf = b'' self.nbits = 9 elif code == 257: pass @@ -60,10 +64,12 @@ class LZWDecoder(object): else: if code < len(self.table): x = self.table[code] - self.table.append(self.prevbuf+x[0]) - else: - self.table.append(self.prevbuf+self.prevbuf[0]) + self.table.append(self.prevbuf+x[:1]) + elif code == len(self.table): + self.table.append(self.prevbuf+self.prevbuf[:1]) x = self.table[code] + else: + raise CorruptDataError() l = len(self.table) if l == 511: self.nbits = 10 @@ -81,6 +87,11 @@ class LZWDecoder(object): except EOFError: break x = self.feed(code) + try: + x = self.feed(code) + except CorruptDataError: + # just ignore corrupt data and stop yielding there + break yield x if self.debug: print >>sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' % diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 3b0c789..c0c7ed2 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -217,7 +217,10 @@ class PDFContentParser(PSStackParser): self.istream += 1 else: raise PSEOF('Unexpected EOF, file truncated?') - self.fp = StringIO(strm.get_data()) + data = strm.get_data() + if isinstance(data, bytes): + data = data.decode('latin-1') + self.fp = io.StringIO(data) return def seek(self, pos):