From 44653071c3de365b9ee4bf951d00e47c5edd686c Mon Sep 17 00:00:00 2001 From: Jordan Reiter Date: Wed, 27 Mar 2013 13:05:29 -0400 Subject: [PATCH 1/2] Fixes for LZW error (see https://bitbucket.org/hsoft/pdfminer3k/commits/ae9a4ca0691a/) --- pdfminer/lzw.py | 23 +++++++++++++++++------ pdfminer/pdfinterp.py | 5 ++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index 6d0eaf1..5a1a668 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -6,6 +6,10 @@ except ImportError: from StringIO import StringIO +class CorruptDataError(Exception): + pass + + ## LZWDecoder ## class LZWDecoder(object): @@ -46,12 +50,12 @@ class LZWDecoder(object): return v def feed(self, code): - x = '' + x = b'' if code == 256: - self.table = [ chr(c) for c in xrange(256) ] # 0-255 + self.table = [bytes([i]) for i in range(256)] # 0-255 self.table.append(None) # 256 self.table.append(None) # 257 - self.prevbuf = '' + self.prevbuf = b'' self.nbits = 9 elif code == 257: pass @@ -60,10 +64,12 @@ class LZWDecoder(object): else: if code < len(self.table): x = self.table[code] - self.table.append(self.prevbuf+x[0]) - else: - self.table.append(self.prevbuf+self.prevbuf[0]) + self.table.append(self.prevbuf+x[:1]) + elif code == len(self.table): + self.table.append(self.prevbuf+self.prevbuf[:1]) x = self.table[code] + else: + raise CorruptDataError() l = len(self.table) if l == 511: self.nbits = 10 @@ -81,6 +87,11 @@ class LZWDecoder(object): except EOFError: break x = self.feed(code) + try: + x = self.feed(code) + except CorruptDataError: + # just ignore corrupt data and stop yielding there + break yield x if self.debug: print >>sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' % diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 3b0c789..c0c7ed2 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -217,7 +217,10 @@ class PDFContentParser(PSStackParser): self.istream += 1 else: raise PSEOF('Unexpected EOF, file truncated?') - self.fp = StringIO(strm.get_data()) + data = strm.get_data() + if isinstance(data, bytes): + data = data.decode('latin-1') + self.fp = io.StringIO(data) return def seek(self, pos): From e28b75a462ccc189c93ef8a7a9a806966fb51583 Mon Sep 17 00:00:00 2001 From: Jordan Reiter Date: Wed, 27 Mar 2013 13:14:58 -0400 Subject: [PATCH 2/2] StringIO --- pdfminer/pdfinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index c0c7ed2..15f3ee0 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -220,7 +220,7 @@ class PDFContentParser(PSStackParser): data = strm.get_data() if isinstance(data, bytes): data = data.decode('latin-1') - self.fp = io.StringIO(data) + self.fp = StringIO(data) return def seek(self, pos):