From 7093bdbdfa1537f18bda7b7cffbadec987ff889b Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Thu, 24 Dec 2009 11:51:43 +0000 Subject: [PATCH] Added RunLengthDecode filter by Troy Bollinger. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@167 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/pdftypes.py | 5 +++++ pdfminer/runlength.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 pdfminer/runlength.py diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 4abd307..b102b1d 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -3,6 +3,7 @@ import sys import zlib from lzw import lzwdecode from ascii85 import ascii85decode, asciihexdecode +from runlength import rldecode from psparser import PSException, PSObject from psparser import LIT, KWD, STRICT @@ -11,6 +12,7 @@ LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) +LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) ## PDF Objects @@ -196,7 +198,10 @@ class PDFStream(PDFObject): data = ascii85decode(data) elif f in LITERALS_ASCIIHEX_DECODE: data = asciihexdecode(data) + elif f in LITERALS_RUNLENGTH_DECODE: + data = rldecode(data) elif f == LITERAL_CRYPT: + # not yet.. raise PDFNotImplementedError('/Crypt filter is unsupported') else: raise PDFNotImplementedError('Unsupported filter: %r' % f) diff --git a/pdfminer/runlength.py b/pdfminer/runlength.py new file mode 100644 index 0000000..e17389e --- /dev/null +++ b/pdfminer/runlength.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# +# RunLength decoder (Adobe version) implementation based on PDF Reference +# version 1.4 section 3.3.4. 
#
# * public domain *
#

import sys


def rldecode(data):
    r"""Decode data encoded with the RunLengthDecode filter (Adobe version).

    Implementation based on PDF Reference version 1.4 section 3.3.4:

        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    data is the encoded byte string.  The decoded result is returned as
    the same string type as data (slices of data are joined with an empty
    slice of data), so both text-style and bytes-style strings work.

    Truncated input is tolerated rather than raising: a literal run that
    is cut short yields the bytes that are present, and a repeat run whose
    data byte is missing yields nothing.  Anything after the EOD marker is
    ignored.

    >>> s = "\x05123456\xfa7\x04abcde\x80junk"
    >>> rldecode(s)
    '1234567777777abcde'
    >>> rldecode("\x02ab")
    'ab'
    >>> rldecode("\x00a\xfe")
    'a'
    """
    # An empty slice has the same type as the input, so the join below
    # returns that type as well.
    empty = data[:0]
    size = len(data)
    decoded = []
    i = 0
    while i < size:
        # Slice instead of indexing: a one-element slice is accepted by
        # ord() whether indexing would give a character or an integer.
        length = ord(data[i:i+1])
        if length == 128:
            # EOD marker: ignore whatever follows.
            break
        if length < 128:
            # Literal run: copy the next length + 1 bytes as they are.
            decoded.append(data[i+1:i+length+2])
            i += length + 2
        else:
            # Repeat run: the next single byte is copied 257 - length times.
            # The slice is empty when that byte is missing (truncated
            # input), so nothing is appended instead of raising IndexError.
            decoded.append(data[i+1:i+2] * (257 - length))
            i += 2
    return empty.join(decoded)


if __name__ == '__main__':
    import doctest
    doctest.testmod()