Added RunLengthDecode filter by Troy Bollinger.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@167 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-12-24 11:51:43 +00:00
parent 6590ad42f5
commit 7093bdbdfa
2 changed files with 55 additions and 0 deletions

View File

@ -3,6 +3,7 @@ import sys
import zlib import zlib
from lzw import lzwdecode from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode from ascii85 import ascii85decode, asciihexdecode
from runlength import rldecode
from psparser import PSException, PSObject from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT from psparser import LIT, KWD, STRICT
@ -11,6 +12,7 @@ LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
## PDF Objects ## PDF Objects
@ -196,7 +198,10 @@ class PDFStream(PDFObject):
data = ascii85decode(data) data = ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE: elif f in LITERALS_ASCIIHEX_DECODE:
data = asciihexdecode(data) data = asciihexdecode(data)
elif f in LITERALS_RUNLENGTH_DECODE:
data = rldecode(data)
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
# not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError('/Crypt filter is unsupported')
else: else:
raise PDFNotImplementedError('Unsupported filter: %r' % f) raise PDFNotImplementedError('Unsupported filter: %r' % f)

50
pdfminer/runlength.py Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
#
# RunLength decoder (Adobe version) implementation based on PDF Reference
# version 1.4 section 3.3.4.
#
# * public domain *
#
import sys
def rldecode(data):
    """
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    `data` may be a text string or a byte string; the result has the same
    type as `data`. Anything after the EOD marker is ignored.

    Truncated input is decoded leniently instead of raising: a literal run
    that is cut short yields the bytes that are present, and a repeat run
    whose data byte is missing yields nothing.

    >>> s = "\x05123456\xfa7\x04abcde\x80junk"
    >>> rldecode(s)
    '1234567777777abcde'
    """
    decoded = []
    size = len(data)
    i = 0
    while i < size:
        # Slice instead of indexing: a one-element slice is accepted by
        # ord() for both text and byte strings, whereas indexing a byte
        # string may already yield an int.
        length = ord(data[i:i + 1])
        if length == 128:
            # EOD marker: stop decoding, ignore whatever follows.
            break
        if length < 128:
            # Literal run: copy the next length + 1 bytes unchanged.
            # Slicing silently clips a run that is cut short.
            decoded.append(data[i + 1:i + length + 2])
            i += length + 2
        else:
            # Repeat run: the next single byte is repeated 257 - length
            # times. The slice is empty if that byte is missing, so a
            # truncated stream contributes nothing rather than failing.
            decoded.append(data[i + 1:i + 2] * (257 - length))
            i += 2
    # Join with an empty value of the input's own type so that byte
    # strings stay byte strings and text strings stay text strings.
    return data[:0].join(decoded)
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod
    testmod()