More docstrings.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@151 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-11-04 11:28:32 +00:00
parent 827c606f82
commit 6bc2bebb5b
4 changed files with 56 additions and 55 deletions

View File

@ -1,12 +1,33 @@
#!/usr/bin/env python
#
# ASCII85/ASCIIHex decoder (Adobe version) implementation
# * public domain *
#
""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
This code is in the public domain.
"""
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
import struct
"""
In ASCII85 encoding, every four bytes are encoded with five ASCII
letters, using 85 different types of characters (as 256**4 < 85**5).
When the length of the original bytes is not a multiple of 4, a special
rule is used for rounding up.
Adobe's ASCII85 implementation is slightly different from
the original in handling the last characters.
The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished'
>>> ascii85decode('E,9)oF*2M7/c~>')
'pleasure.'
"""
n = b = 0
out = ''
for c in data:
@ -28,6 +49,8 @@ def ascii85decode(data):
return out
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
@ -37,16 +60,14 @@ def asciihexdecode(data):
EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
>>> asciihexdecode("61 62 2e6364 65")
>>> asciihexdecode('61 62 2e6364 65')
'ab.cde'
>>> asciihexdecode("61 62 2e6364 657>")
>>> asciihexdecode('61 62 2e6364 657>')
'ab.cdep'
>>> asciihexdecode("7>")
>>> asciihexdecode('7>')
'p'
"""
import re
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
decode = (lambda hx: chr(int(hx, 16)))
out = map(decode, hex_re.findall(data))
m = trail_re.search(data)
@ -55,23 +76,6 @@ def asciihexdecode(data):
return ''.join(out)
# test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__':
orig = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
'''
data = \
'Man is distinguished, not only by his reason, but by this singular passion from '\
'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
'any carnal pleasure.'
assert ascii85decode(orig) == data
print 'ascii85decode test succeeded'
import doctest
doctest.testmod()

View File

@ -1,6 +1,10 @@
#!/usr/bin/env python
import sys
from sys import stderr
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
## LZWDecoder
@ -84,16 +88,15 @@ class LZWDecoder(object):
(self.nbits, code, x, self.table[258:]))
return
# lzwdecode
def lzwdecode(data):
    """
    Decode an LZW-compressed string and return the decoded string.

    The input is wrapped in an in-memory file object and handed to
    LZWDecoder; the pieces yielded by its run() method are joined
    into a single string.

    >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
    '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    """
    decoder = LZWDecoder(StringIO(data))
    chunks = decoder.run()
    return ''.join(chunks)
def main(argv):
import StringIO
data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
fp = StringIO.StringIO(data)
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
LZWDecoder.debug = 1
output = ''.join(LZWDecoder(fp).run())
print (data, expected, output)
print output == expected
return 0
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
import doctest
doctest.testmod()

View File

@ -4,6 +4,10 @@ import re
import md5
import struct
from sys import stderr
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
from psparser import PSLiteralTable, PSKeywordTable
@ -695,10 +699,6 @@ class PDFParser(PSStackParser):
class PDFObjStrmParser(PDFParser):
def __init__(self, doc, data):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
PDFParser.__init__(self, doc, StringIO(data))
return

View File

@ -1,7 +1,8 @@
#!/usr/bin/env python
import sys
import zlib
from lzw import LZWDecoder
from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode
from psparser import PSException
from psparser import PSObject, PSLiteral, PSKeyword
from psparser import PSLiteralTable, PSKeywordTable
@ -163,7 +164,6 @@ class PDFStream(PDFObject):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decomp(self,data):
import zlib
buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the
# end. remove chars from the end to try and decompress the buffer
@ -194,17 +194,11 @@ class PDFStream(PDFObject):
# will get errors if the document is encrypted.
data = self.decomp(data)
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
data = lzwdecode(data)
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
data = ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85
data = ascii85.asciihexdecode(data)
data = asciihexdecode(data)
elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported')
else: