AsciiHexDecode filter patch incorporated. Thanks to Troy Bollinger.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@86 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-04-08 10:55:01 +00:00
parent d11012d9f7
commit f8510edffc
3 changed files with 40 additions and 5 deletions

View File

@ -1,13 +1,12 @@
#!/usr/bin/env python #!/usr/bin/env python
# #
# ASCII85 decoder (Adobe version) implementation # ASCII85/ASCIIHex decoder (Adobe version) implementation
# * public domain * # * public domain *
# #
import struct
# ascii85decode(data) # ascii85decode(data)
def ascii85decode(data): def ascii85decode(data):
import struct
n = b = 0 n = b = 0
out = '' out = ''
for c in data: for c in data:
@ -28,6 +27,34 @@ def ascii85decode(data):
break break
return out return out
# asciihexdecode(data)
def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1

    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored, including white space that falls between the
    two digits of a pair. A right angle bracket character (>) indicates
    EOD; anything after the first one is not decoded. If the filter
    encounters the EOD marker (or the end of the data) after reading an odd
    number of hexadecimal digits, it behaves as if a 0 followed the last
    digit.

    The reference says any other character is an error. This implementation
    is deliberately lenient instead of raising: a non-hex character simply
    separates runs of digits, and the odd-digit padding is only applied
    when the payload is otherwise well formed.

    data: the encoded stream as a string.
    Returns the decoded string, one character per decoded byte value.

    >>> asciihexdecode("61 62 2e6364 65")
    'ab.cde'
    >>> asciihexdecode("61 62 2e6364 657>")
    'ab.cdep'
    >>> asciihexdecode("7>")
    'p'
    >>> asciihexdecode("6 1 6 2>")
    'ab'
    >>> asciihexdecode("61>62")
    'a'
    """
    import re
    # Only the text in front of the first EOD marker is payload.
    payload = data.split('>', 1)[0]
    # Strip white space before pairing digits so that a pair split across
    # white space (or a line break) still yields its byte.
    compact = re.sub(r'\s+', '', payload)
    pairs = re.findall(r'[0-9a-fA-F]{2}', compact)
    # A lone trailing digit is padded with 0, but only for well-formed input
    # (hex digits only); malformed input is decoded best-effort without it.
    if len(compact) % 2 and re.match(r'[0-9a-fA-F]+\Z', compact):
        pairs.append(compact[-1] + '0')
    # Build a real list (not a lazy map) so this works whether or not map()
    # returns a list.
    return ''.join([chr(int(pair, 16)) for pair in pairs])
# test # test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 # sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__': if __name__ == '__main__':
@ -44,4 +71,7 @@ if __name__ == '__main__':
'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\ 'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
'any carnal pleasure.' 'any carnal pleasure.'
assert ascii85decode(orig) == data assert ascii85decode(orig) == data
print 'test succeeded' print 'ascii85decode test succeeded'
import doctest
doctest.testmod()

View File

@ -10,6 +10,7 @@ LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx'))
## PDF Objects ## PDF Objects
@ -199,6 +200,9 @@ class PDFStream(PDFObject):
elif f in LITERALS_ASCII85_DECODE: elif f in LITERALS_ASCII85_DECODE:
import ascii85 import ascii85
data = ascii85.ascii85decode(data) data = ascii85.ascii85decode(data)
elif f in LITERALS_ASCIIHEX_DECODE:
import ascii85
data = ascii85.asciihexdecode(data)
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError('/Crypt filter is unsupported')
else: else:

View File

@ -139,6 +139,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
if (not objids) and (not pagenos) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc)
fp.close() fp.close()
if codec not in ('raw','binary'):
outfp.write('\n') outfp.write('\n')
return return