From f8510edffc4611992a79c22850eea3c1335edbf7 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Wed, 8 Apr 2009 10:55:01 +0000 Subject: [PATCH] AsciiHexDecode filter patch incorporated. Thanks to Troy Bollinger. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@86 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/ascii85.py | 38 ++++++++++++++++++++++++++++++++++---- pdflib/pdftypes.py | 4 ++++ tools/dumppdf.py | 3 ++- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/pdflib/ascii85.py b/pdflib/ascii85.py index 19e9374..16ddb41 100644 --- a/pdflib/ascii85.py +++ b/pdflib/ascii85.py @@ -1,13 +1,12 @@ #!/usr/bin/env python # -# ASCII85 decoder (Adobe version) implementation +# ASCII85/ASCIIHex decoder (Adobe version) implementation # * public domain * # -import struct - # ascii85decode(data) def ascii85decode(data): + import struct n = b = 0 out = '' for c in data: @@ -28,6 +27,34 @@ def ascii85decode(data): break return out +# asciihexdecode(data) +def asciihexdecode(data): + """ + ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 + For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the + ASCIIHexDecode filter produces one byte of binary data. All white-space + characters are ignored. A right angle bracket character (>) indicates + EOD. Any other characters will cause an error. If the filter encounters + the EOD marker after reading an odd number of hexadecimal digits, it + will behave as if a 0 followed the last digit. + >>> asciihexdecode("61 62 2e6364 65") + 'ab.cde' + >>> asciihexdecode("61 62 2e6364 657>") + 'ab.cdep' + >>> asciihexdecode("7>") + 'p' + """ + import re + hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE) + trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) + decode = (lambda hx: chr(int(hx, 16))) + out = map(decode, hex_re.findall(data)) + m = trail_re.search(data) + if m: + out.append(decode("%c0" % m.group(1))) + return ''.join(out) + + # test # sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 if __name__ == '__main__': @@ -44,4 +71,7 @@ if __name__ == '__main__': 'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\ 'any carnal pleasure.' assert ascii85decode(orig) == data - print 'test succeeded' + print 'ascii85decode test succeeded' + + import doctest + doctest.testmod() diff --git a/pdflib/pdftypes.py b/pdflib/pdftypes.py index 2b85f7a..2f085fa 100644 --- a/pdflib/pdftypes.py +++ b/pdflib/pdftypes.py @@ -10,6 +10,7 @@ LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) +LITERALS_ASCIIHEX_DECODE = (PSLiteralTable.intern('ASCIIHexDecode'), PSLiteralTable.intern('AHx')) ## PDF Objects @@ -199,6 +200,9 @@ class PDFStream(PDFObject): elif f in LITERALS_ASCII85_DECODE: import ascii85 data = ascii85.ascii85decode(data) + elif f in LITERALS_ASCIIHEX_DECODE: + import ascii85 + data = ascii85.asciihexdecode(data) elif f == LITERAL_CRYPT: raise PDFNotImplementedError('/Crypt filter is unsupported') else: diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 61b2aef..f650848 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -139,7 +139,8 @@ def dumppdf(outfp, fname, objids, pagenos, password='', if (not objids) and (not pagenos) and (not dumpall): dumptrailers(outfp, doc) fp.close() - outfp.write('\n') + if codec not in ('raw','binary'): + outfp.write('\n') return