From 6183469f119a18f7c6db02390cf63c05fffe547c Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 27 Apr 2008 04:27:09 +0000 Subject: [PATCH] lzw support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@21 1aa58f4a-7d42-0410-adbc-911cccaed67c --- TODO | 10 +++--- lzw.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++ pdf2txt.py | 13 ++++--- pdfparser.py | 7 +++- 4 files changed, 116 insertions(+), 11 deletions(-) create mode 100755 lzw.py diff --git a/TODO b/TODO index 7658243..3e788f9 100644 --- a/TODO +++ b/TODO @@ -1,9 +1,9 @@ -TODO: - - Code Documentation. +TODOs: + - Documentation. - Error handling for invalid type. + - PDF Writers. - Outlines. - Named Objects. (pages) - - Writers. - - Linearized PDF. - - Encryption? + - Any special treatments for linearized PDFs? + - More encryption support? (no way...) diff --git a/lzw.py b/lzw.py new file mode 100755 index 0000000..8081a48 --- /dev/null +++ b/lzw.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +import sys +stderr = sys.stderr + +## LZWDecoder +## +class LZWDecoder: + + def __init__(self, fp, debug=0): + self.fp = fp + self.debug = debug + self.buff = 0 + self.bpos = 8 + self.nbits = 9 + self.table = None + self.prevbuf = None + return + + def readbits(self, bits): + bits0 = bits + v = 0 + while 1: + # the number of remaining bits we can get from the current buffer. + r = 8-self.bpos + if bits <= r: + # |-----8-bits-----| + # |-bpos-|-bits-| | + # | |----r----| + v = (v<>(r-bits)) & ((1<>stderr, ('nbits=%d, code=%d, output=%r, table=%r' % + (self.nbits, code, x, self.table[258:])) + return + + +def main(argv): + import StringIO + input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' + fp = StringIO.StringIO(input) + expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' + output = ''.join(LZWDecoder(fp, debug=1).run()) + print (input, expected, output) + print output == expected + return 0 + +if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdf2txt.py b/pdf2txt.py index da15ea0..b5aa353 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -83,10 +83,13 @@ class TextItem: (self.matrix, self.font, self.size, self.width, self.text)) def dump(self, outfp, codec): + def e(x): + x = x.replace('&','&').replace('>','>').replace('<','<') + return x.encode(codec, 'xmlcharrefreplace') (a,b,c,d,tx,ty) = self.matrix - outfp.write('' % - (tx, ty, self.font.fontname, self.size, self.width)) - outfp.write(self.text.encode(codec, 'xmlcharrefreplace')) + outfp.write('' % + (tx, ty, e(self.font.fontname), self.size, self.width)) + outfp.write(e(self.text)) outfp.write('\n') return @@ -182,10 +185,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0] + print 'usage: %s [-d] [-c codec] [-p pages] [-o output] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:c:') + (opts, args) = getopt.getopt(argv[1:], 'dp:c:o:') except getopt.GetoptError: return usage() if not args: return usage() diff --git a/pdfparser.py b/pdfparser.py index 97093c2..f75e1b2 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -9,6 +9,7 @@ import md5, struct stderr = sys.stderr from utils import choplist, nunpack from arcfour import Arcfour +from lzw import LZWDecoder from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ @@ -208,7 +209,11 @@ class PDFStream: # will get errors if the document is encrypted. data = zlib.decompress(data) elif f == LITERAL_LZW_DECODE: - raise PDFNotImplementedError('LZWDecode is currently unsupported.') + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + data = ''.join(LZWDecoder(StringIO(data)).run()) elif f == LITERAL_CRYPT: raise PDFEncryptionError else: