LZW support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@21 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 04:27:09 +00:00
parent b1163b69bb
commit 6183469f11
4 changed files with 116 additions and 11 deletions

10
TODO
View File

@ -1,9 +1,9 @@
TODO: TODOs:
- Code Documentation. - Documentation.
- Error handling for invalid type. - Error handling for invalid type.
- PDF Writers.
- Outlines. - Outlines.
- Named Objects. (pages) - Named Objects. (pages)
- Writers. - Any special treatments for linearized PDFs?
- Linearized PDF. - More encryption support? (no way...)
- Encryption?

97
lzw.py Executable file
View File

@ -0,0 +1,97 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
## LZWDecoder
##
class LZWDecoder:
    """Decoder for an LZW-compressed byte stream (PDF ``LZWDecode`` filter).

    Codes are read most-significant-bit first from ``fp``.  They start out
    9 bits wide and grow to 10, 11 and 12 bits as the string table fills.
    Code 256 resets the table; code 257 marks the end of the data.
    """

    CLEAR_TABLE = 256   # resets the string table and the code width
    END_OF_DATA = 257   # no further codes follow

    def __init__(self, fp, debug=0):
        """Prepare to decode from ``fp``.

        fp: file-like object; ``fp.read(1)`` must return a single byte, or
            an empty value at end of input.
        debug: when true, run() writes a trace line per code to stderr.
        """
        self.fp = fp
        self.debug = debug
        self.buff = 0   # the input byte currently being consumed
        self.bpos = 8   # bits of buff already used; 8 forces a fresh read
        # Start with a valid table so that a stream which omits the leading
        # clear-table code is still decodable instead of failing on None.
        self._reset()
        return

    def _reset(self):
        """Restore the initial string table, code width and history."""
        self.table = [chr(c) for c in range(256)]   # 0-255: single bytes
        self.table.append(None)                     # 256: clear-table
        self.table.append(None)                     # 257: end-of-data
        self.prevbuf = ''
        self.nbits = 9
        return

    def readbits(self, bits):
        """Return the next ``bits`` bits of input as an unsigned integer.

        Raises EOFError when the input runs out before ``bits`` bits
        have been collected.
        """
        v = 0
        while True:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            # |-----8-bits-----|
            # |-bpos-|---bits----...
            # |      |----r----|
            v = (v << r) | (self.buff & ((1 << r) - 1))
            bits -= r
            x = self.fp.read(1)
            if not x:
                raise EOFError
            self.buff = ord(x)
            self.bpos = 0
        return v

    def feed(self, code):
        """Process one code and return the string it decodes to.

        The control codes (256 and 257) decode to an empty string.
        A code that lies beyond the current table raises IndexError.
        """
        if code == self.CLEAR_TABLE:
            self._reset()
            return ''
        if code == self.END_OF_DATA:
            return ''
        if not self.prevbuf:
            # First data code after a reset: emitted as is, nothing to learn.
            x = self.prevbuf = self.table[code]
            return x
        if code < len(self.table):
            x = self.table[code]
            self.table.append(self.prevbuf + x[0])
        else:
            # The code refers to the entry that is being defined right now.
            self.table.append(self.prevbuf + self.prevbuf[0])
            x = self.table[code]
        # Widen the code as soon as the table reaches these sizes.
        size = len(self.table)
        if size == 511:
            self.nbits = 10
        elif size == 1023:
            self.nbits = 11
        elif size == 2047:
            self.nbits = 12
        self.prevbuf = x
        return x

    def run(self):
        """Yield the decoded string for each code read from ``fp``.

        Decoding stops at the end-of-data code or when the input is
        exhausted, whichever comes first; bytes that follow the
        end-of-data code are not decoded.
        """
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
            if self.debug:
                sys.stderr.write('nbits=%d, code=%d, output=%r, table=%r\n' %
                                 (self.nbits, code, x, self.table[258:]))
            if code == self.END_OF_DATA:
                break
        return
def main(argv):
    """Self-test: decode a fixed LZW sample and compare with the known text.

    argv: command-line arguments (unused).
    Prints the sample, the expected text and the decoded text, then whether
    they match.  Returns 0 on a match and 1 otherwise, so that the process
    exit status reflects the outcome.
    """
    try:
        from StringIO import StringIO   # Python 2
    except ImportError:
        from io import StringIO
    data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
    fp = StringIO(data)
    expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    output = ''.join(LZWDecoder(fp, debug=1).run())
    # A single parenthesised argument prints the same under Python 2 and 3.
    print((data, expected, output))
    matched = (output == expected)
    print(matched)
    if matched:
        return 0
    return 1


if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -83,10 +83,13 @@ class TextItem:
(self.matrix, self.font, self.size, self.width, self.text)) (self.matrix, self.font, self.size, self.width, self.text))
def dump(self, outfp, codec): def dump(self, outfp, codec):
def e(x):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
return x.encode(codec, 'xmlcharrefreplace')
(a,b,c,d,tx,ty) = self.matrix (a,b,c,d,tx,ty) = self.matrix
outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' % outfp.write('<text x="%.3f" y="%.3f" font="%s" size="%.3f" width="%.3f">' %
(tx, ty, self.font.fontname, self.size, self.width)) (tx, ty, e(self.font.fontname), self.size, self.width))
outfp.write(self.text.encode(codec, 'xmlcharrefreplace')) outfp.write(e(self.text))
outfp.write('</text>\n') outfp.write('</text>\n')
return return
@ -182,10 +185,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0] print 'usage: %s [-d] [-c codec] [-p pages] [-o output] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:c:') (opts, args) = getopt.getopt(argv[1:], 'dp:c:o:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()

View File

@ -9,6 +9,7 @@ import md5, struct
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from utils import choplist, nunpack
from arcfour import Arcfour from arcfour import Arcfour
from lzw import LZWDecoder
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \ literal_name, keyword_name, \
@ -208,7 +209,11 @@ class PDFStream:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
data = zlib.decompress(data) data = zlib.decompress(data)
elif f == LITERAL_LZW_DECODE: elif f == LITERAL_LZW_DECODE:
raise PDFNotImplementedError('LZWDecode is currently unsupported.') try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
raise PDFEncryptionError raise PDFEncryptionError
else: else: