LZW support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@21 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 04:27:09 +00:00
parent b1163b69bb
commit 6183469f11
4 changed files with 116 additions and 11 deletions

10
TODO
View File

@ -1,9 +1,9 @@
TODO:
- Code Documentation.
TODOs:
- Documentation.
- Error handling for invalid type.
- PDF Writers.
- Outlines.
- Named Objects. (pages)
- Writers.
- Linearized PDF.
- Encryption?
- Any special treatments for linearized PDFs?
- More encryption support? (no way...)

97
lzw.py Executable file
View File

@ -0,0 +1,97 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
## LZWDecoder
##
class LZWDecoder:
    """Incremental decoder for an LZW-compressed byte stream.

    Codes are read MSB-first from ``fp``, starting at 9 bits wide and
    widening to 10, 11 and 12 bits as the string table grows.  Two codes
    are reserved: 256 resets the table and 257 marks the end of the data.

    The decoder works on byte strings, so the same code runs on
    Python 2.6+ (where ``bytes`` is ``str``) and on Python 3.

    fp    -- file-like object; ``fp.read(1)`` must return one byte, or
             an empty value at end of input.
    debug -- when true, ``run()`` traces every decoded code to stderr.
    """

    CLEAR_CODE = 256  # reset the string table and the code width
    EOD_CODE = 257    # end-of-data marker

    def __init__(self, fp, debug=0):
        self.fp = fp
        self.debug = debug
        self.buff = 0        # the byte currently being consumed
        self.bpos = 8        # bits of buff already used; 8 forces a read
        self.nbits = 9       # current code width
        self.table = None    # string table; created by the first clear code
        self.prevbuf = None  # output of the previous code
        return

    def readbits(self, bits):
        """Return the next ``bits`` bits of the stream as an integer.

        Raises EOFError when the input ends before enough bits are read.
        """
        value = 0
        while True:
            # Number of bits still unread in the current byte.
            avail = 8 - self.bpos
            if bits <= avail:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |--avail--|
                shift = avail - bits
                value = (value << bits) | ((self.buff >> shift) & ((1 << bits) - 1))
                self.bpos += bits
                return value
            # |-----8-bits-----|
            # |-bpos-|---bits----...
            # |      |--avail--|
            # Take everything left in this byte, then fetch the next one.
            value = (value << avail) | (self.buff & ((1 << avail) - 1))
            bits -= avail
            byte = self.fp.read(1)
            if not byte:
                raise EOFError
            self.buff = ord(byte)
            self.bpos = 0

    def feed(self, code):
        """Process one code and return the bytes it decodes to.

        The clear code and the end-of-data code produce an empty string.
        Raises ValueError for a code that cannot be valid at this point:
        a data code before the first clear code, a non-literal code right
        after a clear, or a code beyond the next free table slot.
        """
        if code == self.CLEAR_CODE:
            self.table = [bytes(bytearray((c,))) for c in range(256)]  # 0-255
            self.table.append(None)  # 256 (clear)
            self.table.append(None)  # 257 (end of data)
            self.prevbuf = b''
            self.nbits = 9
            return b''
        if code == self.EOD_CODE:
            return b''
        if self.table is None:
            raise ValueError('LZW code %d before the clear-table code' % code)
        if not self.prevbuf:
            # First code after a clear: it can only be a literal byte.
            if code >= self.CLEAR_CODE:
                raise ValueError('invalid LZW code %d after clear-table' % code)
            x = self.prevbuf = self.table[code]
            return x
        size = len(self.table)
        if code < size:
            x = self.table[code]
            self.table.append(self.prevbuf + x[:1])
        elif code == size:
            # The code refers to the entry that is being defined right now.
            x = self.prevbuf + self.prevbuf[:1]
            self.table.append(x)
        else:
            raise ValueError('LZW code %d out of range (table size %d)' %
                             (code, size))
        # Widen the code once the table reaches 511, 1023 or 2047 entries.
        size = len(self.table)
        if size == 511:
            self.nbits = 10
        elif size == 1023:
            self.nbits = 11
        elif size == 2047:
            self.nbits = 12
        self.prevbuf = x
        return x

    def run(self):
        """Yield the decoded byte strings, one per input code.

        Decoding stops at the end of the input or at the end-of-data
        code, so bytes that follow the marker are never decoded.
        """
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
            if self.debug:
                sys.stderr.write(
                    'nbits=%d, code=%d, output=%r, table=%r\n' %
                    (self.nbits, code, x, (self.table or [])[258:]))
            if code == self.EOD_CODE:
                break
        return
def main(argv):
    """Decode a fixed sample with LZWDecoder and compare with the expected bytes.

    Prints the (input, expected, output) tuple followed by the comparison
    result.  ``argv`` is accepted for the usual entry-point signature but
    is not used.

    Returns 0 when the decoded output matches and 1 otherwise, so the
    process exit status reflects the outcome of the self-test.
    """
    from io import BytesIO
    data = b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
    expected = b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    output = b''.join(LZWDecoder(BytesIO(data), debug=1).run())
    matched = (output == expected)
    # Doubled parentheses: print the tuple itself on both Python 2 and 3.
    print((data, expected, output))
    print(matched)
    if matched:
        return 0
    return 1
# When run as a script, call main() and use its return value as the exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -83,10 +83,13 @@ class TextItem:
(self.matrix, self.font, self.size, self.width, self.text))
def dump(self, outfp, codec):
def e(x):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
return x.encode(codec, 'xmlcharrefreplace')
(a,b,c,d,tx,ty) = self.matrix
outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' %
(tx, ty, self.font.fontname, self.size, self.width))
outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
outfp.write('<text x="%.3f" y="%.3f" font="%s" size="%.3f" width="%.3f">' %
(tx, ty, e(self.font.fontname), self.size, self.width))
outfp.write(e(self.text))
outfp.write('</text>\n')
return
@ -182,10 +185,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
print 'usage: %s [-d] [-c codec] [-p pages] [-o output] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:c:')
(opts, args) = getopt.getopt(argv[1:], 'dp:c:o:')
except getopt.GetoptError:
return usage()
if not args: return usage()

View File

@ -9,6 +9,7 @@ import md5, struct
stderr = sys.stderr
from utils import choplist, nunpack
from arcfour import Arcfour
from lzw import LZWDecoder
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \
@ -208,7 +209,11 @@ class PDFStream:
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f == LITERAL_LZW_DECODE:
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f == LITERAL_CRYPT:
raise PDFEncryptionError
else: