lzw support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@21 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
b1163b69bb
commit
6183469f11
10
TODO
10
TODO
|
@ -1,9 +1,9 @@
|
|||
TODO:
|
||||
- Code Documentation.
|
||||
TODOs:
|
||||
- Documentation.
|
||||
- Error handling for invalid type.
|
||||
- PDF Writers.
|
||||
|
||||
- Outlines.
|
||||
- Named Objects. (pages)
|
||||
- Writers.
|
||||
- Linearized PDF.
|
||||
- Encryption?
|
||||
- Any special treatments for linearized PDFs?
|
||||
- More encryption support? (no way...)
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
|
||||
## LZWDecoder
|
||||
##
|
||||
class LZWDecoder:
    """Incremental decoder for LZW-compressed data (PDF ``LZWDecode``).

    Codes are read MSB-first from ``fp``, starting at 9 bits wide and
    growing up to 12 bits as the string table fills.  Code 256 clears the
    table and code 257 marks the end of the data.

    Parameters:
      fp    -- file-like object; ``fp.read(1)`` must return one character,
               or an empty string at end of input.
      debug -- when true, ``run()`` traces every decoded code to ``stderr``.
    """

    def __init__(self, fp, debug=0):
        self.fp = fp
        self.debug = debug
        self.buff = 0    # the byte currently being consumed
        self.bpos = 8    # bits of ``buff`` already used (8 == exhausted)
        self.nbits = 9
        self.table = None
        self.prevbuf = None
        # Start from a cleared table so that a stream which omits the
        # leading clear code (256) can still be decoded.
        self._reset()
        return

    def _reset(self):
        """(Re)initialize the string table, as requested by code 256."""
        self.table = [chr(c) for c in range(256)]  # 0-255
        self.table.append(None)  # 256: clear table
        self.table.append(None)  # 257: end of data
        self.prevbuf = ''
        self.nbits = 9
        return

    def readbits(self, bits):
        """Return the next ``bits`` bits of the input as an integer.

        Raises EOFError when the input ends before ``bits`` bits could
        be read.
        """
        v = 0
        while True:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            # |-----8-bits-----|
            # |-bpos-|---bits----...
            # |      |----r----|
            v = (v << r) | (self.buff & ((1 << r) - 1))
            bits -= r
            x = self.fp.read(1)
            if not x:
                raise EOFError
            self.buff = ord(x)
            self.bpos = 0
        return v

    def feed(self, code):
        """Process one code and return the string it decodes to.

        Control codes (256 and 257) decode to the empty string.

        Raises IndexError when ``code`` refers to a table entry that cannot
        exist yet (corrupt data); the table is left untouched in that case.
        """
        x = ''
        if code == 256:
            self._reset()
        elif code == 257:
            pass
        else:
            size = len(self.table)
            # ``code == size`` is only meaningful when there is a previous
            # string to extend; anything larger is never valid.
            if size < code or (code == size and not self.prevbuf):
                raise IndexError('LZW code %d is out of range (table size %d)' %
                                 (code, size))
            if not self.prevbuf:
                # First data code after a reset: nothing to add to the table.
                x = self.prevbuf = self.table[code]
            else:
                if code < size:
                    x = self.table[code]
                    self.table.append(self.prevbuf + x[0])
                else:
                    # The code refers to the entry being defined right now.
                    x = self.prevbuf + self.prevbuf[0]
                    self.table.append(x)
                # Widen the codes one entry before the table would overflow
                # the current width.
                l = len(self.table)
                if l == 511:
                    self.nbits = 10
                elif l == 1023:
                    self.nbits = 11
                elif l == 2047:
                    self.nbits = 12
                self.prevbuf = x
        return x

    def run(self):
        """Generate the decoded strings, one per input code.

        Decoding stops at the end-of-data code (257) or when the input is
        exhausted, whichever comes first.
        """
        while True:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
            if self.debug:
                stderr.write('nbits=%d, code=%d, output=%r, table=%r\n' %
                             (self.nbits, code, x, self.table[258:]))
            if code == 257:
                # Anything after the end-of-data marker is not LZW data.
                break
        return
|
||||
|
||||
|
||||
def main(argv):
    """Self-test: decode a fixed LZW sample and compare with the known output.

    Prints the (input, expected, output) triple and whether the output
    matched.  Returns 0 when the decoded output equals the expected string
    and 1 otherwise; the caller passes this value to sys.exit().
    ``argv`` is accepted for the usual entry-point signature but not used.
    """
    import StringIO
    data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
    expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    fp = StringIO.StringIO(data)
    output = ''.join(LZWDecoder(fp, debug=1).run())
    print(repr((data, expected, output)))
    matched = (output == expected)
    print(matched)
    if not matched:
        # Report the failure through the exit status, not only on stdout.
        return 1
    return 0
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
13
pdf2txt.py
13
pdf2txt.py
|
@ -83,10 +83,13 @@ class TextItem:
|
|||
(self.matrix, self.font, self.size, self.width, self.text))
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
def e(x):
|
||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
|
||||
return x.encode(codec, 'xmlcharrefreplace')
|
||||
(a,b,c,d,tx,ty) = self.matrix
|
||||
outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' %
|
||||
(tx, ty, self.font.fontname, self.size, self.width))
|
||||
outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
|
||||
outfp.write('<text x="%.3f" y="%.3f" font="%s" size="%.3f" width="%.3f">' %
|
||||
(tx, ty, e(self.font.fontname), self.size, self.width))
|
||||
outfp.write(e(self.text))
|
||||
outfp.write('</text>\n')
|
||||
return
|
||||
|
||||
|
@ -182,10 +185,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
|
||||
print 'usage: %s [-d] [-c codec] [-p pages] [-o output] file ...' % argv[0]
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:c:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:c:o:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
|
|
@ -9,6 +9,7 @@ import md5, struct
|
|||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack
|
||||
from arcfour import Arcfour
|
||||
from lzw import LZWDecoder
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, \
|
||||
|
@ -208,7 +209,11 @@ class PDFStream:
|
|||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
elif f == LITERAL_LZW_DECODE:
|
||||
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
data = ''.join(LZWDecoder(StringIO(data)).run())
|
||||
elif f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue