Python 3.4 compatibility + tests

pull/1/head
unknown 2014-09-04 09:36:19 +02:00
parent 29c07ea770
commit 4ab48d1803
14 changed files with 911 additions and 925 deletions

View File

@ -7,5 +7,4 @@ install:
- pip install six
- pip install pycrypto
script:
- make test
- nosetests
nosetests

View File

@ -55,12 +55,5 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
test: cmap
$(PYTHON) -m doctest \
pdfminer/arcfour.py \
pdfminer/lzw.py \
pdfminer/ascii85.py \
pdfminer/runlength.py \
pdfminer/rijndael.py
$(PYTHON) -m pdfminer.ccitt
$(PYTHON) -m pdfminer.psparser
nosetests
cd samples && $(MAKE) test

View File

@ -1,31 +1,22 @@
#!/usr/bin/env python
""" Python implementation of Arcfour encryption algorithm.
See https://en.wikipedia.org/wiki/RC4
This code is in the public domain.
"""
import six # Python 2+3 compatibility
## Arcfour
##
class Arcfour(object):
"""
>>> Arcfour(b'Key').process(b'Plaintext').encode('hex')
'bbf316e8d940af0ad3'
>>> Arcfour(b'Wiki').process(b'pedia').encode('hex')
'1021bf0420'
>>> Arcfour(b'Secret').process(b'Attack at dawn').encode('hex')
'45a01f645fc35b383552544b9bf5'
"""
def __init__(self, key):
s = range(256)
s = [i for i in range(256)] #because Py3 range is not indexable
j = 0
klen = len(key)
for i in xrange(256):
j = (j + s[i] + ord(key[i % klen])) % 256
for i in range(256):
j = (j + s[i] + six.indexbytes(key,i % klen)) % 256
(s[i], s[j]) = (s[j], s[i])
self.s = s
(self.i, self.j) = (0, 0)
@ -35,20 +26,14 @@ class Arcfour(object):
(i, j) = (self.i, self.j)
s = self.s
r = b''
for c in data:
for c in six.iterbytes(data):
i = (i+1) % 256
j = (j+s[i]) % 256
(s[i], s[j]) = (s[j], s[i])
k = s[(s[i]+s[j]) % 256]
r += chr(ord(c) ^ k)
r += six.int2byte(c ^ k)
(self.i, self.j) = (i, j)
return r
encrypt = decrypt = process
new = Arcfour
# test
if __name__ == '__main__':
import doctest
doctest.testmod()

View File

@ -23,13 +23,6 @@ def ascii85decode(data):
The Adobe's ASCII85 implementation is slightly different from
its original in handling the last characters.
The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished'
>>> ascii85decode(b'E,9)oF*2M7/c~>')
'pleasure.'
"""
n = b = 0
out = b''
@ -53,8 +46,8 @@ def ascii85decode(data):
return out
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
hex_re = re.compile(b'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(b'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
@ -66,22 +59,16 @@ def asciihexdecode(data):
EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
>>> asciihexdecode(b'61 62 2e6364 65')
'ab.cde'
>>> asciihexdecode(b'61 62 2e6364 657>')
'ab.cdep'
>>> asciihexdecode(b'7>')
'p'
"""
decode = (lambda hx: chr(int(hx, 16)))
out = map(decode, hex_re.findall(data))
def decode(x):
i=int(x,16)
return six.int2byte(i)
out=b''
for x in hex_re.findall(data):
out+=decode(x)
m = trail_re.search(data)
if m:
out.append(decode('%c0' % m.group(1)))
return b''.join(out)
if __name__ == '__main__':
import doctest
doctest.testmod()
out+=decode(m.group(1)+b'0')
return out

View File

@ -463,10 +463,10 @@ class CCITTG4Parser(BitParser):
x0 = max(0, self._curpos)
x1 = max(0, min(self.width, x1))
if x1 < x0:
for x in xrange(x1, x0):
for x in range(x1, x0):
self._curline[x] = self._color
elif x0 < x1:
for x in xrange(x0, x1):
for x in range(x0, x1):
self._curline[x] = self._color
self._curpos = x1
self._color = 1-self._color
@ -496,7 +496,7 @@ class CCITTG4Parser(BitParser):
self._refline[x1] == self._color):
break
x1 += 1
for x in xrange(self._curpos, x1):
for x in range(self._curpos, x1):
self._curline[x] = self._color
self._curpos = x1
return
@ -506,12 +506,12 @@ class CCITTG4Parser(BitParser):
if self._curpos < 0:
self._curpos = 0
x = self._curpos
for _ in xrange(n1):
for _ in range(n1):
if len(self._curline) <= x:
break
self._curline[x] = self._color
x += 1
for _ in xrange(n2):
for _ in range(n2):
if len(self._curline) <= x:
break
self._curline[x] = 1-self._color
@ -527,184 +527,8 @@ class CCITTG4Parser(BitParser):
self._flush_line()
return
import unittest
## Test cases
##
class TestCCITTG4Parser(unittest.TestCase):
def get_parser(self, bits):
parser = CCITTG4Parser(len(bits))
parser._curline = [int(c) for c in bits]
parser._reset_line()
return parser
def test_b1(self):
parser = self.get_parser('00000')
parser._do_vertical(0)
self.assertEqual(parser._curpos, 0)
return
def test_b2(self):
parser = self.get_parser('10000')
parser._do_vertical(-1)
self.assertEqual(parser._curpos, 0)
return
def test_b3(self):
parser = self.get_parser('000111')
parser._do_pass()
self.assertEqual(parser._curpos, 3)
self.assertEqual(parser._get_bits(), '111')
return
def test_b4(self):
parser = self.get_parser('00000')
parser._do_vertical(+2)
self.assertEqual(parser._curpos, 2)
self.assertEqual(parser._get_bits(), '11')
return
def test_b5(self):
parser = self.get_parser('11111111100')
parser._do_horizontal(0, 3)
self.assertEqual(parser._curpos, 3)
parser._do_vertical(1)
self.assertEqual(parser._curpos, 10)
self.assertEqual(parser._get_bits(), '0001111111')
return
def test_e1(self):
parser = self.get_parser('10000')
parser._do_vertical(0)
self.assertEqual(parser._curpos, 1)
parser._do_vertical(0)
self.assertEqual(parser._curpos, 5)
self.assertEqual(parser._get_bits(), '10000')
return
def test_e2(self):
parser = self.get_parser('10011')
parser._do_vertical(0)
self.assertEqual(parser._curpos, 1)
parser._do_vertical(2)
self.assertEqual(parser._curpos, 5)
self.assertEqual(parser._get_bits(), '10000')
return
def test_e3(self):
parser = self.get_parser('011111')
parser._color = 0
parser._do_vertical(0)
self.assertEqual(parser._color, 1)
self.assertEqual(parser._curpos, 1)
parser._do_vertical(-2)
self.assertEqual(parser._color, 0)
self.assertEqual(parser._curpos, 4)
parser._do_vertical(0)
self.assertEqual(parser._curpos, 6)
self.assertEqual(parser._get_bits(), '011100')
return
def test_e4(self):
parser = self.get_parser('10000')
parser._do_vertical(0)
self.assertEqual(parser._curpos, 1)
parser._do_vertical(-2)
self.assertEqual(parser._curpos, 3)
parser._do_vertical(0)
self.assertEqual(parser._curpos, 5)
self.assertEqual(parser._get_bits(), '10011')
return
def test_e5(self):
parser = self.get_parser('011000')
parser._color = 0
parser._do_vertical(0)
self.assertEqual(parser._curpos, 1)
parser._do_vertical(3)
self.assertEqual(parser._curpos, 6)
self.assertEqual(parser._get_bits(), '011111')
return
def test_e6(self):
parser = self.get_parser('11001')
parser._do_pass()
self.assertEqual(parser._curpos, 4)
parser._do_vertical(0)
self.assertEqual(parser._curpos, 5)
self.assertEqual(parser._get_bits(), '11111')
return
def test_e7(self):
parser = self.get_parser('0000000000')
parser._curpos = 2
parser._color = 1
parser._do_horizontal(2, 6)
self.assertEqual(parser._curpos, 10)
self.assertEqual(parser._get_bits(), '1111000000')
return
def test_e8(self):
parser = self.get_parser('001100000')
parser._curpos = 1
parser._color = 0
parser._do_vertical(0)
self.assertEqual(parser._curpos, 2)
parser._do_horizontal(7, 0)
self.assertEqual(parser._curpos, 9)
self.assertEqual(parser._get_bits(), '101111111')
return
def test_m1(self):
parser = self.get_parser('10101')
parser._do_pass()
self.assertEqual(parser._curpos, 2)
parser._do_pass()
self.assertEqual(parser._curpos, 4)
self.assertEqual(parser._get_bits(), '1111')
return
def test_m2(self):
parser = self.get_parser('101011')
parser._do_vertical(-1)
parser._do_vertical(-1)
parser._do_vertical(1)
parser._do_horizontal(1, 1)
self.assertEqual(parser._get_bits(), '011101')
return
def test_m3(self):
parser = self.get_parser('10111011')
parser._do_vertical(-1)
parser._do_pass()
parser._do_vertical(1)
parser._do_vertical(1)
self.assertEqual(parser._get_bits(), '00000001')
return
## CCITTFaxDecoder
##
def test___init__(self):
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
raise SkipTest # TODO: implement your test here
def test_feedbytes(self):
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
# assert_equal(expected, c_citt_g4_parser.feedbytes(data))
raise SkipTest # TODO: implement your test here
def test_output_line(self):
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
# assert_equal(expected, c_citt_g4_parser.output_line(y, bits))
raise SkipTest # TODO: implement your test here
def test_reset(self):
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
# assert_equal(expected, c_citt_g4_parser.reset())
raise SkipTest # TODO: implement your test here
class CCITTFaxDecoder(CCITTG4Parser):

View File

@ -96,14 +96,6 @@ class LZWDecoder(object):
# lzwdecode
def lzwdecode(data):
"""
>>> lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
"""
fp = BytesIO(data)
s=LZWDecoder(fp).run()
return b''.join(s)
if __name__ == '__main__':
import doctest
doctest.testmod()

View File

@ -28,6 +28,8 @@ from .utils import nunpack
from .utils import choplist
from .utils import isnumber
import six #Python 2+3 compatibility
def get_widths(seq):
widths = {}
@ -492,7 +494,7 @@ class PDFFont(object):
return False
def decode(self, bytes):
return map(ord, bytes)
return [six.indexbytes(bytes, i) for i,_ in enumerate(bytes)] # map(ord, bytes)
def get_ascent(self):
return self.ascent * self.vscale
@ -630,7 +632,7 @@ class PDFType3Font(PDFSimpleFont):
# PDFCIDFont
class PDFCIDFont(PDFFont):
def __init__(self, rsrcmgr, spec):
def __init__(self, rsrcmgr, spec, STRICT=False):
try:
self.basefont = literal_name(spec['BaseFont'])
except KeyError:
@ -684,10 +686,10 @@ class PDFCIDFont(PDFFont):
if self.vertical:
# writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', [])))
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in six.iteritems(widths))
(vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None, vy)
widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
widths = dict((cid, w) for (cid, (w, _)) in six.iteritems(widths))
default_width = w
else:
# writing mode: horizontal

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,8 @@
# * public domain *
#
import six #Python 2+3 compatibility
def rldecode(data):
"""
RunLength decoder (Adobe version) implementation based on PDF Reference
@ -19,30 +21,24 @@ def rldecode(data):
129 to 255, the following single byte is to be copied 257 - length
(2 to 128) times during decompression. A length value of 128
denotes EOD.
>>> s = b'\x05123456\xfa7\x04abcde\x80junk'
>>> rldecode(s)
'1234567777777abcde'
"""
decoded = []
decoded = b''
i = 0
while i < len(data):
#print 'data[%d]=:%d:' % (i,ord(data[i]))
length = ord(data[i])
length = six.indexbytes(data,i)
if length == 128:
break
if length >= 0 and length < 128:
run = data[i+1:(i+1)+(length+1)]
for j in range(i+1,(i+1)+(length+1)):
decoded+=six.int2byte(six.indexbytes(data,j))
#print 'length=%d, run=%s' % (length+1,run)
decoded.append(run)
i = (i+1) + (length+1)
if length > 128:
run = data[i+1]*(257-length)
run = six.int2byte(six.indexbytes(data,i+1))*(257-length)
#print 'length=%d, run=%s' % (257-length,run)
decoded.append(run)
decoded+=run
i = (i+1) + 1
return b''.join(decoded)
return decoded
if __name__ == '__main__':
import doctest
doctest.testmod()

View File

@ -0,0 +1,167 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nose.tools import assert_equal, assert_true, assert_false
from nose import SkipTest
import nose
import logging
from pdfminer.ccitt import *
## Test cases
##
class TestCCITTG4Parser:
    """nose test cases driving CCITTG4Parser's line-coding primitives.

    Each case seeds a parser with a line of bits, invokes the private
    ``_do_vertical`` / ``_do_pass`` / ``_do_horizontal`` operations and
    checks the resulting cursor position (``_curpos``), colour (``_color``)
    and decoded bits (``_get_bits()``).
    """

    def get_parser(self, bits):
        # Build a parser as wide as the bit string, install the bits as its
        # current line and call _reset_line() before handing it to a test.
        # NOTE(review): _reset_line() presumably promotes the current line to
        # the reference line -- confirm in pdfminer.ccitt.
        g4 = CCITTG4Parser(len(bits))
        g4._curline = list(map(int, bits))
        g4._reset_line()
        return g4

    def test_b1(self):
        g4 = self.get_parser('00000')
        g4._do_vertical(0)
        assert_equal(g4._curpos, 0)

    def test_b2(self):
        g4 = self.get_parser('10000')
        g4._do_vertical(-1)
        assert_equal(g4._curpos, 0)

    def test_b3(self):
        g4 = self.get_parser('000111')
        g4._do_pass()
        assert_equal(g4._curpos, 3)
        assert_equal(g4._get_bits(), '111')

    def test_b4(self):
        g4 = self.get_parser('00000')
        g4._do_vertical(2)
        assert_equal(g4._curpos, 2)
        assert_equal(g4._get_bits(), '11')

    def test_b5(self):
        g4 = self.get_parser('11111111100')
        g4._do_horizontal(0, 3)
        assert_equal(g4._curpos, 3)
        g4._do_vertical(1)
        assert_equal(g4._curpos, 10)
        assert_equal(g4._get_bits(), '0001111111')

    def test_e1(self):
        g4 = self.get_parser('10000')
        g4._do_vertical(0)
        assert_equal(g4._curpos, 1)
        g4._do_vertical(0)
        assert_equal(g4._curpos, 5)
        assert_equal(g4._get_bits(), '10000')

    def test_e2(self):
        g4 = self.get_parser('10011')
        g4._do_vertical(0)
        assert_equal(g4._curpos, 1)
        g4._do_vertical(2)
        assert_equal(g4._curpos, 5)
        assert_equal(g4._get_bits(), '10000')

    def test_e3(self):
        g4 = self.get_parser('011111')
        # Start from colour 0 and watch it flip after each vertical step.
        g4._color = 0
        g4._do_vertical(0)
        assert_equal(g4._color, 1)
        assert_equal(g4._curpos, 1)
        g4._do_vertical(-2)
        assert_equal(g4._color, 0)
        assert_equal(g4._curpos, 4)
        g4._do_vertical(0)
        assert_equal(g4._curpos, 6)
        assert_equal(g4._get_bits(), '011100')

    def test_e4(self):
        g4 = self.get_parser('10000')
        g4._do_vertical(0)
        assert_equal(g4._curpos, 1)
        g4._do_vertical(-2)
        assert_equal(g4._curpos, 3)
        g4._do_vertical(0)
        assert_equal(g4._curpos, 5)
        assert_equal(g4._get_bits(), '10011')

    def test_e5(self):
        g4 = self.get_parser('011000')
        g4._color = 0
        g4._do_vertical(0)
        assert_equal(g4._curpos, 1)
        g4._do_vertical(3)
        assert_equal(g4._curpos, 6)
        assert_equal(g4._get_bits(), '011111')

    def test_e6(self):
        g4 = self.get_parser('11001')
        g4._do_pass()
        assert_equal(g4._curpos, 4)
        g4._do_vertical(0)
        assert_equal(g4._curpos, 5)
        assert_equal(g4._get_bits(), '11111')

    def test_e7(self):
        g4 = self.get_parser('0000000000')
        # Force the cursor and colour before the horizontal run.
        g4._curpos = 2
        g4._color = 1
        g4._do_horizontal(2, 6)
        assert_equal(g4._curpos, 10)
        assert_equal(g4._get_bits(), '1111000000')

    def test_e8(self):
        g4 = self.get_parser('001100000')
        g4._curpos = 1
        g4._color = 0
        g4._do_vertical(0)
        assert_equal(g4._curpos, 2)
        g4._do_horizontal(7, 0)
        assert_equal(g4._curpos, 9)
        assert_equal(g4._get_bits(), '101111111')

    def test_m1(self):
        g4 = self.get_parser('10101')
        g4._do_pass()
        assert_equal(g4._curpos, 2)
        g4._do_pass()
        assert_equal(g4._curpos, 4)
        assert_equal(g4._get_bits(), '1111')

    def test_m2(self):
        g4 = self.get_parser('101011')
        g4._do_vertical(-1)
        g4._do_vertical(-1)
        g4._do_vertical(1)
        g4._do_horizontal(1, 1)
        assert_equal(g4._get_bits(), '011101')

    def test_m3(self):
        g4 = self.get_parser('10111011')
        g4._do_vertical(-1)
        g4._do_pass()
        g4._do_vertical(1)
        g4._do_vertical(1)
        assert_equal(g4._get_bits(), '00000001')
# When executed as a script, hand this module to nose's runner.
if __name__ == '__main__':
    nose.runmodule()

View File

@ -0,0 +1,52 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nose.tools import assert_equal
from nose import SkipTest
import nose
#test of various compression/encoding modules (previously in doctests):
from pdfminer.ascii85 import *
from pdfminer.arcfour import *
from pdfminer.lzw import *
from pdfminer.runlength import *
from pdfminer.rijndael import *
import binascii
def hex(b):
    """Return the hexadecimal representation of the bytes *b*, as bytes.

    Portable stand-in for the Python 2 only ``b.encode('hex')``.
    """
    hexdigits = binascii.b2a_hex(b)
    return hexdigits
def dehex(b):
    """Return the raw bytes encoded by the hexadecimal string *b*.

    Portable stand-in for the Python 2 only ``b.decode('hex')``.
    """
    raw = binascii.a2b_hex(b)
    return raw
class TestAscii85:
    """Checks ascii85decode and asciihexdecode against known samples."""

    def test_ascii85decode(self):
        # The sample string is taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
        samples = (
            (b'9jqo^BlbD-BleB1DJ+*+F(f,q', b'Man is distinguished'),
            (b'E,9)oF*2M7/c~>', b'pleasure.'),
        )
        for encoded, expected in samples:
            assert_equal(ascii85decode(encoded), expected)

    def test_asciihexdecode(self):
        # Covers whitespace between digit pairs and a trailing odd digit
        # before the '>' marker.
        samples = (
            (b'61 62 2e6364 65', b'ab.cde'),
            (b'61 62 2e6364 657>', b'ab.cdep'),
            (b'7>', b'p'),
        )
        for encoded, expected in samples:
            assert_equal(asciihexdecode(encoded), expected)
class TestArcfour:
    """Checks Arcfour.process against known key/plaintext/ciphertext triples."""

    def test(self):
        # Each row: (key, plaintext, expected ciphertext as hex digits).
        vectors = (
            (b'Key', b'Plaintext', b'bbf316e8d940af0ad3'),
            (b'Wiki', b'pedia', b'1021bf0420'),
            (b'Secret', b'Attack at dawn', b'45a01f645fc35b383552544b9bf5'),
        )
        for key, plaintext, expected in vectors:
            ciphertext = Arcfour(key).process(plaintext)
            assert_equal(hex(ciphertext), expected)
class TestLzw:
    """Checks lzwdecode against a fixed compressed sample."""

    def test_lzwdecode(self):
        compressed = b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
        expected = b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
        assert_equal(lzwdecode(compressed), expected)
class TestRunlength:
    """Checks rldecode against a fixed run-length encoded sample."""

    def test_rldecode(self):
        # Literal run, repeated byte, literal run, then the 0x80 byte
        # followed by data that must not appear in the output.
        encoded = b'\x05123456\xfa7\x04abcde\x80junk'
        expected = b'1234567777777abcde'
        assert_equal(rldecode(encoded), expected)
class TestRijndaelEncryptor:
    """Checks RijndaelEncryptor.encrypt against one known 128-bit vector."""

    def test_RijndaelEncryptor(self):
        key = dehex(b'00010203050607080a0b0c0d0f101112')
        plaintext = dehex(b'506812a45f08c889b97f5980038b8359')
        encryptor = RijndaelEncryptor(key, 128)
        ciphertext = encryptor.encrypt(plaintext)
        assert_equal(hex(ciphertext), b'd8f532538289ef7d06b506a4fd5be9c9')
# When executed as a script, hand this module to nose's runner.
if __name__ == '__main__':
    nose.runmodule()

View File

@ -102,6 +102,6 @@ func/a/b{(c)do*}def
return
if __name__ == '__main__':
import logging,sys,os,six
logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
#import logging,sys,os,six
#logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
nose.runmodule()

View File

@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest, logging, os
import nose, logging, os
import tools.dumppdf as dumppdf
@ -16,7 +16,7 @@ def run(datapath,filename,options=None):
s='dumppdf -o%s %s'%(o,i)
dumppdf.main(s.split(' '))
class TestDumpPDF(unittest.TestCase):
class TestDumpPDF():
def test_1(self):
@ -41,6 +41,6 @@ class TestDumpPDF(unittest.TestCase):
run('../samples/nonfree/','naacl06-shinyama','-t -a')
if __name__ == '__main__':
import logging,sys,os,six
logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
unittest.main()
#import logging,sys,os,six
#logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
nose.runmodule()

View File

@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest, logging, os
import nose, logging, os
import tools.pdf2txt as pdf2txt
@ -16,7 +16,7 @@ def run(datapath,filename,options=None):
s='pdf2txt -o%s %s'%(o,i)
pdf2txt.main(s.split(' '))
class TestDumpPDF(unittest.TestCase):
class TestDumpPDF():
def test_1(self):
@ -41,4 +41,4 @@ class TestDumpPDF(unittest.TestCase):
run('../samples/nonfree/','naacl06-shinyama')
if __name__ == '__main__':
unittest.main()
nose.runmodule()