From a6475b61b45e5badd329ab9375a421bc8f273e22 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 3 Sep 2014 13:17:41 +0200 Subject: [PATCH] Python 3.4 support added and tested --- pdfminer/ascii85.py | 5 +- pdfminer/lzw.py | 7 ++- pdfminer/pdfdocument.py | 2 +- pdfminer/pdftypes.py | 5 +- pdfminer/utils.py | 4 +- setup.py | 3 + tests/test_pdfminer_psparser.py | 107 ++++++++++++++++++++++++++++++++ tests/test_tools_dumppdf.py | 46 ++++++++++++++ tools/dumppdf.py | 33 +++++----- 9 files changed, 188 insertions(+), 24 deletions(-) create mode 100644 tests/test_pdfminer_psparser.py create mode 100644 tests/test_tools_dumppdf.py diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py index 067fccd..45c83ce 100644 --- a/pdfminer/ascii85.py +++ b/pdfminer/ascii85.py @@ -9,6 +9,8 @@ This code is in the public domain. import re import struct +import six #Python 2+3 compatibility + # ascii85decode(data) def ascii85decode(data): @@ -31,7 +33,8 @@ def ascii85decode(data): """ n = b = 0 out = b'' - for c in data: + for i in six.iterbytes(data): + c=six.int2byte(i) if b'!' 
<= c and c <= b'u': n += 1 b = b*85+(ord(c)-33) diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index aef3667..9a4303a 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -1,6 +1,8 @@ #!/usr/bin/env python from io import BytesIO +import six #Python 2+3 compatibility + class CorruptDataError(Exception): pass @@ -47,7 +49,7 @@ class LZWDecoder(object): def feed(self, code): x = b'' if code == 256: - self.table = [chr(c) for c in xrange(256)] # 0-255 + self.table = [six.int2byte(c) for c in range(256)] # 0-255 self.table.append(None) # 256 self.table.append(None) # 257 self.prevbuf = b'' @@ -99,7 +101,8 @@ def lzwdecode(data): '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' """ fp = BytesIO(data) - return b''.join(LZWDecoder(fp).run()) + s=LZWDecoder(fp).run() + return b''.join(s) if __name__ == '__main__': import doctest diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index a9fb717..c892414 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -180,7 +180,7 @@ class PDFXRefFallback(PDFXRef): logging.info('trailer: %r' % self.get_trailer()) break if six.PY3: - line=line.decode('utf-8') + line=line.decode('latin-1') #default pdf encoding m = self.PDFOBJ_CUE.match(line) if not m: continue diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 0bb942c..18236a2 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -12,6 +12,7 @@ from .psparser import STRICT from .utils import apply_png_predictor from .utils import isnumber +import six #Python 2+3 compatibility LITERAL_CRYPT = LIT('Crypt') @@ -105,7 +106,7 @@ def decipher_all(decipher, objid, genno, x): if isinstance(x, list): x = [decipher_all(decipher, objid, genno, v) for v in x] elif isinstance(x, dict): - for (k, v) in x.iteritems(): + for (k, v) in six.iteritems(x): x[k] = decipher_all(decipher, objid, genno, v) return x @@ -140,7 +141,7 @@ def num_value(x): def str_value(x): x = resolve1(x) - if not isinstance(x, str): + if not isinstance(x, six.binary_type): if STRICT: 
raise PDFTypeError('String required: %r' % x) return '' diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 7b1e6b3..2514949 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -17,7 +17,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data): i = 0 buf = b'' line0 = b'\x00' * columns - for i in xrange(0, len(data), nbytes+1): + for i in range(0, len(data), nbytes+1): ft = data[i] i += 1 line1 = data[i:i+nbytes] @@ -90,7 +90,7 @@ def apply_matrix_norm(m, v): # isnumber def isnumber(x): - return isinstance(x, (int, long, float)) + return isinstance(x, (six.integer_types, float)) # uniq def uniq(objs): diff --git a/setup.py b/setup.py index c9962fe..ed50fb3 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,9 @@ PDF parser that can be used for other purposes instead of text analysis.''', ], keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'], classifiers=[ + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', 'Development Status :: 4 - Beta', 'Environment :: Console', 'Intended Audience :: Developers', diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py new file mode 100644 index 0000000..4188835 --- /dev/null +++ b/tests/test_pdfminer_psparser.py @@ -0,0 +1,107 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from nose.tools import assert_equal, assert_true, assert_false +from nose import SkipTest +import nose + +import logging + +from pdfminer.psparser import * + +## Simplistic Test cases +## +class TestPSBaseParser: + + TESTDATA = br'''%!PS +begin end + " @ # +/a/BCD /Some_Name /foo#5f#xbaa +0 +1 -2 .5 1.234 +(abc) () (abc ( def ) ghi) +(def\040\0\0404ghi) (bach\\slask) (foo\nbaa) +(this % is not a comment.) +(foo +baa) +(foo\ +baa) +<> <20> < 40 4020 > + +func/a/b{(c)do*}def +[ 1 (z) ! 
] +<< /foo (bar) >> +''' + + TOKENS = [ + (5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')), + (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), + (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), + (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'), + (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'), + (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'), + (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'), + (226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')), + (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')), + (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')), + (256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'), + (272, KWD(b'>>')) + ] + + OBJS = [ + (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), + (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), + (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'), + (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'), + (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'), + (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'), + (230, LIT('a')), (232, LIT('b')), (234, [b'c']), (246, [1, b'z']), + (258, {'foo': b'bar'}), + ] + + def get_tokens(self, s): + from io import BytesIO + + class MyParser(PSBaseParser): + def flush(self): + self.add_results(*self.popall()) + parser = MyParser(BytesIO(s)) + r = [] + try: + while True: + r.append(parser.nexttoken()) + except PSEOF: + pass + return r + + def get_objects(self, s): + from io import BytesIO + + class MyParser(PSStackParser): + def flush(self): + self.add_results(*self.popall()) + parser = MyParser(BytesIO(s)) + r = [] + try: + while True: + r.append(parser.nextobject()) + except PSEOF: + pass + return r + + def test_1(self): + tokens = self.get_tokens(self.TESTDATA) + logging.info(tokens) + 
assert_equal(tokens, self.TOKENS) + return + + def test_2(self): + objs = self.get_objects(self.TESTDATA) + logging.info(objs) + assert_equal(objs, self.OBJS) + return + +if __name__ == '__main__': + import logging,sys,os,six + logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1])) + nose.runmodule() \ No newline at end of file diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py new file mode 100644 index 0000000..8eec67f --- /dev/null +++ b/tests/test_tools_dumppdf.py @@ -0,0 +1,46 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import unittest, logging, os + +import tools.dumppdf as dumppdf + +path=os.path.dirname(os.path.abspath(__file__))+'/' + +def run(datapath,filename,options=None): + i=path+datapath+filename+'.pdf' + o=path+filename+'.xml' + if options: + s='dumppdf -o%s %s %s'%(o,options,i) + else: + s='dumppdf -o%s %s'%(o,i) + dumppdf.main(s.split(' ')) + +class TestDumpPDF(unittest.TestCase): + + + def test_1(self): + run('../samples/','jo','-t -a') + run('../samples/','simple1','-t -a') + run('../samples/','simple2','-t -a') + run('../samples/','simple3','-t -a') + + def test_2(self): + run('../samples/nonfree/','dmca','-t -a') + + def test_3(self): + run('../samples/nonfree/','f1040nr') + + def test_4(self): + run('../samples/nonfree/','i1040nr') + + def test_5(self): + run('../samples/nonfree/','kampo','-t -a') + + def test_6(self): + run('../samples/nonfree/','naacl06-shinyama','-t -a') + +if __name__ == '__main__': + import logging,sys,os,six + logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1])) + unittest.main() \ No newline at end of file diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 29a1144..f33296a 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -6,7 +6,7 @@ # options: # -i objid : object id # -import sys, os.path, re +import sys, os.path, re, 
logging from pdfminer.psparser import PSKeyword, PSLiteral, LIT from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines @@ -18,8 +18,12 @@ from pdfminer.utils import isnumber ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') def e(s): + if isinstance(s,six.binary_type): + s=s.decode('latin-1') return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s) +import six # Python 2+3 compatibility + # dumpxml def dumpxml(out, obj, codec=None): @@ -29,7 +33,7 @@ def dumpxml(out, obj, codec=None): if isinstance(obj, dict): out.write('\n' % len(obj)) - for (k,v) in obj.iteritems(): + for (k,v) in six.iteritems(obj): out.write('%s\n' % k) out.write('') dumpxml(out, v) @@ -45,7 +49,7 @@ def dumpxml(out, obj, codec=None): out.write('') return - if isinstance(obj, str): + if isinstance(obj, (six.string_types, six.binary_type)): out.write('%s' % (len(obj), e(obj))) return @@ -113,7 +117,7 @@ def dumpallobjs(out, doc, codec=None): # dumpoutline def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None): - fp = file(fname, 'rb') + fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) pages = dict( (page.pageid, pageno) for (pageno,page) @@ -183,7 +187,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='', out.close() return - fp = file(fname, 'rb') + fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) for xref in doc.xrefs: @@ -191,12 +195,13 @@ def extractembedded(outfp, fname, objids, pagenos, password='', obj = doc.getobj(objid) if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC: extract1(obj) + fp.close() return # dumppdf def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None): - fp = file(fname, 'rb') + fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) if objids: @@ -229,22 +234,21 @@ def main(argv): print ('usage: %s 
[-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:') + (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:') except getopt.GetoptError: return usage() if not args: return usage() - debug = 0 objids = [] pagenos = set() codec = None - password = '' + password = b'' dumpall = False proc = dumppdf outfp = sys.stdout extractdir = None for (k, v) in opts: - if k == '-d': debug += 1 - elif k == '-o': outfp = file(v, 'wb') + if k == '-d': logging.getLogger().setLevel(logging.DEBUG) + elif k == '-o': outfp = open(v, 'w') elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-P': password = v @@ -256,13 +260,10 @@ def main(argv): elif k == '-E': extractdir = v proc = extractembedded - # - PDFDocument.debug = debug - PDFParser.debug = debug - # + for fname in args: proc(outfp, fname, objids, pagenos, password=password, dumpall=dumpall, codec=codec, extractdir=extractdir) - return + if outfp is not sys.stdout: outfp.close() if __name__ == '__main__': sys.exit(main(sys.argv))