From a6475b61b45e5badd329ab9375a421bc8f273e22 Mon Sep 17 00:00:00 2001
From: unknown <GUGLIE0P@BSALCZC14262ZS.grpdom.dsbgrp.com>
Date: Wed, 3 Sep 2014 13:17:41 +0200
Subject: [PATCH] Python 3.4 support added and tested

---
 pdfminer/ascii85.py             |   5 +-
 pdfminer/lzw.py                 |   7 ++-
 pdfminer/pdfdocument.py         |   2 +-
 pdfminer/pdftypes.py            |   5 +-
 pdfminer/utils.py               |   4 +-
 setup.py                        |   3 +
 tests/test_pdfminer_psparser.py | 107 ++++++++++++++++++++++++++++++++
 tests/test_tools_dumppdf.py     |  46 ++++++++++++++
 tools/dumppdf.py                |  33 +++++-----
 9 files changed, 188 insertions(+), 24 deletions(-)
 create mode 100644 tests/test_pdfminer_psparser.py
 create mode 100644 tests/test_tools_dumppdf.py

diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py
index 067fccd..45c83ce 100644
--- a/pdfminer/ascii85.py
+++ b/pdfminer/ascii85.py
@@ -9,6 +9,8 @@ This code is in the public domain.
 import re
 import struct
 
+import six #Python 2+3 compatibility
+
 
 # ascii85decode(data)
 def ascii85decode(data):
@@ -31,7 +33,8 @@ def ascii85decode(data):
     """
     n = b = 0
     out = b''
-    for c in data:
+    for i in six.iterbytes(data):
+        c=six.int2byte(i)
         if b'!' <= c and c <= b'u':
             n += 1
             b = b*85+(ord(c)-33)
diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py
index aef3667..9a4303a 100644
--- a/pdfminer/lzw.py
+++ b/pdfminer/lzw.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 from io import BytesIO
 
+import six  #Python 2+3 compatibility
+
 
 class CorruptDataError(Exception):
     pass
@@ -47,7 +49,7 @@ class LZWDecoder(object):
     def feed(self, code):
         x = b''
         if code == 256:
-            self.table = [chr(c) for c in xrange(256)]  # 0-255
+            self.table = [six.int2byte(c) for c in range(256)]  # 0-255
             self.table.append(None)  # 256
             self.table.append(None)  # 257
             self.prevbuf = b''
@@ -99,7 +101,8 @@ def lzwdecode(data):
     '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
     """
     fp = BytesIO(data)
-    return b''.join(LZWDecoder(fp).run())
+    s=LZWDecoder(fp).run()
+    return b''.join(s)
 
 if __name__ == '__main__':
     import doctest
diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
index a9fb717..c892414 100644
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -180,7 +180,7 @@ class PDFXRefFallback(PDFXRef):
                 logging.info('trailer: %r' % self.get_trailer())
                 break
             if six.PY3:
-                line=line.decode('utf-8')
+                line=line.decode('latin-1') #latin-1 maps every byte to one character, so decoding never fails
             m = self.PDFOBJ_CUE.match(line)
             if not m:
                 continue
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
index 0bb942c..18236a2 100644
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -12,6 +12,7 @@ from .psparser import STRICT
 from .utils import apply_png_predictor
 from .utils import isnumber
 
+import six #Python 2+3 compatibility
 
 LITERAL_CRYPT = LIT('Crypt')
 
@@ -105,7 +106,7 @@ def decipher_all(decipher, objid, genno, x):
     if isinstance(x, list):
         x = [decipher_all(decipher, objid, genno, v) for v in x]
     elif isinstance(x, dict):
-        for (k, v) in x.iteritems():
+        for (k, v) in six.iteritems(x):
             x[k] = decipher_all(decipher, objid, genno, v)
     return x
 
@@ -140,7 +141,7 @@ def num_value(x):
 
 def str_value(x):
     x = resolve1(x)
-    if not isinstance(x, str):
+    if not isinstance(x, six.binary_type):
         if STRICT:
             raise PDFTypeError('String required: %r' % x)
         return ''
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 7b1e6b3..2514949 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -17,7 +17,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
     i = 0
     buf = b''
     line0 = b'\x00' * columns
-    for i in xrange(0, len(data), nbytes+1):
+    for i in range(0, len(data), nbytes+1):
         ft = data[i]
         i += 1
         line1 = data[i:i+nbytes]
@@ -90,7 +90,7 @@ def apply_matrix_norm(m, v):
 
 # isnumber
 def isnumber(x):
-    return isinstance(x, (int, long, float))
+    return isinstance(x, (six.integer_types, float))  # integer_types is (int, long) on Py2, (int,) on Py3
 
 # uniq
 def uniq(objs):
diff --git a/setup.py b/setup.py
index c9962fe..ed50fb3 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
     ],
     keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'],
     classifiers=[
+    'Programming Language :: Python',
+    'Programming Language :: Python :: 2.7',
+    'Programming Language :: Python :: 3.4',
     'Development Status :: 4 - Beta',
     'Environment :: Console',
     'Intended Audience :: Developers',
diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py
new file mode 100644
index 0000000..4188835
--- /dev/null
+++ b/tests/test_pdfminer_psparser.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from nose.tools import assert_equal, assert_true, assert_false
+from nose import SkipTest
+import nose
+
+import logging
+
+from pdfminer.psparser import *
+
+##  Simplistic Test cases
+##
+class TestPSBaseParser:
+    """Compares PSBaseParser tokens and PSStackParser objects for TESTDATA with expected lists."""
+    TESTDATA = br'''%!PS
+begin end
+ "  @ #
+/a/BCD /Some_Name /foo#5f#xbaa
+0 +1 -2 .5 1.234
+(abc) () (abc ( def ) ghi)
+(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
+(this % is not a comment.)
+(foo
+baa)
+(foo\
+baa)
+<> <20> < 40 4020 >
+<abcd00
+12345>
+func/a/b{(c)do*}def
+[ 1 (z) ! ]
+<< /foo (bar) >>
+'''
+
+    TOKENS = [  # expected (byte offset in TESTDATA, token) pairs, compared in test_1
+      (5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
+      (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
+      (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
+      (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
+      (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
+      (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
+      (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
+      (226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
+      (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
+      (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
+      (256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
+      (272, KWD(b'>>'))
+    ]
+
+    OBJS = [  # expected (byte offset in TESTDATA, object) pairs, compared in test_2
+      (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
+      (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
+      (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
+      (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
+      (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
+      (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
+      (230, LIT('a')), (232, LIT('b')), (234, [b'c']), (246, [1, b'z']),
+      (258, {'foo': b'bar'}),
+    ]
+
+    def get_tokens(self, s):  # returns the list of nexttoken() results for bytes s, up to PSEOF
+        from io import BytesIO
+
+        class MyParser(PSBaseParser):
+            def flush(self):
+                self.add_results(*self.popall())
+        parser = MyParser(BytesIO(s))
+        r = []
+        try:
+            while True:
+                r.append(parser.nexttoken())
+        except PSEOF:  # end of input is the normal way out of the loop
+            pass
+        return r
+
+    def get_objects(self, s):  # returns the list of nextobject() results for bytes s, up to PSEOF
+        from io import BytesIO
+
+        class MyParser(PSStackParser):
+            def flush(self):
+                self.add_results(*self.popall())
+        parser = MyParser(BytesIO(s))
+        r = []
+        try:
+            while True:
+                r.append(parser.nextobject())
+        except PSEOF:  # end of input is the normal way out of the loop
+            pass
+        return r
+
+    def test_1(self):  # token list for TESTDATA must equal TOKENS
+        tokens = self.get_tokens(self.TESTDATA)
+        logging.info(tokens)
+        assert_equal(tokens, self.TOKENS)
+        return
+
+    def test_2(self):  # object list for TESTDATA must equal OBJS
+        objs = self.get_objects(self.TESTDATA)
+        logging.info(objs)
+        assert_equal(objs, self.OBJS)
+        return
+
+if __name__ == '__main__':  # direct run: DEBUG log to "<script>_<major>.<minor>.log", then run nose
+    import logging,sys,os,six
+    logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
+    nose.runmodule()
\ No newline at end of file
diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py
new file mode 100644
index 0000000..8eec67f
--- /dev/null
+++ b/tests/test_tools_dumppdf.py
@@ -0,0 +1,46 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import unittest, logging, os
+
+import tools.dumppdf as dumppdf
+
+path=os.path.dirname(os.path.abspath(__file__))+'/'
+
+def run(datapath,filename,options=None):
+    """Run dumppdf.main() on <datapath><filename>.pdf, writing <filename>.xml beside this file."""
+    i=path+datapath+filename+'.pdf'
+    o=path+filename+'.xml'
+    # Pass argv as a list: splitting a joined command line on ' ' would
+    # break any path that contains a space.
+    argv=['dumppdf','-o'+o]+(options.split() if options else [])+[i]
+    dumppdf.main(argv)
+
+class TestDumpPDF(unittest.TestCase):
+    """Smoke tests: each case only checks that dumppdf.main() does not raise."""
+
+    def test_1(self):
+        # Samples read from ../samples/.
+        for name in ('jo','simple1','simple2','simple3'):
+            run('../samples/',name,'-t -a')
+
+    # The remaining cases read PDFs from ../samples/nonfree/.
+    def test_2(self):
+        run('../samples/nonfree/','dmca','-t -a')
+
+    def test_3(self):
+        run('../samples/nonfree/','f1040nr')
+
+    def test_4(self):
+        run('../samples/nonfree/','i1040nr')
+
+    def test_5(self):
+        run('../samples/nonfree/','kampo','-t -a')
+
+    def test_6(self):
+        run('../samples/nonfree/','naacl06-shinyama','-t -a')
+
+if __name__ == '__main__':  # direct run: DEBUG log to "<script>_<major>.<minor>.log", then run unittest
+    import logging,sys,os,six
+    logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
+    unittest.main()
\ No newline at end of file
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index 29a1144..f33296a 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -6,7 +6,7 @@
 #  options:
 #    -i objid : object id
 #
-import sys, os.path, re
+import sys, os.path, re, logging
 from pdfminer.psparser import PSKeyword, PSLiteral, LIT
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
@@ -18,8 +18,12 @@ from pdfminer.utils import isnumber
 
 ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
 def e(s):
+    if six.PY3 and isinstance(s,six.binary_type):  # Py2 str is already text; str(s, enc) raises TypeError there
+        s=s.decode('latin-1')  # latin-1 maps bytes 1:1 onto the code points ESC_PAT matches
     return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
 
+import six # Python 2+3 compatibility
+
 
 # dumpxml
 def dumpxml(out, obj, codec=None):
@@ -29,7 +33,7 @@ def dumpxml(out, obj, codec=None):
 
     if isinstance(obj, dict):
         out.write('<dict size="%d">\n' % len(obj))
-        for (k,v) in obj.iteritems():
+        for (k,v) in six.iteritems(obj):
             out.write('<key>%s</key>\n' % k)
             out.write('<value>')
             dumpxml(out, v)
@@ -45,7 +49,7 @@ def dumpxml(out, obj, codec=None):
         out.write('</list>')
         return
 
-    if isinstance(obj, str):
+    if isinstance(obj, (six.string_types, six.binary_type)):
         out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
         return
 
@@ -113,7 +117,7 @@ def dumpallobjs(out, doc, codec=None):
 # dumpoutline
 def dumpoutline(outfp, fname, objids, pagenos, password='',
                 dumpall=False, codec=None, extractdir=None):
-    fp = file(fname, 'rb')
+    fp = open(fname, 'rb')
     parser = PDFParser(fp)
     doc = PDFDocument(parser, password)
     pages = dict( (page.pageid, pageno) for (pageno,page)
@@ -183,7 +187,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
         out.close()
         return
 
-    fp = file(fname, 'rb')
+    fp = open(fname, 'rb')
     parser = PDFParser(fp)
     doc = PDFDocument(parser, password)
     for xref in doc.xrefs:
@@ -191,12 +195,13 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
             obj = doc.getobj(objid)
             if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                 extract1(obj)
+    fp.close()
     return
 
 # dumppdf
 def dumppdf(outfp, fname, objids, pagenos, password='',
             dumpall=False, codec=None, extractdir=None):
-    fp = file(fname, 'rb')
+    fp = open(fname, 'rb')
     parser = PDFParser(fp)
     doc = PDFDocument(parser, password)
     if objids:
@@ -229,22 +234,21 @@ def main(argv):
         print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
         return 100
     try:
-        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:')
+        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
     except getopt.GetoptError:
         return usage()
     if not args: return usage()
-    debug = 0
     objids = []
     pagenos = set()
     codec = None
-    password = ''
+    password = b''
     dumpall = False
     proc = dumppdf
     outfp = sys.stdout
     extractdir = None
     for (k, v) in opts:
-        if k == '-d': debug += 1
-        elif k == '-o': outfp = file(v, 'wb')
+        if k == '-d': logging.getLogger().setLevel(logging.DEBUG)  # -d: DEBUG on the root logger (method is setLevel)
+        elif k == '-o': outfp = open(v, 'w')
         elif k == '-i': objids.extend( int(x) for x in v.split(',') )
         elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
         elif k == '-P': password = v
@@ -256,13 +260,10 @@ def main(argv):
         elif k == '-E':
             extractdir = v
             proc = extractembedded
-    #
-    PDFDocument.debug = debug
-    PDFParser.debug = debug
-    #
+
     for fname in args:
         proc(outfp, fname, objids, pagenos, password=password,
              dumpall=dumpall, codec=codec, extractdir=extractdir)
-    return
+    if outfp is not sys.stdout: outfp.close()  # close only a file opened via -o; leave stdout usable for the caller
 
 if __name__ == '__main__': sys.exit(main(sys.argv))