Python 3.4 support and tests

2014-09-03 15:26:08 +02:00 · 2014-09-03 15:26:08 +02:00 · 29c07ea770
parent a6475b61b4
commit 29c07ea770
11 changed files with 80 additions and 17 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -2,7 +2,10 @@ language: python
 python:
  - "2.6"
  - "2.7"
+  - "3.4"
 install:
+  - pip install six
  - pip install pycrypto
 script:
  - make test
+  - nosetests
--- a/README.md
+++ b/README.md
@ -34,7 +34,7 @@ Features
 How to Install
 --------------

- * Install Python 2.6 or newer. (**Python 3 is not supported.**)
+ * Install Python 2.6 or newer. (Python 3.4 is supported.)
 * Download the source code.
 * Unpack it.
 * Run `setup.py`:
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -31,6 +31,7 @@ from .encodingdb import name2unicode
 from .utils import choplist
 from .utils import nunpack

+import six #Python 2+3 compatibility

 class CMapError(Exception):
    pass
@ -92,8 +93,8 @@ class CMap(CMapBase):
    def decode(self, code):
        logging.debug('decode: %r, %r' % (self, code))
        d = self.code2cid
-        for c in code:
-            c = ord(c)
+        for i in six.iterbytes(code):
+            c = six.int2byte(i)
            if c in d:
                d = d[c]
                if isinstance(d, int):
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -21,6 +21,7 @@ from .utils import mult_matrix
 from .utils import enc
 from .utils import bbox2str

+import six # Python 2+3 compatibility

 ##  PDFLayoutAnalyzer
 ##
@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
        try:
            text = font.to_unichr(cid)
-            assert isinstance(text, unicode), text
+            assert isinstance(text, six.text_type), text
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
 ##
 class XMLConverter(PDFConverter):

-    CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
+    CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                 laparams=None, imagewriter=None, stripcontrol=False):
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@ -4,6 +4,7 @@ from .psparser import PSLiteral
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING

+import six # Python 2+3 compatibility

 STRIP_NAME = re.compile(r'[0-9]+')

@ -17,7 +18,7 @@ def name2unicode(name):
    m = STRIP_NAME.search(name)
    if not m:
        raise KeyError(name)
-    return unichr(int(m.group(0)))
+    return six.unichr(int(m.group(0)))


 ##  EncodingDB
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -9,6 +9,7 @@ from .utils import bbox2str
 from .utils import matrix2str
 from .utils import apply_matrix_pt

+import six # Python 2+3 compatibility

 ##  IndexAssigner
 ##
@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
        
        # XXX this still takes O(n^2)  :(
        dists = []
-        for i in xrange(len(boxes)):
+        for i in range(len(boxes)):
            obj1 = boxes[i]
-            for j in xrange(i+1, len(boxes)):
+            for j in range(i+1, len(boxes)):
                obj2 = boxes[j]
                dists.append((0, dist(obj1, obj2), obj1, obj2))
        # We could use dists.sort(), but it would randomize the test result.
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
 class PDFInterpreterError(PDFException):
    pass

+##  Constants
+##
+LITERAL_PDF = LIT('PDF')  # literal objects below are created once, at import time, via LIT()
+LITERAL_TEXT = LIT('Text')
+LITERAL_FONT = LIT('Font')
+LITERAL_FORM = LIT('Form')
+LITERAL_IMAGE = LIT('Image')

 ##  PDFTextState
 ##
@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
        while i <= len(target):
            self.fillbuf()
            if i:
-                c = self.buf[self.charpos]
+                c = six.indexbytes(self.buf,self.charpos)
+                c=six.int2byte(c)
                data += c
                self.charpos += 1
                if len(target) <= i and c.isspace():
@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
        for (k, v) in six.iteritems(dict_value(resources)):
            logging.debug('Resource: %r: %r' % (k, v))
            if k == 'Font':
-                for (fontid, spec) in dict_value(v).iteritems():
+                for (fontid, spec) in six.iteritems(dict_value(v)):
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
            elif k == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(v))
            elif k == 'XObject':
-                for (xobjid, xobjstrm) in dict_value(v).iteritems():
+                for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
                    self.xobjmap[xobjid] = xobjstrm
        return

--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed

 import six # Python 2+3 compatibility

+# some predefined literals and keywords.
+LITERAL_PAGE = LIT('Page')  # compared by identity (`is`) against a page-tree node's 'Type' entry
+LITERAL_PAGES = LIT('Pages')  # identity checks assume LIT returns one shared object per name - TODO confirm
+
 ##  PDFPage
 ##
 class PDFPage(object):
@ -82,12 +86,12 @@ class PDFPage(object):
            for (k, v) in six.iteritems(parent):
                if k in klass.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v
-            if tree.get('Type').name=='Pages' and 'Kids' in tree:
+            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                logging.info('Pages: Kids=%r' % tree['Kids'])
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
-            elif tree.get('Type').name=='Page':
+            elif tree.get('Type') is LITERAL_PAGE:
                logging.info('Page: %r' % tree)
                yield (objid, tree)
        pages = False
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -128,7 +128,7 @@ def fsplit(pred, objs):
 def drange(v0, v1, d):
    """Returns a discrete range."""
    assert v0 < v1
-    return xrange(int(v0)//d, int(v1+d)//d)
+    return range(int(v0)//d, int(v1+d)//d)  # NOTE(review): on Python 2 range() builds a full list where xrange was lazy


 # get_bound
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -0,0 +1,44 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import unittest, logging, os
+
+import tools.pdf2txt as pdf2txt
+
+path=os.path.dirname(os.path.abspath(__file__))+'/'  # directory containing this test module, with a trailing '/'
+
+def run(datapath,filename,options=None):  # run pdf2txt.main on <path><datapath><filename>.pdf, writing <path><filename>.txt
+    i=path+datapath+filename+'.pdf'  # input PDF, located relative to this test module's directory
+    o=path+filename+'.txt'  # output goes into this test module's directory, not next to the input
+    if options:
+        s='pdf2txt -o%s %s %s'%(o,options,i)  # extra options are placed between -o<output> and the input file
+    else:
+         s='pdf2txt -o%s %s'%(o,i)
+    pdf2txt.main(s.split(' '))  # NOTE(review): splitting on ' ' mis-tokenises argv if the checkout path contains a space
+
+class TestDumpPDF(unittest.TestCase):  # NOTE(review): these tests drive tools.pdf2txt, not dumppdf - confirm the class name
+    
+
+    def test_1(self):  # smoke test: nothing is asserted, it passes if every conversion completes without raising
+        run('../samples/','jo')
+        run('../samples/','simple1')
+        run('../samples/','simple2')
+        run('../samples/','simple3')
+        
+    def test_2(self):  # test_2 .. test_6 each convert one file from samples/nonfree/
+        run('../samples/nonfree/','dmca')
+        
+    def test_3(self):
+        run('../samples/nonfree/','f1040nr')
+
+    def test_4(self):
+        run('../samples/nonfree/','i1040nr')
+        
+    def test_5(self):
+        run('../samples/nonfree/','kampo')
+        
+    def test_6(self):
+        run('../samples/nonfree/','naacl06-shinyama')
+
+if __name__ == '__main__':  # allows running this test module directly as a script
+    unittest.main()
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -28,7 +28,7 @@ def main(argv):
    # debug option
    debug = 0
    # input option
-    password = ''
+    password = b''
    pagenos = set()
    maxpages = 0
    # output option
@ -82,7 +82,7 @@ def main(argv):
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
-        outfp = file(outfile, 'w')
+        outfp = open(outfile, 'wb')
    else:
        outfp = sys.stdout
    if outtype == 'text':
@ -101,7 +101,7 @@ def main(argv):
    else:
        return usage()
    for fname in args:
-        fp = file(fname, 'rb')
+        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,