Merge branch 'master' of https://github.com/pdfminer/pdfminer.six.git

2017-04-19 12:28:03 +02:00 · 2017-04-19 12:28:03 +02:00 · f28ce1ebed
parent 3427dcaf20 11a4c8b6c1
commit f28ce1ebed
7 changed files with 9915 additions and 46 deletions
--- a/README.md
+++ b/README.md
@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.
 * Webpage: https://github.com/pdfminer/
 * Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
 * Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)
 Features
@ -36,14 +35,12 @@ Features
 How to Install
 --------------
- * Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six)
+ * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
- * Download the source code.
+ * Install
 * Unpack it.
 * Run `setup.py`:
-    $ python setup.py install
+    $ pip install pdfminer.six
- * Do the following test:
+ * Run the following test:
    $ pdf2txt.py samples/simple1.pdf
@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
 (For details, refer to the html document.)
 API Changes
 -----------
 As of November 2013, there were a few changes made to the PDFMiner API
 prior to October 2013. This is the result of code restructuring.  Here
 is a list of the changes:
 * PDFDocument class is moved to pdfdocument.py.
 * PDFDocument class now takes a PDFParser object as an argument.
   PDFDocument.set_parser() and PDFParser.set_document() is removed.
 * PDFPage class is moved to pdfpage.py
 * process_pdf function is implemented as a class method PDFPage.get_pages.
 TODO
 ----
 * PEP-8 and PEP-257 conformance.
 * Better documentation.
 * Crypt stream filter support.
 Related Projects
 ----------------
 * <a href="http://pybrary.net/pyPdf/">pyPdf</a>
 * <a href="http://www.foolabs.com/xpdf/">xpdf</a>
 * <a href="http://pdfbox.apache.org/">pdfbox</a>
 * <a href="http://mupdf.com/">mupdf</a>
 Terms and Conditions
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-__version__ = '20170119'
+__version__ = '20170418'
 if __name__ == '__main__':
    print (__version__)
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@ -13,7 +13,17 @@
 import sys
 import array
-import six #Python 2+3 compatibility
+import six  #Python 2+3 compatibility
+if six.PY3:
+    def get_bytes(data):
+        for byte in data:
+            yield byte
+else:
+    def get_bytes(data):
+        for char in data:
+            yield ord(char)
 ##  BitParser
 ##
@ -40,10 +50,9 @@ class BitParser(object):
        return
    def feedbytes(self, data):
-        for c in data:
+        for byte in get_bytes(data):
-            b = ord(c)
            for m in (128, 64, 32, 16, 8, 4, 2, 1):
-                self._parse_bit(b & m)
+                self._parse_bit(byte & m)
        return
    def _parse_bit(self, x):
@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
        return
    def feedbytes(self, data):
-        for c in data:
+        for byte in get_bytes(data):
-            b = ord(c)
            try:
                for m in (128, 64, 32, 16, 8, 4, 2, 1):
-                    self._parse_bit(b & m)
+                    self._parse_bit(byte & m)
            except self.ByteSkip:
                self._accept = self._parse_mode
                self._state = self.MODE
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@ -175,7 +175,7 @@ def stream_value(x):
    if not isinstance(x, PDFStream):
        if settings.STRICT:
            raise PDFTypeError('PDFStream required: %r' % x)
-        return PDFStream({}, '')
+        return PDFStream({}, b'')
    return x
--- a/samples/contrib/stamp-no.pdf
+++ b/samples/contrib/stamp-no.pdf
--- a/setup.py
+++ b/setup.py
@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
    license='MIT/X',
    author='Yusuke Shinyama + Philippe Guglielmetti',
    author_email='pdfminer@goulu.net',
-    url='http://github.com/goulu/pdfminer',
+    url='http://github.com/pdfminer/pdfminer',
    scripts=[
    'tools/pdf2txt.py',
    'tools/dumppdf.py',
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -18,28 +18,34 @@ def run(datapath,filename,options=None):
    pdf2txt.main(s.split(' ')[1:])
 class TestDumpPDF():
    def test_1(self):
        run('../samples/','jo')
        run('../samples/','simple1')
        run('../samples/','simple2')
        run('../samples/','simple3')
-        
+
    def test_2(self):
        run('../samples/nonfree/','dmca')
-        
+
    def test_3(self):
        run('../samples/nonfree/','f1040nr')
    def test_4(self):
        run('../samples/nonfree/','i1040nr')
-        
+
    def test_5(self):
        run('../samples/nonfree/','kampo')
-        
+
    def test_6(self):
        run('../samples/nonfree/','naacl06-shinyama')
    # this test works on Windows but on Linux & Travis-CI it says
    # PDFSyntaxError: No /Root object! - Is this really a PDF?
    # TODO: Find why
    """
    def test_7(self):
        run('../samples/contrib/','stamp-no')
    """
 if __name__ == '__main__':
    nose.runmodule()