Merge branch 'master' of https://github.com/pdfminer/pdfminer.six.git

2017-04-19 12:28:03 +02:00 · 2017-04-19 12:28:03 +02:00 · f28ce1ebed
parent 3427dcaf20 11a4c8b6c1
commit f28ce1ebed
7 changed files with 9915 additions and 46 deletions
--- a/README.md
+++ b/README.md
@@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.

 * Webpage: https://github.com/pdfminer/
 * Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
- * Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)


 Features
@@ -36,14 +35,12 @@ Features
 How to Install
 --------------

- * Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six)
- * Download the source code.
- * Unpack it.
- * Run `setup.py`:
+ * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
+ * Install

-    $ python setup.py install
+    $ pip install pdfminer.six

- * Do the following test:
+ * Run the following test:

    $ pdf2txt.py samples/simple1.pdf

@@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
 (For details, refer to the html document.)


-API Changes
-----------
-
-As of November 2013, there were a few changes made to the PDFMiner API
-prior to October 2013. This is the result of code restructuring.  Here
-is a list of the changes:
-
- * PDFDocument class is moved to pdfdocument.py.
- * PDFDocument class now takes a PDFParser object as an argument.
-   PDFDocument.set_parser() and PDFParser.set_document() is removed.
- * PDFPage class is moved to pdfpage.py
- * process_pdf function is implemented as a class method PDFPage.get_pages.
-
-
 TODO
 ----

 * PEP-8 and PEP-257 conformance.
 * Better documentation.
- * Crypt stream filter support.
-
-
-Related Projects
----------------
-
- * <a href="http://pybrary.net/pyPdf/">pyPdf</a>
- * <a href="http://www.foolabs.com/xpdf/">xpdf</a>
- * <a href="http://pdfbox.apache.org/">pdfbox</a>
- * <a href="http://mupdf.com/">mupdf</a>


 Terms and Conditions
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@@ -1,6 +1,6 @@

 # -*- coding: utf-8 -*-
-__version__ = '20170119'
+__version__ = '20170418'

 if __name__ == '__main__':
    print (__version__)
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@@ -13,7 +13,17 @@
 import sys
 import array

-import six #Python 2+3 compatibility
+import six  #Python 2+3 compatibility
+
+if six.PY3:
+    def get_bytes(data):
+        for byte in data:
+            yield byte
+else:
+    def get_bytes(data):
+        for char in data:
+            yield ord(char)
+

 ##  BitParser
 ##
@@ -40,10 +50,9 @@ class BitParser(object):
        return

    def feedbytes(self, data):
-        for c in data:
-            b = ord(c)
+        for byte in get_bytes(data):
            for m in (128, 64, 32, 16, 8, 4, 2, 1):
-                self._parse_bit(b & m)
+                self._parse_bit(byte & m)
        return

    def _parse_bit(self, x):
@@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
        return

    def feedbytes(self, data):
-        for c in data:
-            b = ord(c)
+        for byte in get_bytes(data):
            try:
                for m in (128, 64, 32, 16, 8, 4, 2, 1):
-                    self._parse_bit(b & m)
+                    self._parse_bit(byte & m)
            except self.ByteSkip:
                self._accept = self._parse_mode
                self._state = self.MODE
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -175,7 +175,7 @@ def stream_value(x):
    if not isinstance(x, PDFStream):
        if settings.STRICT:
            raise PDFTypeError('PDFStream required: %r' % x)
-        return PDFStream({}, '')
+        return PDFStream({}, b'')
    return x


--- a/samples/contrib/stamp-no.pdf
+++ b/samples/contrib/stamp-no.pdf
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
    license='MIT/X',
    author='Yusuke Shinyama + Philippe Guglielmetti',
    author_email='pdfminer@goulu.net',
-    url='http://github.com/goulu/pdfminer',
+    url='http://github.com/pdfminer/pdfminer',
    scripts=[
    'tools/pdf2txt.py',
    'tools/dumppdf.py',
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -18,28 +18,34 @@ def run(datapath,filename,options=None):
    pdf2txt.main(s.split(' ')[1:])

 class TestDumpPDF():
-    

    def test_1(self):
        run('../samples/','jo')
        run('../samples/','simple1')
        run('../samples/','simple2')
        run('../samples/','simple3')
-        
+
    def test_2(self):
        run('../samples/nonfree/','dmca')
-        
+
    def test_3(self):
        run('../samples/nonfree/','f1040nr')

    def test_4(self):
        run('../samples/nonfree/','i1040nr')
-        
+
    def test_5(self):
        run('../samples/nonfree/','kampo')
-        
+
    def test_6(self):
        run('../samples/nonfree/','naacl06-shinyama')

+    # this test works on Windows but on Linux & Travis-CI it says
+    # PDFSyntaxError: No /Root object! - Is this really a PDF?
+    # TODO: Find why
+    """
+    def test_7(self):
+        run('../samples/contrib/','stamp-no')
+    """
 if __name__ == '__main__':
    nose.runmodule()