Merge branch 'master' of https://github.com/pdfminer/pdfminer.six.git
commit
f28ce1ebed
35
README.md
35
README.md
|
@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.
|
||||||
|
|
||||||
* Webpage: https://github.com/pdfminer/
|
* Webpage: https://github.com/pdfminer/
|
||||||
* Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
|
* Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
|
||||||
* Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)
|
|
||||||
|
|
||||||
|
|
||||||
Features
|
Features
|
||||||
|
@ -36,14 +35,12 @@ Features
|
||||||
How to Install
|
How to Install
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
* Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six)
|
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
||||||
* Download the source code.
|
* Install
|
||||||
* Unpack it.
|
|
||||||
* Run `setup.py`:
|
|
||||||
|
|
||||||
$ python setup.py install
|
$ pip install pdfminer.six
|
||||||
|
|
||||||
* Do the following test:
|
* Run the following test:
|
||||||
|
|
||||||
$ pdf2txt.py samples/simple1.pdf
|
$ pdf2txt.py samples/simple1.pdf
|
||||||
|
|
||||||
|
@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
|
||||||
(For details, refer to the html document.)
|
(For details, refer to the html document.)
|
||||||
|
|
||||||
|
|
||||||
API Changes
|
|
||||||
-----------
|
|
||||||
|
|
||||||
As of November 2013, there were a few changes made to the PDFMiner API
|
|
||||||
prior to October 2013. This is the result of code restructuring. Here
|
|
||||||
is a list of the changes:
|
|
||||||
|
|
||||||
* PDFDocument class is moved to pdfdocument.py.
|
|
||||||
* PDFDocument class now takes a PDFParser object as an argument.
|
|
||||||
PDFDocument.set_parser() and PDFParser.set_document() is removed.
|
|
||||||
* PDFPage class is moved to pdfpage.py
|
|
||||||
* process_pdf function is implemented as a class method PDFPage.get_pages.
|
|
||||||
|
|
||||||
|
|
||||||
TODO
|
TODO
|
||||||
----
|
----
|
||||||
|
|
||||||
* PEP-8 and PEP-257 conformance.
|
* PEP-8 and PEP-257 conformance.
|
||||||
* Better documentation.
|
* Better documentation.
|
||||||
* Crypt stream filter support.
|
|
||||||
|
|
||||||
|
|
||||||
Related Projects
|
|
||||||
----------------
|
|
||||||
|
|
||||||
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
|
|
||||||
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
|
||||||
* <a href="http://pdfbox.apache.org/">pdfbox</a>
|
|
||||||
* <a href="http://mupdf.com/">mupdf</a>
|
|
||||||
|
|
||||||
|
|
||||||
Terms and Conditions
|
Terms and Conditions
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
__version__ = '20170119'
|
__version__ = '20170418'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print (__version__)
|
print (__version__)
|
||||||
|
|
|
@ -13,7 +13,17 @@
|
||||||
import sys
|
import sys
|
||||||
import array
|
import array
|
||||||
|
|
||||||
import six #Python 2+3 compatibility
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
|
if six.PY3:
|
||||||
|
def get_bytes(data):
|
||||||
|
for byte in data:
|
||||||
|
yield byte
|
||||||
|
else:
|
||||||
|
def get_bytes(data):
|
||||||
|
for char in data:
|
||||||
|
yield ord(char)
|
||||||
|
|
||||||
|
|
||||||
## BitParser
|
## BitParser
|
||||||
##
|
##
|
||||||
|
@ -40,10 +50,9 @@ class BitParser(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def feedbytes(self, data):
|
def feedbytes(self, data):
|
||||||
for c in data:
|
for byte in get_bytes(data):
|
||||||
b = ord(c)
|
|
||||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
self._parse_bit(b & m)
|
self._parse_bit(byte & m)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_bit(self, x):
|
def _parse_bit(self, x):
|
||||||
|
@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def feedbytes(self, data):
|
def feedbytes(self, data):
|
||||||
for c in data:
|
for byte in get_bytes(data):
|
||||||
b = ord(c)
|
|
||||||
try:
|
try:
|
||||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
self._parse_bit(b & m)
|
self._parse_bit(byte & m)
|
||||||
except self.ByteSkip:
|
except self.ByteSkip:
|
||||||
self._accept = self._parse_mode
|
self._accept = self._parse_mode
|
||||||
self._state = self.MODE
|
self._state = self.MODE
|
||||||
|
|
|
@ -175,7 +175,7 @@ def stream_value(x):
|
||||||
if not isinstance(x, PDFStream):
|
if not isinstance(x, PDFStream):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFTypeError('PDFStream required: %r' % x)
|
raise PDFTypeError('PDFStream required: %r' % x)
|
||||||
return PDFStream({}, '')
|
return PDFStream({}, b'')
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
2
setup.py
2
setup.py
|
@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
author='Yusuke Shinyama + Philippe Guglielmetti',
|
author='Yusuke Shinyama + Philippe Guglielmetti',
|
||||||
author_email='pdfminer@goulu.net',
|
author_email='pdfminer@goulu.net',
|
||||||
url='http://github.com/goulu/pdfminer',
|
url='http://github.com/pdfminer/pdfminer',
|
||||||
scripts=[
|
scripts=[
|
||||||
'tools/pdf2txt.py',
|
'tools/pdf2txt.py',
|
||||||
'tools/dumppdf.py',
|
'tools/dumppdf.py',
|
||||||
|
|
|
@ -18,28 +18,34 @@ def run(datapath,filename,options=None):
|
||||||
pdf2txt.main(s.split(' ')[1:])
|
pdf2txt.main(s.split(' ')[1:])
|
||||||
|
|
||||||
class TestDumpPDF():
|
class TestDumpPDF():
|
||||||
|
|
||||||
|
|
||||||
def test_1(self):
|
def test_1(self):
|
||||||
run('../samples/','jo')
|
run('../samples/','jo')
|
||||||
run('../samples/','simple1')
|
run('../samples/','simple1')
|
||||||
run('../samples/','simple2')
|
run('../samples/','simple2')
|
||||||
run('../samples/','simple3')
|
run('../samples/','simple3')
|
||||||
|
|
||||||
def test_2(self):
|
def test_2(self):
|
||||||
run('../samples/nonfree/','dmca')
|
run('../samples/nonfree/','dmca')
|
||||||
|
|
||||||
def test_3(self):
|
def test_3(self):
|
||||||
run('../samples/nonfree/','f1040nr')
|
run('../samples/nonfree/','f1040nr')
|
||||||
|
|
||||||
def test_4(self):
|
def test_4(self):
|
||||||
run('../samples/nonfree/','i1040nr')
|
run('../samples/nonfree/','i1040nr')
|
||||||
|
|
||||||
def test_5(self):
|
def test_5(self):
|
||||||
run('../samples/nonfree/','kampo')
|
run('../samples/nonfree/','kampo')
|
||||||
|
|
||||||
def test_6(self):
|
def test_6(self):
|
||||||
run('../samples/nonfree/','naacl06-shinyama')
|
run('../samples/nonfree/','naacl06-shinyama')
|
||||||
|
|
||||||
|
# this test works on Windows but on Linux & Travis-CI it says
|
||||||
|
# PDFSyntaxError: No /Root object! - Is this really a PDF?
|
||||||
|
# TODO: Find why
|
||||||
|
"""
|
||||||
|
def test_7(self):
|
||||||
|
run('../samples/contrib/','stamp-no')
|
||||||
|
"""
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
nose.runmodule()
|
nose.runmodule()
|
||||||
|
|
Loading…
Reference in New Issue