Merge branch 'master' of https://github.com/pdfminer/pdfminer.six.git
commit
f28ce1ebed
35
README.md
35
README.md
|
@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.
|
|||
|
||||
* Webpage: https://github.com/pdfminer/
|
||||
* Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
|
||||
* Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)
|
||||
|
||||
|
||||
Features
|
||||
|
@ -36,14 +35,12 @@ Features
|
|||
How to Install
|
||||
--------------
|
||||
|
||||
* Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six)
|
||||
* Download the source code.
|
||||
* Unpack it.
|
||||
* Run `setup.py`:
|
||||
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
||||
* Install
|
||||
|
||||
$ python setup.py install
|
||||
$ pip install pdfminer.six
|
||||
|
||||
* Do the following test:
|
||||
* Run the following test:
|
||||
|
||||
$ pdf2txt.py samples/simple1.pdf
|
||||
|
||||
|
@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
|
|||
(For details, refer to the html document.)
|
||||
|
||||
|
||||
API Changes
|
||||
-----------
|
||||
|
||||
As of November 2013, there were a few changes made to the PDFMiner API
|
||||
prior to October 2013. This is the result of code restructuring. Here
|
||||
is a list of the changes:
|
||||
|
||||
* PDFDocument class is moved to pdfdocument.py.
|
||||
* PDFDocument class now takes a PDFParser object as an argument.
|
||||
PDFDocument.set_parser() and PDFParser.set_document() is removed.
|
||||
* PDFPage class is moved to pdfpage.py
|
||||
* process_pdf function is implemented as a class method PDFPage.get_pages.
|
||||
|
||||
|
||||
TODO
|
||||
----
|
||||
|
||||
* PEP-8 and PEP-257 conformance.
|
||||
* Better documentation.
|
||||
* Crypt stream filter support.
|
||||
|
||||
|
||||
Related Projects
|
||||
----------------
|
||||
|
||||
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
|
||||
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
||||
* <a href="http://pdfbox.apache.org/">pdfbox</a>
|
||||
* <a href="http://mupdf.com/">mupdf</a>
|
||||
|
||||
|
||||
Terms and Conditions
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
# -*- coding: utf-8 -*-
|
||||
__version__ = '20170119'
|
||||
__version__ = '20170418'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print (__version__)
|
||||
|
|
|
@ -13,7 +13,17 @@
|
|||
import sys
|
||||
import array
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
if six.PY3:
|
||||
def get_bytes(data):
|
||||
for byte in data:
|
||||
yield byte
|
||||
else:
|
||||
def get_bytes(data):
|
||||
for char in data:
|
||||
yield ord(char)
|
||||
|
||||
|
||||
## BitParser
|
||||
##
|
||||
|
@ -40,10 +50,9 @@ class BitParser(object):
|
|||
return
|
||||
|
||||
def feedbytes(self, data):
|
||||
for c in data:
|
||||
b = ord(c)
|
||||
for byte in get_bytes(data):
|
||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||
self._parse_bit(b & m)
|
||||
self._parse_bit(byte & m)
|
||||
return
|
||||
|
||||
def _parse_bit(self, x):
|
||||
|
@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def feedbytes(self, data):
|
||||
for c in data:
|
||||
b = ord(c)
|
||||
for byte in get_bytes(data):
|
||||
try:
|
||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||
self._parse_bit(b & m)
|
||||
self._parse_bit(byte & m)
|
||||
except self.ByteSkip:
|
||||
self._accept = self._parse_mode
|
||||
self._state = self.MODE
|
||||
|
|
|
@ -175,7 +175,7 @@ def stream_value(x):
|
|||
if not isinstance(x, PDFStream):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('PDFStream required: %r' % x)
|
||||
return PDFStream({}, '')
|
||||
return PDFStream({}, b'')
|
||||
return x
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
2
setup.py
2
setup.py
|
@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
|||
license='MIT/X',
|
||||
author='Yusuke Shinyama + Philippe Guglielmetti',
|
||||
author_email='pdfminer@goulu.net',
|
||||
url='http://github.com/goulu/pdfminer',
|
||||
url='http://github.com/pdfminer/pdfminer',
|
||||
scripts=[
|
||||
'tools/pdf2txt.py',
|
||||
'tools/dumppdf.py',
|
||||
|
|
|
@ -18,28 +18,34 @@ def run(datapath,filename,options=None):
|
|||
pdf2txt.main(s.split(' ')[1:])
|
||||
|
||||
class TestDumpPDF():
|
||||
|
||||
|
||||
def test_1(self):
|
||||
run('../samples/','jo')
|
||||
run('../samples/','simple1')
|
||||
run('../samples/','simple2')
|
||||
run('../samples/','simple3')
|
||||
|
||||
|
||||
def test_2(self):
|
||||
run('../samples/nonfree/','dmca')
|
||||
|
||||
|
||||
def test_3(self):
|
||||
run('../samples/nonfree/','f1040nr')
|
||||
|
||||
def test_4(self):
|
||||
run('../samples/nonfree/','i1040nr')
|
||||
|
||||
|
||||
def test_5(self):
|
||||
run('../samples/nonfree/','kampo')
|
||||
|
||||
|
||||
def test_6(self):
|
||||
run('../samples/nonfree/','naacl06-shinyama')
|
||||
|
||||
# this test works on Windows but on Linux & Travis-CI it says
|
||||
# PDFSyntaxError: No /Root object! - Is this really a PDF?
|
||||
# TODO: Find why
|
||||
"""
|
||||
def test_7(self):
|
||||
run('../samples/contrib/','stamp-no')
|
||||
"""
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule()
|
||||
|
|
Loading…
Reference in New Issue