Philippe Guglielmetti 2017-04-19 12:28:03 +02:00
commit f28ce1ebed
7 changed files with 9915 additions and 46 deletions

View File

@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.
* Webpage: https://github.com/pdfminer/ * Webpage: https://github.com/pdfminer/
* Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/ * Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
* Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)
Features Features
@ -36,14 +35,12 @@ Features
How to Install How to Install
-------------- --------------
* Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six) * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
* Download the source code. * Install
* Unpack it.
* Run `setup.py`:
$ python setup.py install $ pip install pdfminer.six
* Do the following test: * Run the following test:
$ pdf2txt.py samples/simple1.pdf $ pdf2txt.py samples/simple1.pdf
@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
(For details, refer to the html document.) (For details, refer to the html document.)
API Changes
-----------
As of November 2013, a few changes have been made to the PDFMiner API
as it existed prior to October 2013. These are the result of code
restructuring. Here is a list of the changes:
* PDFDocument class is moved to pdfdocument.py.
* PDFDocument class now takes a PDFParser object as an argument.
PDFDocument.set_parser() and PDFParser.set_document() are removed.
* PDFPage class is moved to pdfpage.py
* process_pdf function is implemented as a class method PDFPage.get_pages.
TODO TODO
---- ----
* PEP-8 and PEP-257 conformance. * PEP-8 and PEP-257 conformance.
* Better documentation. * Better documentation.
* Crypt stream filter support.
Related Projects
----------------
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
* <a href="http://pdfbox.apache.org/">pdfbox</a>
* <a href="http://mupdf.com/">mupdf</a>
Terms and Conditions Terms and Conditions

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__version__ = '20170119' __version__ = '20170418'
if __name__ == '__main__': if __name__ == '__main__':
print (__version__) print (__version__)

View File

@ -13,7 +13,17 @@
import sys import sys
import array import array
import six #Python 2+3 compatibility import six #Python 2+3 compatibility
def get_bytes(data):
    """Yield every byte of *data* as an integer in the range 0-255.

    Works identically on Python 2 and Python 3, so no interpreter-version
    branch is needed: ``bytearray`` always iterates as integers, whereas
    iterating ``str``/``bytes`` yields 1-character strings on Python 2 and
    integers on Python 3.

    :param data: a bytes-like object (``bytes``, ``bytearray``,
        ``memoryview``, or a Python 2 ``str``).
    :return: a generator of ``int`` byte values, in order.
    :raises TypeError: if *data* cannot be converted to a ``bytearray``
        (e.g. a text string on Python 3); raised on first iteration.
    """
    # NOTE: bytearray(data) makes one copy of the input; this is the price
    # of getting integer items regardless of the interpreter version.
    for byte in bytearray(data):
        yield byte
## BitParser ## BitParser
## ##
@ -40,10 +50,9 @@ class BitParser(object):
return return
def feedbytes(self, data): def feedbytes(self, data):
for c in data: for byte in get_bytes(data):
b = ord(c)
for m in (128, 64, 32, 16, 8, 4, 2, 1): for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m) self._parse_bit(byte & m)
return return
def _parse_bit(self, x): def _parse_bit(self, x):
@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
return return
def feedbytes(self, data): def feedbytes(self, data):
for c in data: for byte in get_bytes(data):
b = ord(c)
try: try:
for m in (128, 64, 32, 16, 8, 4, 2, 1): for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m) self._parse_bit(byte & m)
except self.ByteSkip: except self.ByteSkip:
self._accept = self._parse_mode self._accept = self._parse_mode
self._state = self.MODE self._state = self.MODE

View File

@ -175,7 +175,7 @@ def stream_value(x):
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('PDFStream required: %r' % x) raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '') return PDFStream({}, b'')
return x return x

9882
samples/contrib/stamp-no.pdf Normal file

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama + Philippe Guglielmetti', author='Yusuke Shinyama + Philippe Guglielmetti',
author_email='pdfminer@goulu.net', author_email='pdfminer@goulu.net',
url='http://github.com/goulu/pdfminer', url='http://github.com/pdfminer/pdfminer',
scripts=[ scripts=[
'tools/pdf2txt.py', 'tools/pdf2txt.py',
'tools/dumppdf.py', 'tools/dumppdf.py',

View File

@ -18,28 +18,34 @@ def run(datapath,filename,options=None):
pdf2txt.main(s.split(' ')[1:]) pdf2txt.main(s.split(' ')[1:])
class TestDumpPDF(): class TestDumpPDF():
def test_1(self): def test_1(self):
run('../samples/','jo') run('../samples/','jo')
run('../samples/','simple1') run('../samples/','simple1')
run('../samples/','simple2') run('../samples/','simple2')
run('../samples/','simple3') run('../samples/','simple3')
def test_2(self): def test_2(self):
run('../samples/nonfree/','dmca') run('../samples/nonfree/','dmca')
def test_3(self): def test_3(self):
run('../samples/nonfree/','f1040nr') run('../samples/nonfree/','f1040nr')
def test_4(self): def test_4(self):
run('../samples/nonfree/','i1040nr') run('../samples/nonfree/','i1040nr')
def test_5(self): def test_5(self):
run('../samples/nonfree/','kampo') run('../samples/nonfree/','kampo')
def test_6(self): def test_6(self):
run('../samples/nonfree/','naacl06-shinyama') run('../samples/nonfree/','naacl06-shinyama')
# This test passes on Windows, but on Linux and Travis-CI it fails with:
# PDFSyntaxError: No /Root object! - Is this really a PDF?
# TODO: Find out why.
"""
def test_7(self):
run('../samples/contrib/','stamp-no')
"""
if __name__ == '__main__': if __name__ == '__main__':
nose.runmodule() nose.runmodule()