Philippe Guglielmetti 2017-04-19 12:28:03 +02:00
commit f28ce1ebed
7 changed files with 9915 additions and 46 deletions

View File

@ -16,7 +16,6 @@ PDF parser that can be used for other purposes than text analysis.
* Webpage: https://github.com/pdfminer/
* Download (PyPI): https://pypi.python.org/pypi/pdfminer.six/
* Demo WebApp: http://pdf2html.tabesugi.net:8080/ (broken?)
Features
@ -36,14 +35,12 @@ Features
How to Install
--------------
* Install Python 2.7 or newer. (Python 3.4 is supported in pdfminer.six)
* Download the source code.
* Unpack it.
* Run `setup.py`:
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
* Install
$ python setup.py install
$ pip install pdfminer.six
* Do the following test:
* Run the following test:
$ pdf2txt.py samples/simple1.pdf
@ -76,35 +73,11 @@ but it's also possible to extract some meaningful contents (e.g. images).
(For details, refer to the html document.)
API Changes
-----------
As of November 2013, there were a few changes made to the PDFMiner API
prior to October 2013. This is the result of code restructuring. Here
is a list of the changes:
* PDFDocument class is moved to pdfdocument.py.
* PDFDocument class now takes a PDFParser object as an argument.
PDFDocument.set_parser() and PDFParser.set_document() is removed.
* PDFPage class is moved to pdfpage.py
* process_pdf function is implemented as a class method PDFPage.get_pages.
TODO
----
* PEP-8 and PEP-257 conformance.
* Better documentation.
* Crypt stream filter support.
Related Projects
----------------
* <a href="http://pybrary.net/pyPdf/">pyPdf</a>
* <a href="http://www.foolabs.com/xpdf/">xpdf</a>
* <a href="http://pdfbox.apache.org/">pdfbox</a>
* <a href="http://mupdf.com/">mupdf</a>
Terms and Conditions

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
__version__ = '20170119'
__version__ = '20170418'
if __name__ == '__main__':
print (__version__)

View File

@ -13,7 +13,17 @@
import sys
import array
import six #Python 2+3 compatibility
import six #Python 2+3 compatibility
if six.PY3:
def get_bytes(data):
for byte in data:
yield byte
else:
def get_bytes(data):
for char in data:
yield ord(char)
## BitParser
##
@ -40,10 +50,9 @@ class BitParser(object):
return
def feedbytes(self, data):
for c in data:
b = ord(c)
for byte in get_bytes(data):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
self._parse_bit(byte & m)
return
def _parse_bit(self, x):
@ -328,11 +337,10 @@ class CCITTG4Parser(BitParser):
return
def feedbytes(self, data):
for c in data:
b = ord(c)
for byte in get_bytes(data):
try:
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
self._parse_bit(byte & m)
except self.ByteSkip:
self._accept = self._parse_mode
self._state = self.MODE

View File

@ -175,7 +175,7 @@ def stream_value(x):
if not isinstance(x, PDFStream):
if settings.STRICT:
raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, '')
return PDFStream({}, b'')
return x

9882
samples/contrib/stamp-no.pdf Normal file

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,7 @@ PDF parser that can be used for other purposes instead of text analysis.''',
license='MIT/X',
author='Yusuke Shinyama + Philippe Guglielmetti',
author_email='pdfminer@goulu.net',
url='http://github.com/goulu/pdfminer',
url='http://github.com/pdfminer/pdfminer',
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py',

View File

@ -19,7 +19,6 @@ def run(datapath,filename,options=None):
class TestDumpPDF():
def test_1(self):
run('../samples/','jo')
run('../samples/','simple1')
@ -41,5 +40,12 @@ class TestDumpPDF():
def test_6(self):
run('../samples/nonfree/','naacl06-shinyama')
# this test works on Windows but on Linux & Travis-CI it says
# PDFSyntaxError: No /Root object! - Is this really a PDF?
# TODO: Find why
"""
def test_7(self):
run('../samples/contrib/','stamp-no')
"""
if __name__ == '__main__':
nose.runmodule()