diff --git a/docs/index.html b/docs/index.html index 837b7e1..8037bf8 100644 --- a/docs/index.html +++ b/docs/index.html @@ -82,14 +82,14 @@ http://pdf2html.tabesugi.net:8080/

Download

Source distribution:
- -http://pypi.python.org/pypi/pdfminer/ + +http://pypi.python.org/pypi/pdfminer_six/

github:
- -https://github.com/euske/pdfminer/ + +https://github.com/goulu/pdfminer/

Where to Ask

@@ -100,11 +100,9 @@ https://github.com/euske/pdfminer/ http://groups.google.com/group/pdfminer-users/ -

How to Install

  1. Install Python 2.6 or newer. - (Python 3 is not supported.)
  2. Download the PDFMiner source.
  3. Unpack it.
  4. Run setup.py to install:
    @@ -372,82 +370,10 @@ no stream header is displayed for the ease of saving it to a file.
    Increases the debug level. -

    Changes

    +

    Changes:

      -
    • 2014/03/28: Further bugfixes. -
    • 2014/03/24: Bugfixes and improvements for faulty PDFs.
      -API changes: -
        -
      • PDFDocument.initialize() method is removed and no longer needed. - A password is given as an argument of a PDFDocument constructor. -
      -
    • 2013/11/13: Bugfixes and minor improvements.
      -As of November 2013, there were a few changes made to the PDFMiner API -prior to October 2013. This is the result of code restructuring. Here -is a list of the changes: -
        -
      • PDFDocument class is moved to pdfdocument.py. -
      • PDFDocument class now takes a PDFParser object as an argument. -
      • PDFDocument.set_parser() and PDFParser.set_document() are removed. -
      • PDFPage class is moved to pdfpage.py. -
      • process_pdf function is implemented as PDFPage.get_pages. -
      -
    • 2013/10/22: Sudden resurgence of interest. API changes. -Incorporated a lot of patches and robust handling of broken PDFs. -
    • 2011/05/15: Speed improvements for layout analysis. -
    • 2011/05/15: API changes. LTText.get_text() is added. -
    • 2011/04/20: API changes. LTPolygon class was renamed as LTCurve. -
    • 2011/04/20: LTLine now represents horizontal/vertical lines only. Thanks to Koji Nakagawa. -
    • 2011/03/07: Documentation improvements by Jakub Wilk. Memory usage patch by Jonathan Hunt. -
    • 2011/02/27: Bugfixes and layout analysis improvements. Thanks to fujimoto.report. -
    • 2010/12/26: A couple of bugfixes and minor improvements. Thanks to Kevin Brubeck Unhammer and Daniel Gerber. -
    • 2010/10/17: A couple of bugfixes and minor improvements. Thanks to standardabweichung and Alastair Irving. -
    • 2010/09/07: A minor bugfix. Thanks to Alexander Garden. -
    • 2010/08/29: A couple of bugfixes. Thanks to Sahan Malagi, pk, and Humberto Pereira. -
    • 2010/07/06: Minor bugfixes. Thanks to Federico Brega. -
    • 2010/06/13: Bugfixes and improvements on CMap data compression. Thanks to Jakub Wilk. -
    • 2010/04/24: Bugfixes and improvements on TOC extraction. Thanks to Jose Maria. -
    • 2010/03/26: Bugfixes. Thanks to Brian Berry and Lubos Pintes. -
    • 2010/03/22: Improved layout analysis. Added regression tests. -
    • 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield. -
    • 2010/02/27: Changed the way of internal layout handling. (LTTextItem -> LTChar) -
    • 2010/02/15: Several bugfixes. Thanks to Sean. -
    • 2010/02/13: Bugfix and enhancement. Thanks to André Auzi. -
    • 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe. -
    • 2010/01/31: JPEG image extraction supported. Page rotation bug fixed. -
    • 2010/01/04: Python 2.6 warning removal. More doctest conversion. -
    • 2010/01/01: CMap bug fix. Thanks to Winfried Plappert. -
    • 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger. -
    • 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting. -
    • 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for open-sourcing them. -
    • 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras. -
    • 2009/10/31: SGML output format is changed and renamed as XML. -
    • 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation. -
    • 2009/10/04: Another matrix operation bug fixed. Thanks to Vitaly Sedelnik. -
    • 2009/09/12: Fixed rectangle handling. Able to extract image boundaries. -
    • 2009/08/30: Fixed page rotation handling. -
    • 2009/08/26: Fixed zlib decoding bug. Thanks to Shon Urbas. -
    • 2009/08/24: Fixed a bug in character placing. Thanks to Pawan Jain. -
    • 2009/07/21: Improvement in layout analysis. -
    • 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes. -
    • 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported. -
    • 2009/03/30: Text output mode added. -
    • 2009/03/25: Encoding problems fixed. Word splitting option added. -
    • 2009/02/28: Robust handling of corrupted PDFs. Thanks to Troy Bollinger. -
    • 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe. -
    • 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries. -
    • 2009/01/10: Handling Type3 font metrics correctly. -
    • 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich. -
    • 2008/09/06: A sample pdf2html webapp added. -
    • 2008/08/30: ASCII85 encoding filter support. -
    • 2008/07/27: Tagged contents extraction support. -
    • 2008/07/10: Outline (TOC) extraction support. -
    • 2008/06/29: HTML output added. Reorganized the directory structure. -
    • 2008/04/29: Bugfix for Win32. Thanks to Chris Clark. -
    • 2008/04/27: Basic encryption and LZW decoding support added. -
    • 2008/01/07: Several bugfixes. Thanks to Nick Fabry for his vast contribution. -
    • 2007/12/31: Initial release. -
    • 2004/12/24: Start writing the code out of boredom... +
    • 2014/09/15: pushed to PyPI
    • +
    • 2014/09/10: pdfminer_six forked from pdfminer since Yusuke didn't want to merge and pdfminer3k is outdated

    TODO

    diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index be82f84..43e8acf 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -__version__ = '20140829' +__version__ = '20140915' if __name__ == '__main__': print (__version__) diff --git a/setup.py b/setup.py index ed50fb3..3519d3c 100644 --- a/setup.py +++ b/setup.py @@ -3,10 +3,13 @@ from distutils.core import setup from pdfminer import __version__ setup( - name='pdfminer', + name='pdfminer_six', version=__version__, + packages=['pdfminer',], + package_data={'pdfminer': ['cmap/*.pickle.gz']}, description='PDF parser and analyzer', - long_description='''PDFMiner is a tool for extracting information from PDF documents. + long_description='''fork of PDFMiner using six for Python 2+3 compatibility +PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. PDFMiner allows to obtain the exact location of texts in a page, as well as @@ -15,15 +18,9 @@ It includes a PDF converter that can transform PDF files into other text formats (such as HTML). 
It has an extensible PDF parser that can be used for other purposes instead of text analysis.''', license='MIT/X', - author='Yusuke Shinyama', - author_email='yusuke at cs dot nyu dot edu', - url='http://euske.github.io/pdfminer/index.html', - packages=[ - 'pdfminer', - ], - package_data={ - 'pdfminer': ['cmap/*.pickle.gz'] - }, + author='Yusuke Shinyama + Philippe Guglielmetti', + author_email='pdfminer@goulu.net', + url='http://github.com/goulu/pdfminer', scripts=[ 'tools/pdf2txt.py', 'tools/dumppdf.py', @@ -34,7 +31,7 @@ PDF parser that can be used for other purposes instead of text analysis.''', 'Programming Language :: Python', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.4', - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research',