Add section to documentation with howto for image extraction (#427)
* Make structure of documentation more clear: tutorials, how-to, topics and reference
* Add howto for images
* Restructure tutorials section, and add install section
* Always use up-to-date version
* Fix indentation warning in docstring
* Add option to dumppdf.py and pdf2txt.py to show version

Fixes #162
pull/442/head
parent
7254530d27
commit
91d89af788
|
@ -12,10 +12,12 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import pdfminer
|
||||||
|
|
||||||
sys.path.insert(0, os.path.join(
|
sys.path.insert(0, os.path.join(
|
||||||
os.path.abspath(os.path.dirname(__file__)), '../../'))
|
os.path.abspath(os.path.dirname(__file__)), '../../'))
|
||||||
|
|
||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
project = 'pdfminer.six'
|
project = 'pdfminer.six'
|
||||||
|
@ -23,7 +25,7 @@ copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
|
||||||
author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
|
author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
|
||||||
|
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = '20191020'
|
release = pdfminer.__version__
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
.. _images:
|
||||||
|
|
||||||
|
How to extract images from a PDF
|
||||||
|
********************************
|
||||||
|
|
||||||
|
Before you start, make sure you have :ref:`installed pdfminer.six<install>`.
|
||||||
|
The second thing you need is a PDF with images. If you don't have one,
|
||||||
|
you can download `this research paper
|
||||||
|
<https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf>`_
|
||||||
|
with images of cats and dogs and save it as `example.pdf`::
|
||||||
|
|
||||||
|
$ curl https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf --output example.pdf
|
||||||
|
|
||||||
|
Then run the :ref:`pdf2txt<api_pdf2txt>` command::
|
||||||
|
|
||||||
|
$ pdf2txt.py example.pdf --output-dir cats-and-dogs
|
||||||
|
|
||||||
|
This command extracts all the images from the PDF and saves them into the
|
||||||
|
`cats-and-dogs` directory.
|
|
@ -0,0 +1,11 @@
|
||||||
|
.. _howto:
|
||||||
|
|
||||||
|
How-to guides
|
||||||
|
*************
|
||||||
|
|
||||||
|
How-to guides help you to solve specific problems with pdfminer.six.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
images
|
|
@ -21,12 +21,23 @@ Check out the source on `github <https://github.com/pdfminer/pdfminer.six>`_.
|
||||||
Content
|
Content
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
This documentation is organized into four sections (according to the `Divio
|
||||||
|
documentation system <https://documentation.divio.com>`_). The
|
||||||
|
:ref:`tutorial` section helps you set up and use pdfminer.six for the first
|
||||||
|
time. Read this section if this is your first time working with pdfminer.six.
|
||||||
|
The :ref:`howto` offers specific recipes for solving common problems.
|
||||||
|
Take a look at the :ref:`topic` if you want more background information on
|
||||||
|
how pdfminer.six works internally. The :ref:`reference` provides
|
||||||
|
detailed API documentation for all the common classes and functions in
|
||||||
|
pdfminer.six.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
tutorials/index
|
tutorial/index
|
||||||
topics/index
|
howto/index
|
||||||
api/index
|
topic/index
|
||||||
|
reference/index
|
||||||
|
|
||||||
|
|
||||||
Features
|
Features
|
||||||
|
@ -53,16 +64,6 @@ Before using it, you must install it using Python 3.4 or newer.
|
||||||
$ pip install pdfminer.six
|
$ pip install pdfminer.six
|
||||||
|
|
||||||
|
|
||||||
Common use-cases
|
|
||||||
----------------
|
|
||||||
|
|
||||||
* :ref:`tutorial_commandline` if you just want to extract text from a pdf once.
|
|
||||||
* :ref:`tutorial_highlevel` if you want to integrate pdfminer.six with your
|
|
||||||
Python code.
|
|
||||||
* :ref:`tutorial_composable` when you want to tailor the behavior of
|
|
||||||
pdfmine.six to your needs.
|
|
||||||
|
|
||||||
|
|
||||||
Contributing
|
Contributing
|
||||||
============
|
============
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
API documentation
|
.. _reference:
|
||||||
*****************
|
|
||||||
|
API Reference
|
||||||
|
*************
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
|
@ -1,5 +1,7 @@
|
||||||
Using pdfminer.six
|
.. _topic:
|
||||||
******************
|
|
||||||
|
Topics
|
||||||
|
******
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
|
@ -1,7 +1,7 @@
|
||||||
.. _tutorial_commandline:
|
.. _tutorial_commandline:
|
||||||
|
|
||||||
Get started with command-line tools
|
Extract text from a PDF using the commandline
|
||||||
***********************************
|
*********************************************
|
||||||
|
|
||||||
pdfminer.six has several tools that can be used from the command line. The
|
pdfminer.six has several tools that can be used from the command line. The
|
||||||
command-line tools are aimed at users that occasionally want to extract text
|
command-line tools are aimed at users that occasionally want to extract text
|
|
@ -1,7 +1,7 @@
|
||||||
.. _tutorial_composable:
|
.. _tutorial_composable:
|
||||||
|
|
||||||
Get started using the composable components API
|
Extract text from a PDF using Python - part 2
|
||||||
***********************************************
|
*********************************************
|
||||||
|
|
||||||
The command line tools and the high-level API are just shortcuts for often
|
The command line tools and the high-level API are just shortcuts for often
|
||||||
used combinations of pdfminer.six components. You can use these components to
|
used combinations of pdfminer.six components. You can use these components to
|
|
@ -5,8 +5,8 @@
|
||||||
|
|
||||||
.. _tutorial_highlevel:
|
.. _tutorial_highlevel:
|
||||||
|
|
||||||
Get started using the high-level functions
|
Extract text from a PDF using Python
|
||||||
******************************************
|
************************************
|
||||||
|
|
||||||
The high-level API can be used to do common tasks.
|
The high-level API can be used to do common tasks.
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
.. _tutorial:
|
||||||
|
|
||||||
|
Tutorials
|
||||||
|
*********
|
||||||
|
|
||||||
|
Tutorials help you get started with specific parts of pdfminer.six.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
install
|
||||||
|
commandline
|
||||||
|
highlevel
|
||||||
|
composable
|
|
@ -0,0 +1,39 @@
|
||||||
|
.. _install:
|
||||||
|
|
||||||
|
Install pdfminer.six as a Python package
|
||||||
|
****************************************
|
||||||
|
|
||||||
|
To use pdfminer.six for the first time, you need to install the Python
|
||||||
|
package in your Python environment.
|
||||||
|
|
||||||
|
This tutorial requires you to have a system with a working Python and pip
|
||||||
|
installation. If you don't have one and don't know how to install it, take a
|
||||||
|
look at `The Hitchhiker's Guide to Python! <https://docs.python-guide.org/>`_.
|
||||||
|
|
||||||
|
Install using pip
|
||||||
|
=================
|
||||||
|
|
||||||
|
Run the following command on the commandline to install pdfminer.six as a
|
||||||
|
Python package::
|
||||||
|
|
||||||
|
pip install pdfminer.six
|
||||||
|
|
||||||
|
|
||||||
|
Test pdfminer.six installation
|
||||||
|
==============================
|
||||||
|
|
||||||
|
You can test the pdfminer.six installation by importing it in Python.
|
||||||
|
|
||||||
|
Open an interactive Python session from the commandline and import pdfminer
|
||||||
|
.six::
|
||||||
|
|
||||||
|
>>> import pdfminer
|
||||||
|
>>> print(pdfminer.__version__) # doctest: +IGNORE_RESULT
|
||||||
|
'<installed version>'
|
||||||
|
|
||||||
|
Now you can use pdfminer.six as a Python package. But pdfminer.six also
|
||||||
|
comes with a couple of useful commandline tools. To test if these tools are
|
||||||
|
correctly installed, run the following on your commandline::
|
||||||
|
|
||||||
|
$ pdf2txt.py --version
|
||||||
|
pdfminer.six v<installed version>
|
|
@ -1,9 +0,0 @@
|
||||||
Getting started
|
|
||||||
***************
|
|
||||||
|
|
||||||
.. toctree::
|
|
||||||
:maxdepth: 2
|
|
||||||
|
|
||||||
commandline
|
|
||||||
highlevel
|
|
||||||
composable
|
|
|
@ -23,7 +23,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
|
|
||||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
Takes loads of optional arguments but the defaults are somewhat sane.
|
||||||
Beware laparams: Including an empty LAParams is not the same as passing
|
Beware laparams: Including an empty LAParams is not the same as passing
|
||||||
None!
|
None!
|
||||||
|
|
||||||
:param inf: a file-like object to read PDF structure from, such as a
|
:param inf: a file-like object to read PDF structure from, such as a
|
||||||
file handler (using the builtin `open()` function) or a `BytesIO`.
|
file handler (using the builtin `open()` function) or a `BytesIO`.
|
||||||
|
|
|
@ -6,6 +6,7 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
import pdfminer
|
||||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
@ -243,6 +244,9 @@ def create_parser():
|
||||||
parser.add_argument('files', type=str, default=None, nargs='+',
|
parser.add_argument('files', type=str, default=None, nargs='+',
|
||||||
help='One or more paths to PDF files.')
|
help='One or more paths to PDF files.')
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--version", "-v", action="version",
|
||||||
|
version="pdfminer.six v{}".format(pdfminer.__version__))
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--debug', '-d', default=False, action='store_true',
|
'--debug', '-d', default=False, action='store_true',
|
||||||
help='Use debug logging level.')
|
help='Use debug logging level.')
|
||||||
|
|
|
@ -64,6 +64,9 @@ def maketheparser():
|
||||||
"files", type=str, default=None, nargs="+",
|
"files", type=str, default=None, nargs="+",
|
||||||
help="One or more paths to PDF files.")
|
help="One or more paths to PDF files.")
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--version", "-v", action="version",
|
||||||
|
version="pdfminer.six v{}".format(pdfminer.__version__))
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--debug", "-d", default=False, action="store_true",
|
"--debug", "-d", default=False, action="store_true",
|
||||||
help="Use debug logging level.")
|
help="Use debug logging level.")
|
||||||
|
|
Loading…
Reference in New Issue