From 91d89af788aaabc6c49b50500c98dc2d1da7f71f Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 17 May 2020 17:48:06 +0200 Subject: [PATCH] Add section to documentation with howto for image extraction (#427) * Make structure of documentation more clear: tutorials, how-to, topics and reference * Add howto for images * Restructure tutorials section, and add install section * Always use up-to-date version * Fix indentation warning in docstring * Add option to dumppdf.py and pdf2txt.py to show version Fixes #162 --- docs/source/conf.py | 6 ++- docs/source/howto/images.rst | 19 +++++++++ docs/source/howto/index.rst | 11 ++++++ docs/source/index.rst | 27 ++++++------- .../source/{api => reference}/commandline.rst | 0 docs/source/{api => reference}/composable.rst | 0 docs/source/{api => reference}/highlevel.rst | 0 docs/source/{api => reference}/index.rst | 6 ++- .../converting_pdf_to_text.rst | 0 docs/source/{topics => topic}/index.rst | 6 ++- .../{tutorials => tutorial}/commandline.rst | 4 +- .../{tutorials => tutorial}/composable.rst | 4 +- .../{tutorials => tutorial}/highlevel.rst | 4 +- docs/source/tutorial/index.rst | 14 +++++++ docs/source/tutorial/install.rst | 39 +++++++++++++++++++ docs/source/tutorials/index.rst | 9 ----- pdfminer/high_level.py | 2 +- tools/dumppdf.py | 4 ++ tools/pdf2txt.py | 3 ++ 19 files changed, 123 insertions(+), 35 deletions(-) create mode 100644 docs/source/howto/images.rst create mode 100644 docs/source/howto/index.rst rename docs/source/{api => reference}/commandline.rst (100%) rename docs/source/{api => reference}/composable.rst (100%) rename docs/source/{api => reference}/highlevel.rst (100%) rename docs/source/{api => reference}/index.rst (63%) rename docs/source/{topics => topic}/converting_pdf_to_text.rst (100%) rename docs/source/{topics => topic}/index.rst (60%) rename docs/source/{tutorials => tutorial}/commandline.rst (90%) rename docs/source/{tutorials => tutorial}/composable.rst (91%) rename docs/source/{tutorials => 
tutorial}/highlevel.rst (95%) create mode 100644 docs/source/tutorial/index.rst create mode 100644 docs/source/tutorial/install.rst delete mode 100644 docs/source/tutorials/index.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index d61957b..fcbf595 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,10 +12,12 @@ import os import sys + +import pdfminer + sys.path.insert(0, os.path.join( os.path.abspath(os.path.dirname(__file__)), '../../')) - # -- Project information ----------------------------------------------------- project = 'pdfminer.six' @@ -23,7 +25,7 @@ copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' # The full version, including alpha/beta/rc tags -release = '20191020' +release = pdfminer.__version__ # -- General configuration --------------------------------------------------- diff --git a/docs/source/howto/images.rst b/docs/source/howto/images.rst new file mode 100644 index 0000000..3b4165d --- /dev/null +++ b/docs/source/howto/images.rst @@ -0,0 +1,19 @@ +.. _images: + +How to extract images from a PDF +******************************** + +Before you start, make sure you have :ref:`installed pdfminer.six <install>`. +The second thing you need is a PDF with images. If you don't have one, +you can download `this research paper +<https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf>`_ +with images of cats and dogs and save it as `example.pdf`:: + + $ curl https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf --output example.pdf + +Then run the :ref:`pdf2txt` command:: + + $ pdf2txt.py example.pdf --output-dir cats-and-dogs + +This command extracts all the images from the PDF and saves them into the +`cats-and-dogs` directory. diff --git a/docs/source/howto/index.rst b/docs/source/howto/index.rst new file mode 100644 index 0000000..b8a758b --- /dev/null +++ b/docs/source/howto/index.rst @@ -0,0 +1,11 @@ +.. 
_howto: + +How-to guides +************* + +How-to guides help you to solve specific problems with pdfminer.six. + +.. toctree:: + :maxdepth: 1 + + images diff --git a/docs/source/index.rst b/docs/source/index.rst index 4e90346..75588f9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,12 +21,23 @@ Check out the source on `github `_. Content ======= +This documentation is organized into four sections (according to the `Divio +documentation system <https://documentation.divio.com/>`_). The +:ref:`tutorial` section helps you set up and use pdfminer.six for the first +time. Read this section if this is your first time working with pdfminer.six. +The :ref:`howto` offers specific recipes for solving common problems. +Take a look at the :ref:`topic` if you want more background information on +how pdfminer.six works internally. The :ref:`reference` provides +detailed API documentation for all the common classes and functions in +pdfminer.six. + .. toctree:: :maxdepth: 2 - tutorials/index - topics/index - api/index + tutorial/index + howto/index + topic/index + reference/index Features @@ -53,16 +64,6 @@ Before using it, you must install it using Python 3.4 or newer. $ pip install pdfminer.six -Common use-cases ----------------- - -* :ref:`tutorial_commandline` if you just want to extract text from a pdf once. -* :ref:`tutorial_highlevel` if you want to integrate pdfminer.six with your - Python code. -* :ref:`tutorial_composable` when you want to tailor the behavior of - pdfmine.six to your needs. 
- - Contributing ============ diff --git a/docs/source/api/commandline.rst b/docs/source/reference/commandline.rst similarity index 100% rename from docs/source/api/commandline.rst rename to docs/source/reference/commandline.rst diff --git a/docs/source/api/composable.rst b/docs/source/reference/composable.rst similarity index 100% rename from docs/source/api/composable.rst rename to docs/source/reference/composable.rst diff --git a/docs/source/api/highlevel.rst b/docs/source/reference/highlevel.rst similarity index 100% rename from docs/source/api/highlevel.rst rename to docs/source/reference/highlevel.rst diff --git a/docs/source/api/index.rst b/docs/source/reference/index.rst similarity index 63% rename from docs/source/api/index.rst rename to docs/source/reference/index.rst index 047ed2d..507796f 100644 --- a/docs/source/api/index.rst +++ b/docs/source/reference/index.rst @@ -1,5 +1,7 @@ -API documentation -***************** +.. _reference: + +API Reference +************* .. toctree:: :maxdepth: 2 diff --git a/docs/source/topics/converting_pdf_to_text.rst b/docs/source/topic/converting_pdf_to_text.rst similarity index 100% rename from docs/source/topics/converting_pdf_to_text.rst rename to docs/source/topic/converting_pdf_to_text.rst diff --git a/docs/source/topics/index.rst b/docs/source/topic/index.rst similarity index 60% rename from docs/source/topics/index.rst rename to docs/source/topic/index.rst index 30c000a..547b4aa 100644 --- a/docs/source/topics/index.rst +++ b/docs/source/topic/index.rst @@ -1,5 +1,7 @@ -Using pdfminer.six -****************** +.. _topic: + +Topics +****** .. toctree:: :maxdepth: 2 diff --git a/docs/source/tutorials/commandline.rst b/docs/source/tutorial/commandline.rst similarity index 90% rename from docs/source/tutorials/commandline.rst rename to docs/source/tutorial/commandline.rst index 2477c65..5aa352d 100644 --- a/docs/source/tutorials/commandline.rst +++ b/docs/source/tutorial/commandline.rst @@ -1,7 +1,7 @@ .. 
_tutorial_commandline: -Get started with command-line tools -*********************************** +Extract text from a PDF using the commandline +********************************************* pdfminer.six has several tools that can be used from the command line. The command-line tools are aimed at users that occasionally want to extract text diff --git a/docs/source/tutorials/composable.rst b/docs/source/tutorial/composable.rst similarity index 91% rename from docs/source/tutorials/composable.rst rename to docs/source/tutorial/composable.rst index 971332b..5901e3e 100644 --- a/docs/source/tutorials/composable.rst +++ b/docs/source/tutorial/composable.rst @@ -1,7 +1,7 @@ .. _tutorial_composable: -Get started using the composable components API -*********************************************** +Extract text from a PDF using Python - part 2 +********************************************* The command line tools and the high-level API are just shortcuts for often used combinations of pdfminer.six components. You can use these components to diff --git a/docs/source/tutorials/highlevel.rst b/docs/source/tutorial/highlevel.rst similarity index 95% rename from docs/source/tutorials/highlevel.rst rename to docs/source/tutorial/highlevel.rst index d55328d..ffca472 100644 --- a/docs/source/tutorials/highlevel.rst +++ b/docs/source/tutorial/highlevel.rst @@ -5,8 +5,8 @@ .. _tutorial_highlevel: -Get started using the high-level functions -****************************************** +Extract text from a PDF using Python +************************************ The high-level API can be used to do common tasks. diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst new file mode 100644 index 0000000..35b6a9b --- /dev/null +++ b/docs/source/tutorial/index.rst @@ -0,0 +1,14 @@ +.. _tutorial: + +Tutorials +********* + +Tutorials help you get started with specific parts of pdfminer.six. + +.. 
toctree:: + :maxdepth: 1 + + install + commandline + highlevel + composable diff --git a/docs/source/tutorial/install.rst b/docs/source/tutorial/install.rst new file mode 100644 index 0000000..a1670eb --- /dev/null +++ b/docs/source/tutorial/install.rst @@ -0,0 +1,39 @@ +.. _install: + +Install pdfminer.six as a Python package +**************************************** + +To use pdfminer.six for the first time, you need to install the Python +package in your Python environment. + +This tutorial requires you to have a system with a working Python and pip +installation. If you don't have one and don't know how to install it, take a +look at `The Hitchhiker's Guide to Python! <https://docs.python-guide.org/>`_. + +Install using pip +================= + +Run the following command on the commandline to install pdfminer.six as a +Python package:: + + pip install pdfminer.six + + +Test pdfminer.six installation +============================== + +You can test the pdfminer.six installation by importing it in Python. + +Open an interactive Python session from the commandline and import +pdfminer.six:: + + >>> import pdfminer + >>> print(pdfminer.__version__) # doctest: +IGNORE_RESULT + '' + +Now you can use pdfminer.six as a Python package. But pdfminer.six also +comes with a couple of useful commandline tools. To test if these tools are +correctly installed, run the following on your commandline:: + + $ pdf2txt.py --version + pdfminer.six diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst deleted file mode 100644 index f1b5b17..0000000 --- a/docs/source/tutorials/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Getting started -*************** - -.. 
toctree:: - :maxdepth: 2 - - commandline - highlevel - composable diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 3acad77..1686c46 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -23,7 +23,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', Takes loads of optional arguments but the defaults are somewhat sane. Beware laparams: Including an empty LAParams is not the same as passing - None! + None! :param inf: a file-like object to read PDF structure from, such as a file handler (using the builtin `open()` function) or a `BytesIO`. diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 3136c70..7e2808c 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -6,6 +6,7 @@ import re import sys from argparse import ArgumentParser +import pdfminer from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser @@ -243,6 +244,9 @@ def create_parser(): parser.add_argument('files', type=str, default=None, nargs='+', help='One or more paths to PDF files.') + parser.add_argument( + "--version", "-v", action="version", + version="pdfminer.six v{}".format(pdfminer.__version__)) parser.add_argument( '--debug', '-d', default=False, action='store_true', help='Use debug logging level.') diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 0637ec4..c9fb4cd 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -64,6 +64,9 @@ def maketheparser(): "files", type=str, default=None, nargs="+", help="One or more paths to PDF files.") + parser.add_argument( + "--version", "-v", action="version", + version="pdfminer.six v{}".format(pdfminer.__version__)) parser.add_argument( "--debug", "-d", default=False, action="store_true", help="Use debug logging level.")