From ac2b20a79a62ea1791cf25295611d9a3100679f2 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Mon, 29 Jun 2020 19:07:05 +0100 Subject: [PATCH] [docs] Add extract_pages tutorial (#442) Closes https://github.com/pdfminer/pdfminer.six/issues/361 --- docs/source/topic/converting_pdf_to_text.rst | 7 ++- docs/source/tutorial/extract_pages.rst | 47 ++++++++++++++++++++ docs/source/tutorial/index.rst | 1 + 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 docs/source/tutorial/extract_pages.rst diff --git a/docs/source/topic/converting_pdf_to_text.rst b/docs/source/topic/converting_pdf_to_text.rst index 1571b86..6b4dc1e 100644 --- a/docs/source/topic/converting_pdf_to_text.rst +++ b/docs/source/topic/converting_pdf_to_text.rst @@ -20,6 +20,9 @@ interactive elements and higher-level application data. A PDF file contains the objects making up a PDF document along with associated structural information, all represented as a single self-contained sequence of bytes. [1]_ + +.. _topic_pdf_to_text_layout: + Layout analysis algorithm ========================= @@ -70,8 +73,8 @@ Spaces need to be inserted between characters because the PDF format has no notion of the space character. A space is inserted if the characters are further apart that the `word_margin` (W in the figure). The `word_margin` is relative to the maximum width or height of the new character. Having a smaller -`word_margin` creates smaller words. Note that the `word_margin` should at -least be smaller than the `char_margin` otherwise none of the characters will +`word_margin` creates smaller words. Note that the `word_margin` should at +least be smaller than the `char_margin` otherwise none of the characters will be separated by a space. The result of this stage is a list of lines. 
Each line consists a list of diff --git a/docs/source/tutorial/extract_pages.rst b/docs/source/tutorial/extract_pages.rst new file mode 100644 index 0000000..f0228f8 --- /dev/null +++ b/docs/source/tutorial/extract_pages.rst @@ -0,0 +1,47 @@ +.. _tutorial_extract_pages: + +Extract elements from a PDF using Python +**************************************** + +The high level functions can be used to achieve common tasks. In this case, +we can use :ref:`api_extract_pages`: + +.. code-block:: python + + from pdfminer.high_level import extract_pages + for page_layout in extract_pages("test.pdf"): + for element in page_layout: + print(element) + + +Each ``element`` will be an ``LTTextBox``, ``LTFigure``, ``LTLine``, ``LTRect`` +or an ``LTImage``. Some of these can be iterated further, for example iterating +through an ``LTTextBox`` will give you an ``LTTextLine``, and these in turn can +be iterated through to get an ``LTChar``. See the diagram here: +:ref:`topic_pdf_to_text_layout`. + +Let's say we want to extract all of the text. We could do: + +.. code-block:: python + + from pdfminer.high_level import extract_pages + from pdfminer.layout import LTTextContainer + for page_layout in extract_pages("test.pdf"): + for element in page_layout: + if isinstance(element, LTTextContainer): + print(element.get_text()) + +Or, we could extract the fontname or size of each individual character: + +.. 
code-block:: python + + from pdfminer.high_level import extract_pages + from pdfminer.layout import LTTextContainer, LTChar + for page_layout in extract_pages("test.pdf"): + for element in page_layout: + if isinstance(element, LTTextContainer): + for text_line in element: + for character in text_line: + if isinstance(character, LTChar): + print(character.fontname) + print(character.size) diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst index 35b6a9b..1e587ed 100644 --- a/docs/source/tutorial/index.rst +++ b/docs/source/tutorial/index.rst @@ -12,3 +12,4 @@ Tutorials help you get started with specific parts of pdfminer.six. commandline highlevel composable + extract_pages