Ignore empty characters when analyzing layout (#689)

* Adding in checks for spurious lines that contain either only spaces or new line characters * Added spurious lines check and unit tests * Updated CHANGELOG.md with changes * Simplify code * Simplify code * Simplify code * Remove changes to lines that are not actually changed * Format import * Improve CHANGELOG.md * Improve CHANGELOG.md * Fix cicd * Blacken Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2022-02-22 15:20:26 -05:00 · 2022-02-22 15:20:26 -05:00 · 43c8fc8557
parent 121235e24b
commit 43c8fc8557
5 changed files with 43 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
 - Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
 - Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
 - Ignore empty characters when analyzing layout ([#499](https://github.com/pdfminer/pdfminer.six/pull/499))
 ### Changed
 - Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -504,6 +504,9 @@ class LTTextLine(LTTextContainer[TextLineElement]):
    ) -> List["LTTextLine"]:
        raise NotImplementedError
    def is_empty(self) -> bool:
        """Report whether this text line should be treated as empty.

        A line counts as empty when the parent class already considers it
        empty, or when its text consists solely of whitespace characters.
        """
        if super().is_empty():
            return True
        return self.get_text().isspace()
 class LTTextLineHorizontal(LTTextLine):
    def __init__(self, word_margin: float) -> None:
--- a/samples/contrib/issue-449-horizontal.pdf
+++ b/samples/contrib/issue-449-horizontal.pdf
--- a/samples/contrib/issue-449-vertical.pdf
+++ b/samples/contrib/issue-449-vertical.pdf
--- a/tests/test_layout.py
+++ b/tests/test_layout.py
@ -1,12 +1,16 @@
 import unittest
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import (
    LTLayoutContainer,
    LAParams,
    LTTextLineHorizontal,
    LTTextLineVertical,
    LTTextBoxHorizontal,
    LTTextBoxVertical,
 )
 from pdfminer.utils import Plane
 from helpers import absolute_sample_path
 class TestGroupTextLines(unittest.TestCase):
@ -107,3 +111,38 @@ class TestFindNeigbors(unittest.TestCase):
                centrally_aligned_overlapping,
            ],
        )
 def test_pdf_with_empty_characters_horizontal():
    """Regression test for issue #449.

    See: https://github.com/pdfminer/pdfminer.six/pull/689

    The sample PDF has multiple explicit space characters in between its
    lines of text. The first page is expected to yield exactly 3
    LTTextBoxHorizontal elements, one per line of text.
    """
    sample = absolute_sample_path("contrib/issue-449-horizontal.pdf")
    first_page = next(extract_pages(sample))
    horizontal_boxes = []
    for element in first_page:
        if isinstance(element, LTTextBoxHorizontal):
            horizontal_boxes.append(element)
    assert len(horizontal_boxes) == 3
 def test_pdf_with_empty_characters_vertical():
    """Regression test for issue #449.

    See: https://github.com/pdfminer/pdfminer.six/pull/689

    The sample PDF has multiple explicit space characters in between its
    lines of text. With vertical detection enabled, the first page is
    expected to yield exactly 3 LTTextBoxVertical elements.
    """
    sample = absolute_sample_path("contrib/issue-449-vertical.pdf")
    params = LAParams(detect_vertical=True)
    first_page = next(extract_pages(sample, laparams=params))
    vertical_boxes = []
    for element in first_page:
        if isinstance(element, LTTextBoxVertical):
            vertical_boxes.append(element)
    assert len(vertical_boxes) == 3