diff --git a/CHANGELOG.md b/CHANGELOG.md index c79674c..2e30bb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653)) - Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682)) - Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684)) +- Ignore empty characters when analyzing layout ([#499](https://github.com/pdfminer/pdfminer.six/pull/499)) ### Changed - Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673)) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 3b84ce6..9196a88 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -504,6 +504,9 @@ class LTTextLine(LTTextContainer[TextLineElement]): ) -> List["LTTextLine"]: raise NotImplementedError + def is_empty(self) -> bool: + return super().is_empty() or self.get_text().isspace() + class LTTextLineHorizontal(LTTextLine): def __init__(self, word_margin: float) -> None: diff --git a/samples/contrib/issue-449-horizontal.pdf b/samples/contrib/issue-449-horizontal.pdf new file mode 100644 index 0000000..33e08a7 Binary files /dev/null and b/samples/contrib/issue-449-horizontal.pdf differ diff --git a/samples/contrib/issue-449-vertical.pdf b/samples/contrib/issue-449-vertical.pdf new file mode 100644 index 0000000..bc6a76c Binary files /dev/null and b/samples/contrib/issue-449-vertical.pdf differ diff --git a/tests/test_layout.py b/tests/test_layout.py index ce13fc9..fd393a4 100644 --- a/tests/test_layout.py +++ b/tests/test_layout.py @@ -1,12 +1,16 @@ import unittest +from pdfminer.high_level import extract_pages from 
pdfminer.layout import ( LTLayoutContainer, LAParams, LTTextLineHorizontal, LTTextLineVertical, + LTTextBoxHorizontal, + LTTextBoxVertical, ) from pdfminer.utils import Plane +from helpers import absolute_sample_path class TestGroupTextLines(unittest.TestCase): @@ -107,3 +111,38 @@ class TestFindNeigbors(unittest.TestCase): centrally_aligned_overlapping, ], ) + + +def test_pdf_with_empty_characters_horizontal(): + """Regression test for issue #449 + + See: https://github.com/pdfminer/pdfminer.six/pull/689 + + The page aggregator should separate the 3 horizontal lines in the + sample PDF. The used PDF sample has multiple explicit space characters + in between lines with text. + """ + path = absolute_sample_path("contrib/issue-449-horizontal.pdf") + pages = extract_pages(path) + textboxes = [ + textbox for textbox in next(pages) if isinstance(textbox, LTTextBoxHorizontal) + ] + assert len(textboxes) == 3 + + +def test_pdf_with_empty_characters_vertical(): + """Regression test for issue #449 + + See: https://github.com/pdfminer/pdfminer.six/pull/689 + + The page aggregator should separate the 3 vertical lines in the + sample PDF. The used PDF sample has multiple explicit space characters + in between lines with text. + """ + path = absolute_sample_path("contrib/issue-449-vertical.pdf") + laparams = LAParams(detect_vertical=True) + pages = extract_pages(path, laparams=laparams) + textboxes = [ + textbox for textbox in next(pages) if isinstance(textbox, LTTextBoxVertical) + ] + assert len(textboxes) == 3