Ignore empty characters when analyzing layout (#689)
* Adding in checks for spurious lines that contain either only spaces or new line characters * Added spurious lines check and unit tests * Updated CHANGELOG.md with changes * Simplify code * Simplify code * Simplify code * Remove changes to lines that are not actually changed * Format import * Improve CHANGELOG.md * Improve CHANGELOG.md * Fix cicd * Blacken Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/730/head
parent
121235e24b
commit
43c8fc8557
|
@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
|
||||
- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
|
||||
- Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
|
||||
- Ignore empty characters when analyzing layout ([#689](https://github.com/pdfminer/pdfminer.six/pull/689))
|
||||
|
||||
### Changed
|
||||
- Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
|
||||
|
|
|
@ -504,6 +504,9 @@ class LTTextLine(LTTextContainer[TextLineElement]):
|
|||
) -> List["LTTextLine"]:
|
||||
raise NotImplementedError
|
||||
|
||||
def is_empty(self) -> bool:
    """Report whether this text line should be treated as empty.

    The line counts as empty when the parent class already considers it
    empty, or when its text is made up exclusively of whitespace
    characters (``str.isspace()`` on the result of ``get_text()``).
    """
    parent_verdict = super().is_empty()
    if parent_verdict:
        # Short-circuit exactly like ``or``: the text is not inspected.
        return parent_verdict
    return self.get_text().isspace()
|
||||
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
|
||||
def __init__(self, word_margin: float) -> None:
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -1,12 +1,16 @@
|
|||
import unittest
|
||||
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import (
|
||||
LTLayoutContainer,
|
||||
LAParams,
|
||||
LTTextLineHorizontal,
|
||||
LTTextLineVertical,
|
||||
LTTextBoxHorizontal,
|
||||
LTTextBoxVertical,
|
||||
)
|
||||
from pdfminer.utils import Plane
|
||||
from helpers import absolute_sample_path
|
||||
|
||||
|
||||
class TestGroupTextLines(unittest.TestCase):
|
||||
|
@ -107,3 +111,38 @@ class TestFindNeigbors(unittest.TestCase):
|
|||
centrally_aligned_overlapping,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_pdf_with_empty_characters_horizontal():
    """Regression test for issue #449 (horizontal text).

    See: https://github.com/pdfminer/pdfminer.six/pull/689

    The sample PDF has multiple explicit space characters in between its
    lines of text. The first page is expected to yield exactly three
    horizontal text boxes, i.e. the three lines stay separated.
    """
    sample_pdf = absolute_sample_path("contrib/issue-449-horizontal.pdf")
    # Only the first page of the document is inspected.
    first_page = next(extract_pages(sample_pdf))
    horizontal_boxes = list(
        filter(lambda element: isinstance(element, LTTextBoxHorizontal), first_page)
    )
    assert len(horizontal_boxes) == 3
|
||||
|
||||
|
||||
def test_pdf_with_empty_characters_vertical():
    """Regression test for issue #449 (vertical text).

    See: https://github.com/pdfminer/pdfminer.six/pull/689

    The sample PDF has multiple explicit space characters in between its
    lines of text. With vertical text detection enabled, the first page is
    expected to yield exactly three vertical text boxes, i.e. the three
    vertical lines stay separated.
    """
    path = absolute_sample_path("contrib/issue-449-vertical.pdf")
    # Vertical detection is switched on explicitly for this test.
    laparams = LAParams(detect_vertical=True)
    pages = extract_pages(path, laparams=laparams)
    # Only the first page of the document is inspected.
    textboxes = [
        textbox for textbox in next(pages) if isinstance(textbox, LTTextBoxVertical)
    ]
    assert len(textboxes) == 3
|
||||
|
|
Loading…
Reference in New Issue