Ignore empty characters when analyzing layout (#689)
* Adding in checks for spurious lines that contain either only spaces or new line characters
* Added spurious lines check and unit tests
* Updated CHANGELOG.md with changes
* Simplify code
* Simplify code
* Simplify code
* Remove changes to lines that are not actually changed
* Format import
* Improve CHANGELOG.md
* Improve CHANGELOG.md
* Fix cicd
* Blacken

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/730/head
parent
121235e24b
commit
43c8fc8557
|
@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
|
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
|
||||||
- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
|
- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
|
||||||
- Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
|
- Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
|
||||||
|
- Ignore empty characters when analyzing layout ([#499](https://github.com/pdfminer/pdfminer.six/pull/499))
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
|
- Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
|
||||||
|
|
|
@ -504,6 +504,9 @@ class LTTextLine(LTTextContainer[TextLineElement]):
|
||||||
) -> List["LTTextLine"]:
|
) -> List["LTTextLine"]:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def is_empty(self) -> bool:
|
||||||
|
return super().is_empty() or self.get_text().isspace()
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineHorizontal(LTTextLine):
|
class LTTextLineHorizontal(LTTextLine):
|
||||||
def __init__(self, word_margin: float) -> None:
|
def __init__(self, word_margin: float) -> None:
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -1,12 +1,16 @@
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
from pdfminer.high_level import extract_pages
|
||||||
from pdfminer.layout import (
|
from pdfminer.layout import (
|
||||||
LTLayoutContainer,
|
LTLayoutContainer,
|
||||||
LAParams,
|
LAParams,
|
||||||
LTTextLineHorizontal,
|
LTTextLineHorizontal,
|
||||||
LTTextLineVertical,
|
LTTextLineVertical,
|
||||||
|
LTTextBoxHorizontal,
|
||||||
|
LTTextBoxVertical,
|
||||||
)
|
)
|
||||||
from pdfminer.utils import Plane
|
from pdfminer.utils import Plane
|
||||||
|
from helpers import absolute_sample_path
|
||||||
|
|
||||||
|
|
||||||
class TestGroupTextLines(unittest.TestCase):
|
class TestGroupTextLines(unittest.TestCase):
|
||||||
|
@ -107,3 +111,38 @@ class TestFindNeigbors(unittest.TestCase):
|
||||||
centrally_aligned_overlapping,
|
centrally_aligned_overlapping,
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdf_with_empty_characters_horizontal():
|
||||||
|
"""Regression test for issue #449
|
||||||
|
|
||||||
|
See: https://github.com/pdfminer/pdfminer.six/pull/689
|
||||||
|
|
||||||
|
The page aggregator should separate the 3 horizontal lines in the
|
||||||
|
sample PDF. The used PDF sample has multiple explicit space characters
|
||||||
|
in between lines with text.
|
||||||
|
"""
|
||||||
|
path = absolute_sample_path("contrib/issue-449-horizontal.pdf")
|
||||||
|
pages = extract_pages(path)
|
||||||
|
textboxes = [
|
||||||
|
textbox for textbox in next(pages) if isinstance(textbox, LTTextBoxHorizontal)
|
||||||
|
]
|
||||||
|
assert len(textboxes) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdf_with_empty_characters_vertical():
|
||||||
|
"""Regression test for issue #449
|
||||||
|
|
||||||
|
See: https://github.com/pdfminer/pdfminer.six/pull/689
|
||||||
|
|
||||||
|
The page aggregator should separate the 3 vertical lines in the
|
||||||
|
sample PDF. The used PDF sample has multiple explicit space characters
|
||||||
|
in between lines with text.
|
||||||
|
"""
|
||||||
|
path = absolute_sample_path("contrib/issue-449-vertical.pdf")
|
||||||
|
laparams = LAParams(detect_vertical=True)
|
||||||
|
pages = extract_pages(path, laparams=laparams)
|
||||||
|
textboxes = [
|
||||||
|
textbox for textbox in next(pages) if isinstance(textbox, LTTextBoxVertical)
|
||||||
|
]
|
||||||
|
assert len(textboxes) == 3
|
||||||
|
|
Loading…
Reference in New Issue