Fix ordering of textlines within a textbox when boxes_flow is disabled (#412)
* Fix ordering of textlines within a textbox when boxes_flow is disabled
* Add new test PDF sample

pull/442/head
parent
7eff108fa5
commit
7254530d27
|
@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Added
|
### Added
|
||||||
|
|
||||||
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408))
|
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408))
|
||||||
|
### Fixed
|
||||||
|
- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))
|
||||||
|
|
||||||
## [20200402]
|
## [20200402]
|
||||||
|
|
||||||
|
|
|
@ -798,6 +798,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
obj.analyze(laparams)
|
obj.analyze(laparams)
|
||||||
textboxes = list(self.group_textlines(laparams, textlines))
|
textboxes = list(self.group_textlines(laparams, textlines))
|
||||||
if laparams.boxes_flow is None:
|
if laparams.boxes_flow is None:
|
||||||
|
for textbox in textboxes:
|
||||||
|
textbox.analyze(laparams)
|
||||||
|
|
||||||
def getkey(box):
|
def getkey(box):
|
||||||
if isinstance(box, LTTextBoxVertical):
|
if isinstance(box, LTTextBoxVertical):
|
||||||
return (0, -box.x1, -box.y0)
|
return (0, -box.x1, -box.y0)
|
||||||
|
|
Binary file not shown.
|
@ -1,8 +1,8 @@
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from pdfminer.high_level import extract_text
|
from pdfminer.high_level import extract_text, extract_pages
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams, LTTextContainer
|
||||||
|
|
||||||
|
|
||||||
def run_with_string(sample_path, laparams=None):
|
def run_with_string(sample_path, laparams=None):
|
||||||
|
@ -24,11 +24,12 @@ test_strings = {
|
||||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n\f",
|
"H e l l o \n\nW o r l d\n\n\f",
|
||||||
"simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n"
|
"simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||||
"H e l l o \nW o r l d\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
"H e l l o \nW o r l d\n\f",
|
"H e l l o \n\nW o r l d\n\n\f",
|
||||||
"simple2.pdf": "\f",
|
"simple2.pdf": "\f",
|
||||||
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
||||||
|
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -53,6 +54,11 @@ class TestExtractText(unittest.TestCase):
|
||||||
s = run_with_string(test_file)
|
s = run_with_string(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple4_with_string(self):
|
||||||
|
test_file = "simple4.pdf"
|
||||||
|
s = run_with_string(test_file)
|
||||||
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
def test_simple1_with_file(self):
|
def test_simple1_with_file(self):
|
||||||
test_file = "simple1.pdf"
|
test_file = "simple1.pdf"
|
||||||
s = run_with_file(test_file)
|
s = run_with_file(test_file)
|
||||||
|
@ -68,6 +74,54 @@ class TestExtractText(unittest.TestCase):
|
||||||
s = run_with_file(test_file)
|
s = run_with_file(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple4_with_file(self):
|
||||||
|
test_file = "simple4.pdf"
|
||||||
|
s = run_with_file(test_file)
|
||||||
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractPages(unittest.TestCase):
|
||||||
|
def _get_test_file_path(self):
|
||||||
|
test_file = "simple4.pdf"
|
||||||
|
return absolute_sample_path(test_file)
|
||||||
|
|
||||||
|
def test_line_margin(self):
|
||||||
|
# The lines have margin 0.2 relative to the height.
|
||||||
|
# Extract with line_margin 0.19 should break into 3 separate textboxes.
|
||||||
|
pages = list(extract_pages(
|
||||||
|
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)))
|
||||||
|
self.assertEqual(len(pages), 1)
|
||||||
|
page = pages[0]
|
||||||
|
|
||||||
|
elements = [element for element in page
|
||||||
|
if isinstance(element, LTTextContainer)]
|
||||||
|
self.assertEqual(len(elements), 3)
|
||||||
|
self.assertEqual(elements[0].get_text(), "Text1\n")
|
||||||
|
self.assertEqual(elements[1].get_text(), "Text2\n")
|
||||||
|
self.assertEqual(elements[2].get_text(), "Text3\n")
|
||||||
|
|
||||||
|
# Extract with line_margin 0.21 should merge into one textbox.
|
||||||
|
pages = list(extract_pages(
|
||||||
|
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)))
|
||||||
|
self.assertEqual(len(pages), 1)
|
||||||
|
page = pages[0]
|
||||||
|
|
||||||
|
elements = [element for element in page
|
||||||
|
if isinstance(element, LTTextContainer)]
|
||||||
|
self.assertEqual(len(elements), 1)
|
||||||
|
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
|
||||||
|
|
||||||
|
def test_no_boxes_flow(self):
|
||||||
|
pages = list(extract_pages(
|
||||||
|
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)))
|
||||||
|
self.assertEqual(len(pages), 1)
|
||||||
|
page = pages[0]
|
||||||
|
|
||||||
|
elements = [element for element in page
|
||||||
|
if isinstance(element, LTTextContainer)]
|
||||||
|
self.assertEqual(len(elements), 1)
|
||||||
|
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
Loading…
Reference in New Issue