diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7130f21..7fa9bf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
 
 ### Fixed
+- Text no longer comes in reverse order when advanced layout analysis is disabled ([#398](https://github.com/pdfminer/pdfminer.six/pull/398))
 - Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
 - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
 - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 100bf22..cc0c88d 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -800,9 +800,9 @@ class LTLayoutContainer(LTContainer):
         if laparams.boxes_flow is None:
             def getkey(box):
                 if isinstance(box, LTTextBoxVertical):
-                    return (0, -box.x1, box.y0)
+                    return (0, -box.x1, -box.y0)
                 else:
-                    return (1, box.y0, box.x0)
+                    return (1, -box.y0, box.x0)
             textboxes.sort(key=getkey)
         else:
             self.groups = self.group_textboxes(laparams, textboxes)
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
index c5c6f95..9ed7224 100644
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@@ -2,11 +2,14 @@ import unittest
 
 from helpers import absolute_sample_path
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
 
 
-def run_with_string(sample_path):
+def run_with_string(sample_path, laparams=None):
+    if laparams is None:
+        laparams = {}
     absolute_path = absolute_sample_path(sample_path)
-    s = extract_text(absolute_path)
+    s = extract_text(absolute_path, laparams=LAParams(**laparams))
     return s
 
 
@@ -21,6 +24,9 @@ test_strings = {
     "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
                    "H e l l o \n\nW o r l d\n\n"
                    "H e l l o \n\nW o r l d\n\n\f",
+    "simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n"
+                                 "H e l l o \nW o r l d\n"
+                                 "H e l l o \nW o r l d\n\f",
     "simple2.pdf": "\f",
     "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
 }
@@ -32,6 +38,11 @@ class TestExtractText(unittest.TestCase):
         s = run_with_string(test_file)
         self.assertEqual(s, test_strings[test_file])
 
+    def test_simple1_no_boxes_flow(self):
+        test_file = "simple1.pdf"
+        s = run_with_string(test_file, laparams={"boxes_flow": None})
+        self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
+
     def test_simple2_with_string(self):
         test_file = "simple2.pdf"
         s = run_with_string(test_file)