Fix text coming in reverse order with boxes flow disabled (#399)

Closes #398
pull/421/head
Jake Stockwin 2020-04-01 12:37:04 +01:00 committed by GitHub
parent e55560f858
commit 68e2ae8632
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 16 additions and 4 deletions

View File

@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
### Fixed
- Text no longer comes in reverse order when advanced layout analysis is disabled ([#398](https://github.com/pdfminer/pdfminer.six/pull/398))
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))

View File

@@ -800,9 +800,9 @@ class LTLayoutContainer(LTContainer):
if laparams.boxes_flow is None:
def getkey(box):
if isinstance(box, LTTextBoxVertical):
return (0, -box.x1, box.y0)
return (0, -box.x1, -box.y0)
else:
return (1, box.y0, box.x0)
return (1, -box.y0, box.x0)
textboxes.sort(key=getkey)
else:
self.groups = self.group_textboxes(laparams, textboxes)

View File

@@ -2,11 +2,14 @@ import unittest
from helpers import absolute_sample_path
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
def run_with_string(sample_path):
def run_with_string(sample_path, laparams=None):
if laparams is None:
laparams = {}
absolute_path = absolute_sample_path(sample_path)
s = extract_text(absolute_path)
s = extract_text(absolute_path, laparams=LAParams(**laparams))
return s
@@ -21,6 +24,9 @@ test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f",
"simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n"
"H e l l o \nW o r l d\n"
"H e l l o \nW o r l d\n\f",
"simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
}
@@ -32,6 +38,11 @@ class TestExtractText(unittest.TestCase):
s = run_with_string(test_file)
self.assertEqual(s, test_strings[test_file])
def test_simple1_no_boxes_flow(self):
test_file = "simple1.pdf"
s = run_with_string(test_file, laparams={"boxes_flow": None})
self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
def test_simple2_with_string(self):
test_file = "simple2.pdf"
s = run_with_string(test_file)