parent
e55560f858
commit
68e2ae8632
|
@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
|
||||
|
||||
### Fixed
|
||||
- Text no longer comes in reverse order when advanced layout analysis is disabled ([#398](https://github.com/pdfminer/pdfminer.six/pull/398))
|
||||
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||
|
|
|
@ -800,9 +800,9 @@ class LTLayoutContainer(LTContainer):
|
|||
if laparams.boxes_flow is None:
|
||||
def getkey(box):
|
||||
if isinstance(box, LTTextBoxVertical):
|
||||
return (0, -box.x1, box.y0)
|
||||
return (0, -box.x1, -box.y0)
|
||||
else:
|
||||
return (1, box.y0, box.x0)
|
||||
return (1, -box.y0, box.x0)
|
||||
textboxes.sort(key=getkey)
|
||||
else:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
|
|
|
@ -2,11 +2,14 @@ import unittest
|
|||
|
||||
from helpers import absolute_sample_path
|
||||
from pdfminer.high_level import extract_text
|
||||
from pdfminer.layout import LAParams
|
||||
|
||||
|
||||
def run_with_string(sample_path):
|
||||
def run_with_string(sample_path, laparams=None):
|
||||
if laparams is None:
|
||||
laparams = {}
|
||||
absolute_path = absolute_sample_path(sample_path)
|
||||
s = extract_text(absolute_path)
|
||||
s = extract_text(absolute_path, laparams=LAParams(**laparams))
|
||||
return s
|
||||
|
||||
|
||||
|
@ -21,6 +24,9 @@ test_strings = {
|
|||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n\f",
|
||||
"simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n"
|
||||
"H e l l o \nW o r l d\n"
|
||||
"H e l l o \nW o r l d\n\f",
|
||||
"simple2.pdf": "\f",
|
||||
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
||||
}
|
||||
|
@ -32,6 +38,11 @@ class TestExtractText(unittest.TestCase):
|
|||
s = run_with_string(test_file)
|
||||
self.assertEqual(s, test_strings[test_file])
|
||||
|
||||
def test_simple1_no_boxes_flow(self):
|
||||
test_file = "simple1.pdf"
|
||||
s = run_with_string(test_file, laparams={"boxes_flow": None})
|
||||
self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
|
||||
|
||||
def test_simple2_with_string(self):
|
||||
test_file = "simple2.pdf"
|
||||
s = run_with_string(test_file)
|
||||
|
|
Loading…
Reference in New Issue