parent
e55560f858
commit
68e2ae8632
|
@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
|
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
- Text no longer comes in reverse order when advanced layout analysis is disabled ([#398](https://github.com/pdfminer/pdfminer.six/pull/398))
|
||||||
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
||||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||||
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
|
||||||
|
|
|
@ -800,9 +800,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
if laparams.boxes_flow is None:
|
if laparams.boxes_flow is None:
|
||||||
def getkey(box):
|
def getkey(box):
|
||||||
if isinstance(box, LTTextBoxVertical):
|
if isinstance(box, LTTextBoxVertical):
|
||||||
return (0, -box.x1, box.y0)
|
return (0, -box.x1, -box.y0)
|
||||||
else:
|
else:
|
||||||
return (1, box.y0, box.x0)
|
return (1, -box.y0, box.x0)
|
||||||
textboxes.sort(key=getkey)
|
textboxes.sort(key=getkey)
|
||||||
else:
|
else:
|
||||||
self.groups = self.group_textboxes(laparams, textboxes)
|
self.groups = self.group_textboxes(laparams, textboxes)
|
||||||
|
|
|
@ -2,11 +2,14 @@ import unittest
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from pdfminer.high_level import extract_text
|
from pdfminer.high_level import extract_text
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
|
|
||||||
def run_with_string(sample_path):
|
def run_with_string(sample_path, laparams=None):
|
||||||
|
if laparams is None:
|
||||||
|
laparams = {}
|
||||||
absolute_path = absolute_sample_path(sample_path)
|
absolute_path = absolute_sample_path(sample_path)
|
||||||
s = extract_text(absolute_path)
|
s = extract_text(absolute_path, laparams=LAParams(**laparams))
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,6 +24,9 @@ test_strings = {
|
||||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n\f",
|
"H e l l o \n\nW o r l d\n\n\f",
|
||||||
|
"simple1.pdf_no_boxes_flow": "Hello \nWorld\nHello \nWorld\n"
|
||||||
|
"H e l l o \nW o r l d\n"
|
||||||
|
"H e l l o \nW o r l d\n\f",
|
||||||
"simple2.pdf": "\f",
|
"simple2.pdf": "\f",
|
||||||
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
||||||
}
|
}
|
||||||
|
@ -32,6 +38,11 @@ class TestExtractText(unittest.TestCase):
|
||||||
s = run_with_string(test_file)
|
s = run_with_string(test_file)
|
||||||
self.assertEqual(s, test_strings[test_file])
|
self.assertEqual(s, test_strings[test_file])
|
||||||
|
|
||||||
|
def test_simple1_no_boxes_flow(self):
|
||||||
|
test_file = "simple1.pdf"
|
||||||
|
s = run_with_string(test_file, laparams={"boxes_flow": None})
|
||||||
|
self.assertEqual(s, test_strings["simple1.pdf_no_boxes_flow"])
|
||||||
|
|
||||||
def test_simple2_with_string(self):
|
def test_simple2_with_string(self):
|
||||||
test_file = "simple2.pdf"
|
test_file = "simple2.pdf"
|
||||||
s = run_with_string(test_file)
|
s = run_with_string(test_file)
|
||||||
|
|
Loading…
Reference in New Issue