Fix the wrong ordering of grouped textboxes introduced by #315. The first attempt to group two textboxes should be skipped if there are intermediate textboxes between them. (#335)

Fixes #334
2019-11-10 12:18:49 +01:00 · 2019-11-10 12:18:49 +01:00 · 2bee7d8dcf
parent 5c6fa8f986
commit 2bee7d8dcf
6 changed files with 43 additions and 14 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ## [Unreleased]

-Nothing yet
+### Fixed
+- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))

 ## [20191107] - 2019-11-07

--- a/docs/source/tutorials/highlevel.rst
+++ b/docs/source/tutorials/highlevel.rst
@ -17,23 +17,23 @@ The most simple way to extract text from a PDF is to use

    >>> text = extract_text('samples/simple1.pdf')
    >>> print(repr(text))
-    'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o  \n\nH e l l o  \n\nW o r l d\n\nW o r l d\n\n\x0c'
+    'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\x0c'
    >>> print(text)
    ... # doctest: +NORMALIZE_WHITESPACE
    Hello
    <BLANKLINE>
    World
    <BLANKLINE>
+    Hello
+    <BLANKLINE>
    World
    <BLANKLINE>
    H e l l o
    <BLANKLINE>
-    H e l l o
-    <BLANKLINE>
-    H e l l o
-    <BLANKLINE>
    W o r l d
    <BLANKLINE>
+    H e l l o
+    <BLANKLINE>
    W o r l d
    <BLANKLINE>

--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -1,6 +1,7 @@
 import heapq
+import logging

-from .utils import INF
+from .utils import INF, shorten_str
 from .utils import Plane
 from .utils import apply_matrix_pt
 from .utils import bbox2str
@ -9,6 +10,8 @@ from .utils import get_bound
 from .utils import matrix2str
 from .utils import uniq

+logger = logging.getLogger(__name__)
+

 class IndexAssigner(object):

@ -45,7 +48,7 @@ class LAParams(object):
        considered to be part of the same paragraph. The margin is
        specified relative to the height of a line.
    :param boxes_flow: Specifies how much a horizontal and vertical position
-        of a text matters when determining the order of lines. The value
+        of a text matters when determining the order of text boxes. The value
        should be within the range of -1.0 (only horizontal position
        matters) to +1.0 (only vertical position matters).
    :param detect_vertical: If vertical text should be considered during
@ -562,6 +565,7 @@ class LTLayoutContainer(LTContainer):

                if ((halign and isinstance(line, LTTextLineHorizontal)) or
                    (valign and isinstance(line, LTTextLineVertical))):
+
                    line.add(obj1)
                elif line is not None:
                    yield line
@ -667,18 +671,19 @@ class LTLayoutContainer(LTContainer):
            obj1 = boxes[i]
            for j in range(i+1, len(boxes)):
                obj2 = boxes[j]
-                dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
+                dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
+                              obj1, obj2))
        heapq.heapify(dists)

        plane = Plane(self.bbox)
        plane.extend(boxes)
        done = set()
        while len(dists) > 0:
-            (is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
+            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
            # Skip objects that are already merged
            if (id1 not in done) and (id2 not in done):
-                if is_first and isany(obj1, obj2):
-                    heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
+                if skip_isany and isany(obj1, obj2):
+                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                    continue
                if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
                        isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -32,6 +32,16 @@ def make_compat_str(in_str):
    return in_str


+def shorten_str(s, size):
+    if size < 7:
+        return s[:size]
+    if len(s) > size:
+        length = (size - 5) // 2
+        return '{} ... {}'.format(s[:length], s[-length:])
+    else:
+        return s
+
+
 def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
    if six.PY2:
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -11,7 +11,7 @@ def run(sample_path):


 test_strings = {
-    "simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o  \n\nH e l l o  \n\nW o r l d\n\nW o r l d\n\n\f",
+    "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\f",
    "simple2.pdf": "\f",
    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
 }
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@ -1,7 +1,7 @@
 from nose.tools import assert_equal

 from pdfminer.layout import LTComponent
-from pdfminer.utils import Plane
+from pdfminer.utils import Plane, shorten_str


 class TestPlane(object):
@ -38,3 +38,16 @@ class TestPlane(object):
        obj = LTComponent((0, 0, object_size, object_size))
        plane.add(obj)
        return plane, obj
+
+
+class TestFunctions(object):
+    def test_shorten_str(self):
+        s = shorten_str('Hello there World', 15)
+        assert_equal(s, 'Hello ... World')
+
+    def test_shorten_short_str_is_same(self):
+        s = 'Hello World'
+        assert_equal(s, shorten_str(s, 50))
+
+    def test_shorten_to_really_short(self):
+        assert_equal('Hello', shorten_str('Hello World', 5))