diff --git a/CHANGELOG.md b/CHANGELOG.md index 91bebe0..1838ad3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] -Nothing yet +## [20191110] - 2019-11-10 + +### Fixed +- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335)) ## [20191107] - 2019-11-07 diff --git a/docs/source/tutorials/highlevel.rst b/docs/source/tutorials/highlevel.rst index 15424b9..d55328d 100644 --- a/docs/source/tutorials/highlevel.rst +++ b/docs/source/tutorials/highlevel.rst @@ -17,23 +17,23 @@ The most simple way to extract text from a PDF is to use >>> text = extract_text('samples/simple1.pdf') >>> print(repr(text)) - 'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\x0c' + 'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c' >>> print(text) ... # doctest: +NORMALIZE_WHITESPACE Hello World - World - Hello - H e l l o + World H e l l o W o r l d + H e l l o + W o r l d diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index d41059a..0df3c86 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -13,7 +13,7 @@ other purposes instead of text analysis. import sys import warnings -__version__ = '20191107' +__version__ = '20191110' if sys.version_info < (3, 0): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 8b8e397..bd2b2a7 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,6 +1,7 @@ import heapq +import logging -from .utils import INF +from .utils import INF, shorten_str from .utils import Plane from .utils import apply_matrix_pt from .utils import bbox2str @@ -9,6 +10,8 @@ from .utils import get_bound from .utils import matrix2str from .utils import uniq +logger = logging.getLogger(__name__) + class IndexAssigner(object): @@ -45,7 +48,7 @@ class LAParams(object): considered to be part of the same paragraph. The margin is specified relative to the height of a line. :param boxes_flow: Specifies how much a horizontal and vertical position - of a text matters when determining the order of lines. The value + of a text matters when determining the order of text boxes. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters). :param detect_vertical: If vertical text should be considered during @@ -505,7 +508,7 @@ class LTTextGroupTBRL(LTTextGroup): # reorder the objects from top-right to bottom-left. self._objs.sort(key=lambda obj: -(1+laparams.boxes_flow)*(obj.x0+obj.x1) - - (1-laparams.boxes_flow)*(obj.y1)) + -(1-laparams.boxes_flow)*(obj.y1)) return @@ -562,6 +565,7 @@ class LTLayoutContainer(LTContainer): if ((halign and isinstance(line, LTTextLineHorizontal)) or (valign and isinstance(line, LTTextLineVertical))): + line.add(obj1) elif line is not None: yield line @@ -667,18 +671,19 @@ class LTLayoutContainer(LTContainer): obj1 = boxes[i] for j in range(i+1, len(boxes)): obj2 = boxes[j] - dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2)) + dists.append((False, dist(obj1, obj2), id(obj1), id(obj2), + obj1, obj2)) heapq.heapify(dists) plane = Plane(self.bbox) plane.extend(boxes) done = set() while len(dists) > 0: - (is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists) + (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists) # Skip objects that are already merged if (id1 not in done) and (id2 not in done): - if is_first and isany(obj1, obj2): - heapq.heappush(dists, (False, d, id1, id2, obj1, obj2)) + if skip_isany and isany(obj1, obj2): + heapq.heappush(dists, (True, d, id1, id2, obj1, obj2)) continue if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \ isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)): diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 4fb5825..3743b3a 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -32,6 +32,16 @@ def make_compat_str(in_str): return in_str +def shorten_str(s, size): + if size < 7: + return s[:size] + if len(s) > size: + length = (size - 5) // 2 + return '{} ... {}'.format(s[:length], s[-length:]) + else: + return s + + def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'): """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either.""" if six.PY2: diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 7062cd8..425c944 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -11,7 +11,7 @@ def run(sample_path): test_strings = { - "simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\f", + "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\f", "simple2.pdf": "\f", "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f", } diff --git a/tests/test_utils.py b/tests/test_utils.py index 5787ac2..edfdaa9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ from nose.tools import assert_equal from pdfminer.layout import LTComponent -from pdfminer.utils import Plane +from pdfminer.utils import Plane, shorten_str class TestPlane(object): @@ -38,3 +38,16 @@ class TestPlane(object): obj = LTComponent((0, 0, object_size, object_size)) plane.add(obj) return plane, obj + + +class TestFunctions(object): + def test_shorten_str(self): + s = shorten_str('Hello there World', 15) + assert_equal(s, 'Hello ... World') + + def test_shorten_short_str_is_same(self): + s = 'Hello World' + assert_equal(s, shorten_str(s, 50)) + + def test_shorten_to_really_short(self): + assert_equal('Hello', shorten_str('Hello World', 5)) \ No newline at end of file