Fix wrong ordering of grouping textboxes introduced by #315. The first grouping of textboxes should be skipped if there are intermediate textboxes. (#335)
Fixes #334
pull/341/head
parent
5c6fa8f986
commit
2bee7d8dcf
|
@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
Nothing yet
|
### Fixed
|
||||||
|
- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))
|
||||||
|
|
||||||
## [20191107] - 2019-11-07
|
## [20191107] - 2019-11-07
|
||||||
|
|
||||||
|
|
|
@ -17,23 +17,23 @@ The most simple way to extract text from a PDF is to use
|
||||||
|
|
||||||
>>> text = extract_text('samples/simple1.pdf')
|
>>> text = extract_text('samples/simple1.pdf')
|
||||||
>>> print(repr(text))
|
>>> print(repr(text))
|
||||||
'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\x0c'
|
'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
|
||||||
>>> print(text)
|
>>> print(text)
|
||||||
... # doctest: +NORMALIZE_WHITESPACE
|
... # doctest: +NORMALIZE_WHITESPACE
|
||||||
Hello
|
Hello
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
World
|
World
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
World
|
|
||||||
<BLANKLINE>
|
|
||||||
Hello
|
Hello
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
H e l l o
|
World
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
H e l l o
|
H e l l o
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
W o r l d
|
W o r l d
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
|
H e l l o
|
||||||
|
<BLANKLINE>
|
||||||
W o r l d
|
W o r l d
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import heapq
|
import heapq
|
||||||
|
import logging
|
||||||
|
|
||||||
from .utils import INF
|
from .utils import INF, shorten_str
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
from .utils import apply_matrix_pt
|
from .utils import apply_matrix_pt
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
|
@ -9,6 +10,8 @@ from .utils import get_bound
|
||||||
from .utils import matrix2str
|
from .utils import matrix2str
|
||||||
from .utils import uniq
|
from .utils import uniq
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class IndexAssigner(object):
|
class IndexAssigner(object):
|
||||||
|
|
||||||
|
@ -45,7 +48,7 @@ class LAParams(object):
|
||||||
considered to be part of the same paragraph. The margin is
|
considered to be part of the same paragraph. The margin is
|
||||||
specified relative to the height of a line.
|
specified relative to the height of a line.
|
||||||
:param boxes_flow: Specifies how much a horizontal and vertical position
|
:param boxes_flow: Specifies how much a horizontal and vertical position
|
||||||
of a text matters when determining the order of lines. The value
|
of a text matters when determining the order of text boxes. The value
|
||||||
should be within the range of -1.0 (only horizontal position
|
should be within the range of -1.0 (only horizontal position
|
||||||
matters) to +1.0 (only vertical position matters).
|
matters) to +1.0 (only vertical position matters).
|
||||||
:param detect_vertical: If vertical text should be considered during
|
:param detect_vertical: If vertical text should be considered during
|
||||||
|
@ -505,7 +508,7 @@ class LTTextGroupTBRL(LTTextGroup):
|
||||||
# reorder the objects from top-right to bottom-left.
|
# reorder the objects from top-right to bottom-left.
|
||||||
self._objs.sort(key=lambda obj:
|
self._objs.sort(key=lambda obj:
|
||||||
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
||||||
- (1-laparams.boxes_flow)*(obj.y1))
|
-(1-laparams.boxes_flow)*(obj.y1))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -562,6 +565,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
|
|
||||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||||
(valign and isinstance(line, LTTextLineVertical))):
|
(valign and isinstance(line, LTTextLineVertical))):
|
||||||
|
|
||||||
line.add(obj1)
|
line.add(obj1)
|
||||||
elif line is not None:
|
elif line is not None:
|
||||||
yield line
|
yield line
|
||||||
|
@ -667,18 +671,19 @@ class LTLayoutContainer(LTContainer):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in range(i+1, len(boxes)):
|
for j in range(i+1, len(boxes)):
|
||||||
obj2 = boxes[j]
|
obj2 = boxes[j]
|
||||||
dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
|
dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
|
||||||
|
obj1, obj2))
|
||||||
heapq.heapify(dists)
|
heapq.heapify(dists)
|
||||||
|
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
done = set()
|
done = set()
|
||||||
while len(dists) > 0:
|
while len(dists) > 0:
|
||||||
(is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
|
(skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
|
||||||
# Skip objects that are already merged
|
# Skip objects that are already merged
|
||||||
if (id1 not in done) and (id2 not in done):
|
if (id1 not in done) and (id2 not in done):
|
||||||
if is_first and isany(obj1, obj2):
|
if skip_isany and isany(obj1, obj2):
|
||||||
heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
|
heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
|
||||||
continue
|
continue
|
||||||
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
||||||
|
|
|
@ -32,6 +32,16 @@ def make_compat_str(in_str):
|
||||||
return in_str
|
return in_str
|
||||||
|
|
||||||
|
|
||||||
|
def shorten_str(s, size):
    """Shorten a string to at most ``size`` characters.

    For ``size`` below 7 the string is simply cut off after ``size``
    characters. Otherwise, a string longer than ``size`` is reduced to its
    head and tail joined by ``' ... '``; a string that already fits is
    returned unchanged.

    :param s: the string to shorten.
    :param size: maximum length of the result. Values below zero are
        treated as zero.
    :return: the shortened string. Because head and tail have equal length,
        the result is one character shorter than ``size`` when ``size`` is
        even.
    """
    if size < 7:
        # Clamp to zero: a negative slice bound would count from the end of
        # the string and return almost all of it instead of nothing.
        return s[:max(size, 0)]
    if len(s) <= size:
        return s
    # The ' ... ' separator takes 5 characters; split the rest evenly.
    length = (size - 5) // 2
    return '{} ... {}'.format(s[:length], s[-length:])
|
||||||
|
|
||||||
|
|
||||||
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
|
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
|
||||||
"""When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
|
"""When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
|
||||||
if six.PY2:
|
if six.PY2:
|
||||||
|
|
|
@ -11,7 +11,7 @@ def run(sample_path):
|
||||||
|
|
||||||
|
|
||||||
test_strings = {
|
test_strings = {
|
||||||
"simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\f",
|
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\f",
|
||||||
"simple2.pdf": "\f",
|
"simple2.pdf": "\f",
|
||||||
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
|
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from nose.tools import assert_equal
|
from nose.tools import assert_equal
|
||||||
|
|
||||||
from pdfminer.layout import LTComponent
|
from pdfminer.layout import LTComponent
|
||||||
from pdfminer.utils import Plane
|
from pdfminer.utils import Plane, shorten_str
|
||||||
|
|
||||||
|
|
||||||
class TestPlane(object):
|
class TestPlane(object):
|
||||||
|
@ -38,3 +38,16 @@ class TestPlane(object):
|
||||||
obj = LTComponent((0, 0, object_size, object_size))
|
obj = LTComponent((0, 0, object_size, object_size))
|
||||||
plane.add(obj)
|
plane.add(obj)
|
||||||
return plane, obj
|
return plane, obj
|
||||||
|
|
||||||
|
|
||||||
|
class TestFunctions(object):
    """Tests for the ``shorten_str`` helper."""

    def test_shorten_str(self):
        # A string longer than the limit keeps its head and tail.
        shortened = shorten_str('Hello there World', 15)
        assert_equal(shortened, 'Hello ... World')

    def test_shorten_short_str_is_same(self):
        # A string that already fits is returned unchanged.
        original = 'Hello World'
        assert_equal(original, shorten_str(original, 50))

    def test_shorten_to_really_short(self):
        # Below 7 characters the string is plainly truncated.
        truncated = shorten_str('Hello World', 5)
        assert_equal('Hello', truncated)
|
Loading…
Reference in New Issue