Merge branch 'develop'

pull/341/head 20191110
Pieter Marsman 2019-11-10 12:59:55 +01:00
commit 452f0b4ad0
7 changed files with 46 additions and 15 deletions

View File

@ -5,7 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased] ## [Unreleased]
Nothing yet ## [20191110] - 2019-11-10
### Fixed
- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))
## [20191107] - 2019-11-07 ## [20191107] - 2019-11-07

View File

@ -17,23 +17,23 @@ The most simple way to extract text from a PDF is to use
>>> text = extract_text('samples/simple1.pdf') >>> text = extract_text('samples/simple1.pdf')
>>> print(repr(text)) >>> print(repr(text))
'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\x0c' 'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
>>> print(text) >>> print(text)
... # doctest: +NORMALIZE_WHITESPACE ... # doctest: +NORMALIZE_WHITESPACE
Hello Hello
<BLANKLINE> <BLANKLINE>
World World
<BLANKLINE> <BLANKLINE>
World
<BLANKLINE>
Hello Hello
<BLANKLINE> <BLANKLINE>
H e l l o World
<BLANKLINE> <BLANKLINE>
H e l l o H e l l o
<BLANKLINE> <BLANKLINE>
W o r l d W o r l d
<BLANKLINE> <BLANKLINE>
H e l l o
<BLANKLINE>
W o r l d W o r l d
<BLANKLINE> <BLANKLINE>

View File

@ -13,7 +13,7 @@ other purposes instead of text analysis.
import sys import sys
import warnings import warnings
__version__ = '20191107' __version__ = '20191110'
if sys.version_info < (3, 0): if sys.version_info < (3, 0):

View File

@ -1,6 +1,7 @@
import heapq import heapq
import logging
from .utils import INF from .utils import INF, shorten_str
from .utils import Plane from .utils import Plane
from .utils import apply_matrix_pt from .utils import apply_matrix_pt
from .utils import bbox2str from .utils import bbox2str
@ -9,6 +10,8 @@ from .utils import get_bound
from .utils import matrix2str from .utils import matrix2str
from .utils import uniq from .utils import uniq
logger = logging.getLogger(__name__)
class IndexAssigner(object): class IndexAssigner(object):
@ -45,7 +48,7 @@ class LAParams(object):
considered to be part of the same paragraph. The margin is considered to be part of the same paragraph. The margin is
specified relative to the height of a line. specified relative to the height of a line.
:param boxes_flow: Specifies how much a horizontal and vertical position :param boxes_flow: Specifies how much a horizontal and vertical position
of a text matters when determining the order of lines. The value of a text matters when determining the order of text boxes. The value
should be within the range of -1.0 (only horizontal position should be within the range of -1.0 (only horizontal position
matters) to +1.0 (only vertical position matters). matters) to +1.0 (only vertical position matters).
:param detect_vertical: If vertical text should be considered during :param detect_vertical: If vertical text should be considered during
@ -505,7 +508,7 @@ class LTTextGroupTBRL(LTTextGroup):
# reorder the objects from top-right to bottom-left. # reorder the objects from top-right to bottom-left.
self._objs.sort(key=lambda obj: self._objs.sort(key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1) -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
- (1-laparams.boxes_flow)*(obj.y1)) -(1-laparams.boxes_flow)*(obj.y1))
return return
@ -562,6 +565,7 @@ class LTLayoutContainer(LTContainer):
if ((halign and isinstance(line, LTTextLineHorizontal)) or if ((halign and isinstance(line, LTTextLineHorizontal)) or
(valign and isinstance(line, LTTextLineVertical))): (valign and isinstance(line, LTTextLineVertical))):
line.add(obj1) line.add(obj1)
elif line is not None: elif line is not None:
yield line yield line
@ -667,18 +671,19 @@ class LTLayoutContainer(LTContainer):
obj1 = boxes[i] obj1 = boxes[i]
for j in range(i+1, len(boxes)): for j in range(i+1, len(boxes)):
obj2 = boxes[j] obj2 = boxes[j]
dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2)) dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
obj1, obj2))
heapq.heapify(dists) heapq.heapify(dists)
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(boxes) plane.extend(boxes)
done = set() done = set()
while len(dists) > 0: while len(dists) > 0:
(is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists) (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
# Skip objects that are already merged # Skip objects that are already merged
if (id1 not in done) and (id2 not in done): if (id1 not in done) and (id2 not in done):
if is_first and isany(obj1, obj2): if skip_isany and isany(obj1, obj2):
heapq.heappush(dists, (False, d, id1, id2, obj1, obj2)) heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
continue continue
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \ if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)): isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):

View File

@ -32,6 +32,16 @@ def make_compat_str(in_str):
return in_str return in_str
def shorten_str(s, size):
    """Shorten a string so that it is at most ``size`` characters long.

    Strings that already fit are returned unchanged. Longer strings are
    reduced to their head and tail joined by ``' ... '``, for example
    ``shorten_str('Hello there World', 15)`` gives ``'Hello ... World'``.

    :param s: The string to shorten.
    :param size: The maximum length of the result. Values below 7 leave no
        room for a head, the ``' ... '`` separator and a tail, so the string
        is simply cut off after ``size`` characters. Negative values are
        treated as 0.
    :return: A string of at most ``size`` characters.
    """
    if size < 7:
        # Clamp at zero: a negative slice bound would count from the end of
        # the string and return almost all of it instead of nothing.
        return s[:max(size, 0)]
    if len(s) > size:
        # Five characters are taken by ' ... '; the rest is split evenly
        # between the head and the tail.
        length = (size - 5) // 2
        return '{} ... {}'.format(s[:length], s[-length:])
    return s
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'): def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
"""When Py2 str.encode is called, it often means bytes.encode in Py3. This does either.""" """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
if six.PY2: if six.PY2:

View File

@ -11,7 +11,7 @@ def run(sample_path):
test_strings = { test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o \n\nH e l l o \n\nW o r l d\n\nW o r l d\n\n\f", "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f", "simple2.pdf": "\f",
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f", "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
} }

View File

@ -1,7 +1,7 @@
from nose.tools import assert_equal from nose.tools import assert_equal
from pdfminer.layout import LTComponent from pdfminer.layout import LTComponent
from pdfminer.utils import Plane from pdfminer.utils import Plane, shorten_str
class TestPlane(object): class TestPlane(object):
@ -38,3 +38,16 @@ class TestPlane(object):
obj = LTComponent((0, 0, object_size, object_size)) obj = LTComponent((0, 0, object_size, object_size))
plane.add(obj) plane.add(obj)
return plane, obj return plane, obj
class TestFunctions(object):
    """Tests for the stand-alone helper functions in pdfminer.utils."""

    def test_shorten_str(self):
        # A string longer than the limit keeps its head and tail.
        assert_equal(shorten_str('Hello there World', 15), 'Hello ... World')

    def test_shorten_short_str_is_same(self):
        # A string that already fits is returned untouched.
        original = 'Hello World'
        shortened = shorten_str(original, 50)
        assert_equal(original, shortened)

    def test_shorten_to_really_short(self):
        # Below the minimum size the string is simply truncated.
        expected = 'Hello'
        assert_equal(expected, shorten_str('Hello World', 5))