Merge pull request #141 from timb07/speedup_layout

Speed up layout of text boxes
pull/219/head
Tata Ganesh 2018-11-08 20:28:40 +05:30 committed by GitHub
commit e03ecab856
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 17 additions and 23 deletions

View File

@ -8,5 +8,6 @@ install:
- pip install six - pip install six
- pip install pycryptodome - pip install pycryptodome
- pip install chardet - pip install chardet
- pip install sortedcontainers
script: script:
nosetests --nologcapture nosetests --nologcapture

View File

@ -1,9 +1,9 @@
from sortedcontainers import SortedListWithKey
from .utils import INF from .utils import INF
from .utils import Plane from .utils import Plane
from .utils import get_bound from .utils import get_bound
from .utils import uniq from .utils import uniq
from .utils import csort
from .utils import fsplit from .utils import fsplit
from .utils import bbox2str from .utils import bbox2str
from .utils import matrix2str from .utils import matrix2str
@ -441,7 +441,7 @@ class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams): def analyze(self, laparams):
LTTextBox.analyze(self, laparams) LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.y1) self._objs.sort(key=lambda obj: -obj.y1)
return return
def get_writing_mode(self): def get_writing_mode(self):
@ -452,7 +452,7 @@ class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams): def analyze(self, laparams):
LTTextBox.analyze(self, laparams) LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.x1) self._objs.sort(key=lambda obj: -obj.x1)
return return
def get_writing_mode(self): def get_writing_mode(self):
@ -474,7 +474,7 @@ class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams): def analyze(self, laparams):
LTTextGroup.analyze(self, laparams) LTTextGroup.analyze(self, laparams)
# reorder the objects from top-left to bottom-right. # reorder the objects from top-left to bottom-right.
self._objs = csort(self._objs, key=lambda obj: self._objs.sort(key=lambda obj:
(1-laparams.boxes_flow)*(obj.x0) - (1-laparams.boxes_flow)*(obj.x0) -
(1+laparams.boxes_flow)*(obj.y0+obj.y1)) (1+laparams.boxes_flow)*(obj.y0+obj.y1))
return return
@ -485,7 +485,7 @@ class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams): def analyze(self, laparams):
LTTextGroup.analyze(self, laparams) LTTextGroup.analyze(self, laparams)
# reorder the objects from top-right to bottom-left. # reorder the objects from top-right to bottom-left.
self._objs = csort(self._objs, key=lambda obj: self._objs.sort(key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1) -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
- (1-laparams.boxes_flow)*(obj.y1)) - (1-laparams.boxes_flow)*(obj.y1))
return return
@ -639,21 +639,18 @@ class LTLayoutContainer(LTContainer):
(c,d,_,_) = t (c,d,_,_) = t
return (c,d) return (c,d)
# XXX this still takes O(n^2) :( dists = SortedListWithKey(key=key_obj)
dists = []
for i in range(len(boxes)): for i in range(len(boxes)):
obj1 = boxes[i] obj1 = boxes[i]
for j in range(i+1, len(boxes)): for j in range(i+1, len(boxes)):
obj2 = boxes[j] obj2 = boxes[j]
dists.append((0, dist(obj1, obj2), obj1, obj2)) dists.add((0, dist(obj1, obj2), obj1, obj2))
# We could use dists.sort(), but it would randomize the test result.
dists = csort(dists, key=key_obj)
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(boxes) plane.extend(boxes)
while dists: while dists:
(c, d, obj1, obj2) = dists.pop(0) (c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2): if c == 0 and isany(obj1, obj2):
dists.append((1, d, obj1, obj2)) dists.add((1, d, obj1, obj2))
continue continue
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
@ -662,11 +659,13 @@ class LTLayoutContainer(LTContainer):
group = LTTextGroupLRTB([obj1, obj2]) group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1) plane.remove(obj1)
plane.remove(obj2) plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists removed = [obj1, obj2]
if (obj1 in plane and obj2 in plane) ] to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in removed or obj2 in removed) ]
for r in to_remove:
dists.remove(r)
for other in plane: for other in plane:
dists.append((0, dist(group, other), group, other)) dists.add((0, dist(group, other), group, other))
dists = csort(dists, key=key_obj)
plane.add(group) plane.add(group)
assert len(plane) == 1, str(len(plane)) assert len(plane) == 1, str(len(plane))
return list(plane) return list(plane)

View File

@ -145,13 +145,6 @@ def uniq(objs):
return return
# csort
def csort(objs, key):
    """Order-preserving sorting function.

    Return a new list holding the items of ``objs`` sorted by ``key(obj)``.
    Items whose keys compare equal keep their original relative order.

    ``objs`` may be any iterable, including a one-shot iterator, and its
    items do not need to be hashable.
    """
    # sorted() is guaranteed to be stable, so ties already come out in input
    # order; no explicit (key, original-index) tie-break table is needed.
    return sorted(objs, key=key)
# fsplit # fsplit
def fsplit(pred, objs): def fsplit(pred, objs):
"""Split a list into two classes according to the predicate.""" """Split a list into two classes according to the predicate."""

View File

@ -3,7 +3,7 @@ import sys
import pdfminer as package import pdfminer as package
requires = ['six', 'pycryptodome'] requires = ['six', 'pycryptodome', 'sortedcontainers']
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
requires.append('chardet') requires.append('chardet')

View File

@ -8,3 +8,4 @@ deps =
pycryptodome pycryptodome
chardet chardet
nose nose
sortedcontainers