eliminate empty textboxes
parent
dfd621b98c
commit
bb26cf9180
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
import sys
|
import sys
|
||||||
from utils import apply_matrix_pt, get_bound, INF
|
from utils import INF, Plane, get_bound, uniq, csort, fsplit
|
||||||
from utils import bbox2str, matrix2str, uniq, csort, Plane
|
from utils import bbox2str, matrix2str, apply_matrix_pt
|
||||||
|
|
||||||
|
|
||||||
## LAParams
|
## LAParams
|
||||||
|
@ -51,6 +51,9 @@ class LTItem(object):
|
||||||
self.height = y1-y0
|
self.height = y1-y0
|
||||||
self.bbox = (x0, y0, x1, y1)
|
self.bbox = (x0, y0, x1, y1)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def is_empty(self):
    """Tell whether this item is degenerate.

    Returns True when the item's width or its height is zero or
    negative, and False otherwise.
    """
    return self.width <= 0 or self.height <= 0
|
||||||
|
|
||||||
def is_hoverlap(self, obj):
|
def is_hoverlap(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTItem)
|
||||||
|
@ -414,10 +417,11 @@ class LTLayoutContainer(LTContainer):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
# textobjs is a list of LTChar objects, i.e.
|
# textobjs is a list of LTChar objects, i.e.
|
||||||
# it has all the individual characters in the page.
|
# it has all the individual characters in the page.
|
||||||
(textobjs, otherobjs) = self.get_textobjs(self._objs)
|
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
||||||
if not textobjs: return
|
if not textobjs: return
|
||||||
textlines = list(self.get_textlines(laparams, textobjs))
|
textlines = list(self.get_textlines(laparams, textobjs))
|
||||||
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
|
||||||
|
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
||||||
textboxes = list(self.get_textboxes(laparams, textlines))
|
textboxes = list(self.get_textboxes(laparams, textlines))
|
||||||
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
assert len(textlines) == sum( len(box._objs) for box in textboxes )
|
||||||
top = self.group_textboxes(laparams, textboxes)
|
top = self.group_textboxes(laparams, textboxes)
|
||||||
|
@ -431,21 +435,10 @@ class LTLayoutContainer(LTContainer):
|
||||||
return i
|
return i
|
||||||
assign_index(top, 0)
|
assign_index(top, 0)
|
||||||
textboxes.sort(key=lambda box:box.index)
|
textboxes.sort(key=lambda box:box.index)
|
||||||
self._objs = textboxes + otherobjs
|
self._objs = textboxes + otherobjs + empties
|
||||||
self.layout = top
|
self.layout = top
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get_textobjs(self, objs):
    """Split all the objects in the page into text-related objects and others.

    objs is iterated once.  Every object that is an LTChar instance goes
    into the first list; everything else goes into the second.  Both
    lists keep the order in which the objects appear in objs.

    Returns the tuple (textobjs, otherobjs).
    """
    textobjs = []
    otherobjs = []
    for obj in objs:
        if isinstance(obj, LTChar):
            textobjs.append(obj)
        else:
            otherobjs.append(obj)
    return (textobjs, otherobjs)
|
|
||||||
|
|
||||||
def get_textlines(self, laparams, objs):
|
def get_textlines(self, laparams, objs):
|
||||||
obj0 = None
|
obj0 = None
|
||||||
line = None
|
line = None
|
||||||
|
|
|
@ -48,6 +48,18 @@ def csort(objs, key):
|
||||||
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||||
return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
|
return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
|
||||||
|
|
||||||
|
# fsplit
def fsplit(pred, objs):
    """Split a list into two classes according to the predicate.

    pred is called once for each object in objs.  Objects for which it
    returns a true value are collected in the first list, all others in
    the second.  Both lists keep the order in which the objects appear
    in objs.

    Returns the tuple (t, f).
    """
    t = []
    f = []
    for obj in objs:
        if pred(obj):
            t.append(obj)
        else:
            f.append(obj)
    return (t, f)
|
||||||
|
|
||||||
# drange
|
# drange
|
||||||
def drange(v0, v1, d):
|
def drange(v0, v1, d):
|
||||||
"""Returns a discrete range."""
|
"""Returns a discrete range."""
|
||||||
|
|
Loading…
Reference in New Issue