eliminate empty textboxes

pull/1/head
Yusuke Shinyama 2011-03-01 20:47:20 +09:00
parent dfd621b98c
commit bb26cf9180
2 changed files with 20 additions and 15 deletions

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import sys import sys
from utils import apply_matrix_pt, get_bound, INF from utils import INF, Plane, get_bound, uniq, csort, fsplit
from utils import bbox2str, matrix2str, uniq, csort, Plane from utils import bbox2str, matrix2str, apply_matrix_pt
## LAParams ## LAParams
@ -51,6 +51,9 @@ class LTItem(object):
self.height = y1-y0 self.height = y1-y0
self.bbox = (x0, y0, x1, y1) self.bbox = (x0, y0, x1, y1)
return return
def is_empty(self):
    """Return True when the item's width or height is zero or negative."""
    # A non-positive width alone is enough; height is only consulted otherwise.
    if self.width <= 0:
        return True
    return self.height <= 0
def is_hoverlap(self, obj): def is_hoverlap(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTItem)
@ -414,10 +417,11 @@ class LTLayoutContainer(LTContainer):
def analyze(self, laparams): def analyze(self, laparams):
# textobjs is a list of LTChar objects, i.e. # textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page. # it has all the individual characters in the page.
(textobjs, otherobjs) = self.get_textobjs(self._objs) (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
if not textobjs: return if not textobjs: return
textlines = list(self.get_textlines(laparams, textobjs)) textlines = list(self.get_textlines(laparams, textobjs))
assert len(textobjs) <= sum( len(line._objs) for line in textlines ) assert len(textobjs) <= sum( len(line._objs) for line in textlines )
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
textboxes = list(self.get_textboxes(laparams, textlines)) textboxes = list(self.get_textboxes(laparams, textlines))
assert len(textlines) == sum( len(box._objs) for box in textboxes ) assert len(textlines) == sum( len(box._objs) for box in textboxes )
top = self.group_textboxes(laparams, textboxes) top = self.group_textboxes(laparams, textboxes)
@ -431,21 +435,10 @@ class LTLayoutContainer(LTContainer):
return i return i
assign_index(top, 0) assign_index(top, 0)
textboxes.sort(key=lambda box:box.index) textboxes.sort(key=lambda box:box.index)
self._objs = textboxes + otherobjs self._objs = textboxes + otherobjs + empties
self.layout = top self.layout = top
return self return self
def get_textobjs(self, objs):
    """Partition objs into LTChar instances and all remaining objects.

    Returns a (chars, others) tuple of lists; each list keeps the
    relative order in which its members appeared in objs.
    """
    chars = []
    others = []
    for item in objs:
        # Choose the destination list first, then append exactly once.
        bucket = chars if isinstance(item, LTChar) else others
        bucket.append(item)
    return (chars, others)
def get_textlines(self, laparams, objs): def get_textlines(self, laparams, objs):
obj0 = None obj0 = None
line = None line = None

View File

@ -48,6 +48,18 @@ def csort(objs, key):
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) ) idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
return sorted(objs, key=lambda obj:(key(obj), idxs[obj])) return sorted(objs, key=lambda obj:(key(obj), idxs[obj]))
# fsplit
def fsplit(pred, objs):
    """Partition objs by a predicate.

    Returns a (matched, unmatched) tuple of lists: matched holds the
    items for which pred(item) is truthy, unmatched holds the rest.
    Each list keeps the relative order of its items, and pred is
    called exactly once per item.
    """
    matched = []
    unmatched = []
    for item in objs:
        # Select the target list from the predicate result, then append.
        bucket = matched if pred(item) else unmatched
        bucket.append(item)
    return (matched, unmatched)
# drange # drange
def drange(v0, v1, d): def drange(v0, v1, d):
"""Returns a discrete range.""" """Returns a discrete range."""