layout analysis changed drastically.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@186 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
85c5476623
commit
ffaaea0bac
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Fri Mar 12 13:45:59 UTC 2010
|
Last Modified: Sat Mar 20 05:43:04 UTC 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -131,7 +131,7 @@ In order to handle CJK languages,
|
||||||
additional data called <code>CMap</code> is required.
|
additional data called <code>CMap</code> is required.
|
||||||
CMap files are not installed by default.
|
CMap files are not installed by default.
|
||||||
<p>
|
<p>
|
||||||
Here is the additional step you need:
|
Here is the additional step you need to take:
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
# <strong>make cmap</strong>
|
# <strong>make cmap</strong>
|
||||||
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
|
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
|
||||||
|
@ -348,6 +348,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2010/03/xx: Improved layout analysis.
|
||||||
<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
|
<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
|
||||||
<li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -> LTChar)
|
<li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -> LTChar)
|
||||||
<li> 2010/02/15: Several bugfixes. Thanks to Sean.
|
<li> 2010/02/15: Several bugfixes. Thanks to Sean.
|
||||||
|
|
|
@ -290,8 +290,7 @@ class XMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textline>\n')
|
self.outfp.write('</textline>\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' %
|
self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox))
|
||||||
(item.id, bbox2str(item.bbox)))
|
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
|
|
|
@ -19,6 +19,14 @@ def get_bounds(pts):
|
||||||
y1 = max(y1, y)
|
y1 = max(y1, y)
|
||||||
return (x0,y0,x1,y1)
|
return (x0,y0,x1,y1)
|
||||||
|
|
||||||
|
def uniq(objs):
|
||||||
|
done = set()
|
||||||
|
for obj in objs:
|
||||||
|
if obj in done: continue
|
||||||
|
done.add(obj)
|
||||||
|
yield obj
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## LAParams
|
## LAParams
|
||||||
##
|
##
|
||||||
|
@ -27,7 +35,7 @@ class LAParams(object):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
direction=None,
|
direction=None,
|
||||||
line_overlap=0.5,
|
line_overlap=0.5,
|
||||||
char_margin=1.0,
|
char_margin=3.0,
|
||||||
line_margin=0.5,
|
line_margin=0.5,
|
||||||
word_margin=0.1):
|
word_margin=0.1):
|
||||||
self.direction = direction
|
self.direction = direction
|
||||||
|
@ -73,59 +81,52 @@ class Plane(object):
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
i0 = bsearch(self.xobjs, x0)[0]
|
i0 = bsearch(self.xobjs, x0)[0]
|
||||||
i1 = bsearch(self.xobjs, x1)[1]
|
i1 = bsearch(self.xobjs, x1)[1]
|
||||||
xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] )
|
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
||||||
i0 = bsearch(self.yobjs, y0)[0]
|
i0 = bsearch(self.yobjs, y0)[0]
|
||||||
i1 = bsearch(self.yobjs, y1)[1]
|
i1 = bsearch(self.yobjs, y1)[1]
|
||||||
yobjs = [pair[1] for pair in self.yobjs[i0:i1]]
|
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
||||||
xobjs.intersection_update(yobjs)
|
xobjs.intersection_update(yobjs)
|
||||||
return xobjs
|
return list(xobjs)
|
||||||
|
|
||||||
|
|
||||||
## ClusterSet
|
## ClusterBuilder
|
||||||
##
|
##
|
||||||
class ClusterSet(object):
|
class ClusterBuilder(object):
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, groupfunc):
|
||||||
self.clusters = {}
|
self.clusters = {}
|
||||||
self.klass = klass
|
self.groupfunc = groupfunc
|
||||||
self.i = 0
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
# group(objs): groups given objects into one cluster.
|
||||||
return '<cset: %d>' % self.i
|
def group(self, objs):
|
||||||
|
r = objs[:]
|
||||||
# add(objs): groups text objects if necessary.
|
for obj1 in objs:
|
||||||
def add(self, objs):
|
if obj1 in self.clusters:
|
||||||
group = self.klass(self.i, objs)
|
r.extend(self.clusters.pop(obj1))
|
||||||
self.i += 1
|
cluster = self.groupfunc(list(uniq(r)))
|
||||||
for obj in objs:
|
for obj in r:
|
||||||
if obj in self.clusters:
|
self.clusters[obj] = cluster
|
||||||
group.merge(self.clusters[obj])
|
|
||||||
for obj in group:
|
|
||||||
self.clusters[obj] = group
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# finish(): returns all the LTTextBoxes in a page.
|
# finish(): returns all the clusters.
|
||||||
def finish(self):
|
def finish(self):
|
||||||
r = set(self.clusters.itervalues())
|
clusters = set(self.clusters.itervalues())
|
||||||
for group in r:
|
for cluster in clusters:
|
||||||
group.fixate()
|
cluster.fixate()
|
||||||
return list(r)
|
return list(clusters)
|
||||||
|
|
||||||
@classmethod
|
def build_clusters(groupfunc, objs, (hratio, vratio)):
|
||||||
def build(klass, objs, hratio, vratio, objtype, func=None):
|
plane = Plane(objs)
|
||||||
plane = Plane(objs)
|
builder = ClusterBuilder(groupfunc)
|
||||||
cset = ClusterSet(objtype)
|
for obj in objs:
|
||||||
for obj in objs:
|
margin = obj.get_margin()
|
||||||
margin = obj.get_margin()
|
hmargin = hratio * margin
|
||||||
hmargin = hratio * margin
|
vmargin = vratio * margin
|
||||||
vmargin = vratio * margin
|
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
assert obj in neighbors, obj
|
||||||
assert obj in neighbors, obj
|
builder.group(neighbors)
|
||||||
if func:
|
return builder.finish()
|
||||||
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
|
|
||||||
cset.add(neighbors)
|
|
||||||
return cset.finish()
|
|
||||||
|
|
||||||
|
|
||||||
## LayoutItem
|
## LayoutItem
|
||||||
|
@ -153,58 +154,62 @@ class LayoutItem(object):
|
||||||
|
|
||||||
def is_hoverlap(self, obj):
|
def is_hoverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
def hdistance(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
if self.is_hoverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
|
||||||
|
def hoverlap(self, obj):
|
||||||
|
assert isinstance(obj, LayoutItem)
|
||||||
|
if self.is_hoverlap(obj):
|
||||||
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
def is_voverlap(self, obj):
|
def is_voverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
def voverlap(self, obj):
|
def vdistance(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
if self.is_voverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
|
||||||
|
def voverlap(self, obj):
|
||||||
|
assert isinstance(obj, LayoutItem)
|
||||||
|
if self.is_voverlap(obj):
|
||||||
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
|
|
||||||
def get_margin(self):
|
def get_margin(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_direction(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
## LayoutContainer
|
## LayoutContainer
|
||||||
##
|
##
|
||||||
class LayoutContainer(LayoutItem):
|
class LayoutContainer(LayoutItem):
|
||||||
|
|
||||||
def __init__(self, id, bbox, objs=None):
|
def __init__(self, bbox, objs=None):
|
||||||
LayoutItem.__init__(self, bbox)
|
LayoutItem.__init__(self, bbox)
|
||||||
self.id = id
|
|
||||||
if objs:
|
if objs:
|
||||||
self.objs = set(objs)
|
self.objs = objs[:]
|
||||||
else:
|
else:
|
||||||
self.objs = set()
|
self.objs = []
|
||||||
self.weight = None
|
self.weight = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<group %s>' % bbox2str(self.bbox))
|
return ('<container %s>' % bbox2str(self.bbox))
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter(self.objs)
|
return iter(self.objs)
|
||||||
|
@ -213,11 +218,11 @@ class LayoutContainer(LayoutItem):
|
||||||
return len(self.objs)
|
return len(self.objs)
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
self.objs.add(obj)
|
self.objs.append(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def merge(self, group):
|
def merge(self, container):
|
||||||
self.objs.update(iter(group))
|
self.objs.extend(container.objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
# fixate(): determines its boundary and writing direction.
|
# fixate(): determines its boundary and writing direction.
|
||||||
|
@ -236,9 +241,6 @@ class LayoutContainer(LayoutItem):
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return self.weight
|
return self.weight
|
||||||
|
|
||||||
def get_direction(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
## LTPolygon
|
## LTPolygon
|
||||||
##
|
##
|
||||||
|
@ -259,15 +261,6 @@ class LTPolygon(LayoutItem):
|
||||||
class LTLine(LTPolygon):
|
class LTLine(LTPolygon):
|
||||||
|
|
||||||
def __init__(self, linewidth, p0, p1):
|
def __init__(self, linewidth, p0, p1):
|
||||||
(x0,y0) = p0
|
|
||||||
(x1,y1) = p0
|
|
||||||
self.direction = None
|
|
||||||
if y0 == y1:
|
|
||||||
# horizontal ruler
|
|
||||||
self.direction = 'H'
|
|
||||||
elif x0 == x1:
|
|
||||||
# vertical ruler
|
|
||||||
self.direction = 'V'
|
|
||||||
LTPolygon.__init__(self, linewidth, [p0, p1])
|
LTPolygon.__init__(self, linewidth, [p0, p1])
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -397,8 +390,9 @@ class LTFigure(LayoutContainer):
|
||||||
(x,y,w,h) = bbox
|
(x,y,w,h) = bbox
|
||||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||||
|
self.id = id
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
LayoutContainer.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -410,47 +404,51 @@ class LTFigure(LayoutContainer):
|
||||||
##
|
##
|
||||||
class LTTextLine(LayoutContainer):
|
class LTTextLine(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, objs, direction, word_margin):
|
def __init__(self, objs):
|
||||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||||
self.direction = direction
|
|
||||||
self.word_margin = word_margin
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textline %s(%s)>' % (bbox2str(self.bbox), self.direction))
|
return ('<textline %s>' % bbox2str(self.bbox))
|
||||||
|
|
||||||
def get_margin(self):
|
def get_margin(self):
|
||||||
return min(self.width, self.height)
|
return min(self.width, self.height)
|
||||||
|
|
||||||
def get_direction(self):
|
|
||||||
return self.direction
|
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||||
|
|
||||||
def fixate(self):
|
class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
|
def __init__(self, objs, word_margin):
|
||||||
|
LTTextLine.__init__(self, objs)
|
||||||
LayoutContainer.fixate(self)
|
LayoutContainer.fixate(self)
|
||||||
objs = []
|
objs = []
|
||||||
if self.direction == 'V':
|
x1 = INF
|
||||||
y0 = -INF
|
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||||
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
if isinstance(obj, LTChar) and word_margin:
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
margin = word_margin * obj.get_margin()
|
||||||
margin = self.word_margin * obj.get_margin()
|
if x1 < obj.x0-margin:
|
||||||
if obj.y1+margin < y0:
|
objs.append(LTAnon(' '))
|
||||||
objs.append(LTAnon(' '))
|
objs.append(obj)
|
||||||
objs.append(obj)
|
x1 = obj.x1
|
||||||
y0 = obj.y0
|
self.objs = objs + [LTAnon('\n')]
|
||||||
else:
|
return
|
||||||
x1 = INF
|
|
||||||
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
class LTTextLineVertical(LTTextLine):
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
|
||||||
margin = self.word_margin * obj.get_margin()
|
def __init__(self, objs, word_margin):
|
||||||
if x1 < obj.x0-margin:
|
LTTextLine.__init__(self, objs)
|
||||||
objs.append(LTAnon(' '))
|
LayoutContainer.fixate(self)
|
||||||
objs.append(obj)
|
objs = []
|
||||||
x1 = obj.x1
|
y0 = -INF
|
||||||
objs.append(LTAnon('\n'))
|
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||||
self.objs = objs
|
if isinstance(obj, LTChar) and word_margin:
|
||||||
|
margin = word_margin * obj.get_margin()
|
||||||
|
if obj.y1+margin < y0:
|
||||||
|
objs.append(LTAnon(' '))
|
||||||
|
objs.append(obj)
|
||||||
|
y0 = obj.y0
|
||||||
|
self.objs = objs + [LTAnon('\n')]
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -461,27 +459,29 @@ class LTTextLine(LayoutContainer):
|
||||||
##
|
##
|
||||||
class LTTextBox(LayoutContainer):
|
class LTTextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, objs, direction):
|
def __init__(self, objs):
|
||||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||||
self.direction = direction
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox %s(%s) %r...>' % (bbox2str(self.bbox), self.direction, self.get_text()[:20]))
|
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||||
|
|
||||||
|
class LTTextBoxHorizontal(LTTextBox):
|
||||||
|
|
||||||
def fixate(self):
|
def fixate(self):
|
||||||
LayoutContainer.fixate(self)
|
LTTextBox.fixate(self)
|
||||||
if self.direction == 'V':
|
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
|
||||||
else:
|
|
||||||
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_direction(self):
|
class LTTextBoxVertical(LTTextBox):
|
||||||
return self.direction
|
|
||||||
|
def fixate(self):
|
||||||
|
LTTextBox.fixate(self)
|
||||||
|
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
def tsort(objs, f):
|
def tsort(objs, f):
|
||||||
|
@ -518,7 +518,8 @@ def tsort(objs, f):
|
||||||
class LTPage(LayoutContainer):
|
class LTPage(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, rotate=0):
|
def __init__(self, id, bbox, rotate=0):
|
||||||
LayoutContainer.__init__(self, id, bbox)
|
LayoutContainer.__init__(self, bbox)
|
||||||
|
self.id = id
|
||||||
self.rotate = rotate
|
self.rotate = rotate
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -534,36 +535,87 @@ class LTPage(LayoutContainer):
|
||||||
else:
|
else:
|
||||||
otherobjs.append(obj)
|
otherobjs.append(obj)
|
||||||
if laparams.direction == 'V':
|
if laparams.direction == 'V':
|
||||||
def vline(obj1, obj2):
|
textobjs = self.analyze_layout_vertical(textobjs, laparams)
|
||||||
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
|
||||||
def vorder(obj1, obj2):
|
|
||||||
if obj1.is_voverlap(obj2):
|
|
||||||
return obj2.x1 < obj1.x0
|
|
||||||
elif obj1.is_hoverlap(obj2):
|
|
||||||
return obj2.y1 < obj1.y0
|
|
||||||
else:
|
|
||||||
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
|
|
||||||
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
|
|
||||||
vline)
|
|
||||||
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
|
||||||
boxes = tsort(boxes, vorder)
|
|
||||||
else:
|
else:
|
||||||
def hline(obj1, obj2):
|
textobjs = self.analyze_layout_horizontal(textobjs, laparams)
|
||||||
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
self.objs = textobjs + otherobjs
|
||||||
def horder(obj1, obj2):
|
|
||||||
if obj1.is_hoverlap(obj2):
|
|
||||||
return obj2.y1 < obj1.y0
|
|
||||||
elif obj1.is_voverlap(obj2):
|
|
||||||
return obj1.x1 < obj2.x0
|
|
||||||
else:
|
|
||||||
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
|
||||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
|
||||||
hline)
|
|
||||||
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
|
||||||
boxes = tsort(boxes, horder)
|
|
||||||
self.objs = otherobjs + boxes
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def analyze_layout_horizontal(self, objs, laparams):
|
||||||
|
|
||||||
|
def halign(obj1, obj2):
|
||||||
|
# +------+ - - -
|
||||||
|
# | obj1 | - - +------+ -
|
||||||
|
# | | | obj2 | | (line_overlap)
|
||||||
|
# +------+ - - | | -
|
||||||
|
# - - - +------+
|
||||||
|
#
|
||||||
|
# |<--->|
|
||||||
|
# (char_margin)
|
||||||
|
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
|
||||||
|
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
line = []
|
||||||
|
prev = None
|
||||||
|
for cur in objs:
|
||||||
|
if prev is not None and not halign(prev, cur):
|
||||||
|
if line:
|
||||||
|
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
||||||
|
line = []
|
||||||
|
line.append(cur)
|
||||||
|
prev = cur
|
||||||
|
if line:
|
||||||
|
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
||||||
|
boxes = build_clusters(LTTextBoxHorizontal, lines, (0, laparams.line_margin))
|
||||||
|
|
||||||
|
def horder(obj1, obj2):
|
||||||
|
if obj1.is_hoverlap(obj2):
|
||||||
|
return obj2.y1 < obj1.y0
|
||||||
|
elif obj1.is_voverlap(obj2):
|
||||||
|
return obj1.x1 < obj2.x0
|
||||||
|
else:
|
||||||
|
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||||
|
return tsort(boxes, horder)
|
||||||
|
|
||||||
|
def analyze_layout_vertical(self, objs, laparams):
|
||||||
|
|
||||||
|
def valign(obj1, obj2):
|
||||||
|
# +------+
|
||||||
|
# | obj1 |
|
||||||
|
# | |
|
||||||
|
# +------+ - - -
|
||||||
|
# | | | (char_margin)
|
||||||
|
# +------+ - -
|
||||||
|
# | obj2 |
|
||||||
|
# | |
|
||||||
|
# +------+
|
||||||
|
#
|
||||||
|
# |<--->|
|
||||||
|
# (line_overlap)
|
||||||
|
#
|
||||||
|
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
|
||||||
|
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
line = []
|
||||||
|
prev = None
|
||||||
|
for cur in objs:
|
||||||
|
if prev is not None and not valign(prev, cur):
|
||||||
|
if line:
|
||||||
|
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
||||||
|
line = []
|
||||||
|
line.append(cur)
|
||||||
|
prev = cur
|
||||||
|
if line:
|
||||||
|
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
||||||
|
boxes = build_clusters(LTTextBoxVertical, lines, (laparams.line_margin, 0))
|
||||||
|
|
||||||
|
def vorder(obj1, obj2):
|
||||||
|
if obj1.is_voverlap(obj2):
|
||||||
|
return obj2.y1 < obj1.y0
|
||||||
|
elif obj1.is_hoverlap(obj2):
|
||||||
|
return obj1.x1 < obj2.x0
|
||||||
|
else:
|
||||||
|
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||||
|
return tsort(boxes, vorder)
|
||||||
|
|
Loading…
Reference in New Issue