layout analysis changed drastically.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@186 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-03-20 05:43:34 +00:00
parent 85c5476623
commit ffaaea0bac
3 changed files with 204 additions and 152 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Fri Mar 12 13:45:59 UTC 2010
Last Modified: Sat Mar 20 05:43:04 UTC 2010
<!-- hhmts end -->
</div>
@ -131,7 +131,7 @@ In order to handle CJK languages,
an additional data called <code>CMap</code> is required.
CMap files are not installed by default.
<p>
Here is the additional step you need:
Here is the additional step you need to take:
<blockquote><pre>
# <strong>make cmap</strong>
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
@ -348,6 +348,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2010/03/xx: Improved layout analysis.
<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
<li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -&gt; LTChar)
<li> 2010/02/15: Several bugfixes. Thanks to Sean.

View File

@ -290,8 +290,7 @@ class XMLConverter(PDFConverter):
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox)))
self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
render(child)
self.outfp.write('</textbox>\n')

View File

@ -19,6 +19,14 @@ def get_bounds(pts):
y1 = max(y1, y)
return (x0,y0,x1,y1)
def uniq(objs):
    """Lazily yield the items of objs in their original order, skipping repeats.

    Items must be hashable: a set keeps track of what was already yielded.
    """
    seen = set()
    for item in objs:
        if item not in seen:
            seen.add(item)
            yield item
## LAParams
##
@ -27,7 +35,7 @@ class LAParams(object):
def __init__(self,
direction=None,
line_overlap=0.5,
char_margin=1.0,
char_margin=3.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
@ -73,59 +81,52 @@ class Plane(object):
def find(self, (x0,y0,x1,y1)):
i0 = bsearch(self.xobjs, x0)[0]
i1 = bsearch(self.xobjs, x1)[1]
xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] )
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
i0 = bsearch(self.yobjs, y0)[0]
i1 = bsearch(self.yobjs, y1)[1]
yobjs = [pair[1] for pair in self.yobjs[i0:i1]]
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
xobjs.intersection_update(yobjs)
return xobjs
return list(xobjs)
## ClusterSet
## ClusterBuilder
##
class ClusterSet(object):
class ClusterBuilder(object):
def __init__(self, klass):
def __init__(self, groupfunc):
self.clusters = {}
self.klass = klass
self.i = 0
self.groupfunc = groupfunc
return
def __repr__(self):
return '<cset: %d>' % self.i
# add(objs): groups text objects if necessary.
def add(self, objs):
group = self.klass(self.i, objs)
self.i += 1
for obj in objs:
if obj in self.clusters:
group.merge(self.clusters[obj])
for obj in group:
self.clusters[obj] = group
# group(objs): groups given objects into one cluster.
def group(self, objs):
r = objs[:]
for obj1 in objs:
if obj1 in self.clusters:
r.extend(self.clusters.pop(obj1))
cluster = self.groupfunc(list(uniq(r)))
for obj in r:
self.clusters[obj] = cluster
return
# finish(): returns all the LTTextBoxes in a page.
# finish(): returns all the clusters.
def finish(self):
r = set(self.clusters.itervalues())
for group in r:
group.fixate()
return list(r)
clusters = set(self.clusters.itervalues())
for cluster in clusters:
cluster.fixate()
return list(clusters)
@classmethod
def build(klass, objs, hratio, vratio, objtype, func=None):
def build_clusters(groupfunc, objs, (hratio, vratio)):
plane = Plane(objs)
cset = ClusterSet(objtype)
builder = ClusterBuilder(groupfunc)
for obj in objs:
margin = obj.get_margin()
hmargin = hratio * margin
vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
assert obj in neighbors, obj
if func:
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
cset.add(neighbors)
return cset.finish()
builder.group(neighbors)
return builder.finish()
## LayoutItem
@ -153,58 +154,62 @@ class LayoutItem(object):
def is_hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return False
else:
return True
return obj.x0 <= self.x1 and self.x0 <= obj.x1
def hoverlap(self, obj):
def hdistance(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
if self.is_hoverlap(obj):
return 0
else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def hoverlap(self, obj):
    """Return a horizontal overlap measure between this item and obj.

    The result is 0 when is_hoverlap() reports no overlap; otherwise it is
    the smaller of the two cross-edge gaps |x0 - obj.x1| and |x1 - obj.x0|.
    """
    assert isinstance(obj, LayoutItem)
    if not self.is_hoverlap(obj):
        return 0
    gap_a = abs(self.x0 - obj.x1)
    gap_b = abs(self.x1 - obj.x0)
    return min(gap_a, gap_b)
def is_voverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
return False
else:
return True
return obj.y0 <= self.y1 and self.y0 <= obj.y1
def voverlap(self, obj):
def vdistance(self, obj):
assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
if self.is_voverlap(obj):
return 0
else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def voverlap(self, obj):
    """Return a vertical overlap measure between this item and obj.

    The result is 0 when is_voverlap() reports no overlap; otherwise it is
    the smaller of the two cross-edge gaps |y0 - obj.y1| and |y1 - obj.y0|.
    """
    assert isinstance(obj, LayoutItem)
    if not self.is_voverlap(obj):
        return 0
    gap_a = abs(self.y0 - obj.y1)
    gap_b = abs(self.y1 - obj.y0)
    return min(gap_a, gap_b)
# get_margin(): default margin for a layout item; always 0 here.
# build_clusters() multiplies this value by its (hratio, vratio) pair to
# size the neighbour search box, and LTTextLine overrides it.
def get_margin(self):
return 0
# get_weight(): default weight for a layout item; always 0 here.
# LayoutContainer overrides this to return its own self.weight.
def get_weight(self):
return 0
# get_direction(): default writing direction for a layout item; always None
# here (presumably meaning "no direction known" -- confirm against callers).
def get_direction(self):
return None
## LayoutContainer
##
class LayoutContainer(LayoutItem):
def __init__(self, id, bbox, objs=None):
def __init__(self, bbox, objs=None):
LayoutItem.__init__(self, bbox)
self.id = id
if objs:
self.objs = set(objs)
self.objs = objs[:]
else:
self.objs = set()
self.objs = []
self.weight = None
return
def __repr__(self):
return ('<group %s>' % bbox2str(self.bbox))
return ('<container %s>' % bbox2str(self.bbox))
def __iter__(self):
return iter(self.objs)
@ -213,11 +218,11 @@ class LayoutContainer(LayoutItem):
return len(self.objs)
def add(self, obj):
self.objs.add(obj)
self.objs.append(obj)
return
def merge(self, group):
self.objs.update(iter(group))
def merge(self, container):
self.objs.extend(container.objs)
return
# fixate(): determines its boundary and writing direction.
@ -236,9 +241,6 @@ class LayoutContainer(LayoutItem):
def get_weight(self):
return self.weight
def get_direction(self):
return None
## LTPolygon
##
@ -259,15 +261,6 @@ class LTPolygon(LayoutItem):
class LTLine(LTPolygon):
def __init__(self, linewidth, p0, p1):
(x0,y0) = p0
(x1,y1) = p0
self.direction = None
if y0 == y1:
# horizontal ruler
self.direction = 'H'
elif x0 == x1:
# vertical ruler
self.direction = 'V'
LTPolygon.__init__(self, linewidth, [p0, p1])
return
@ -397,8 +390,9 @@ class LTFigure(LayoutContainer):
(x,y,w,h) = bbox
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
self.id = id
self.matrix = matrix
LayoutContainer.__init__(self, id, bbox)
LayoutContainer.__init__(self, bbox)
return
def __repr__(self):
@ -410,47 +404,51 @@ class LTFigure(LayoutContainer):
##
class LTTextLine(LayoutContainer):
def __init__(self, id, objs, direction, word_margin):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = direction
self.word_margin = word_margin
def __init__(self, objs):
LayoutContainer.__init__(self, (0,0,0,0), objs)
return
def __repr__(self):
return ('<textline %s(%s)>' % (bbox2str(self.bbox), self.direction))
return ('<textline %s>' % bbox2str(self.bbox))
def get_margin(self):
return min(self.width, self.height)
def get_direction(self):
return self.direction
def get_text(self):
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
def fixate(self):
class LTTextLineHorizontal(LTTextLine):
def __init__(self, objs, word_margin):
LTTextLine.__init__(self, objs)
LayoutContainer.fixate(self)
objs = []
if self.direction == 'V':
y0 = -INF
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if obj.y1+margin < y0:
objs.append(LTAnon(' '))
objs.append(obj)
y0 = obj.y0
else:
x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if isinstance(obj, LTChar) and word_margin:
margin = word_margin * obj.get_margin()
if x1 < obj.x0-margin:
objs.append(LTAnon(' '))
objs.append(obj)
x1 = obj.x1
objs.append(LTAnon('\n'))
self.objs = objs
self.objs = objs + [LTAnon('\n')]
return
class LTTextLineVertical(LTTextLine):
    """A text line whose members are ordered by descending y1."""

    def __init__(self, objs, word_margin):
        """Build the line from objs and insert word separators.

        objs: the layout objects that make up this line.
        word_margin: factor applied to each LTChar's get_margin(); a falsy
            value disables space insertion entirely.

        LayoutContainer.fixate() is run first. The members are then walked
        in order of descending y1, and an LTAnon(' ') is inserted before an
        LTChar whose y1 plus the scaled margin is still below the previous
        member's y0. The stored list always ends with LTAnon('\\n').
        """
        LTTextLine.__init__(self, objs)
        LayoutContainer.fixate(self)
        ordered = sorted(self.objs, key=lambda obj: -obj.y1)
        result = []
        # -INF guarantees that no space is emitted before the first member.
        prev_y0 = -INF
        for item in ordered:
            if isinstance(item, LTChar) and word_margin:
                gap = word_margin * item.get_margin()
                if item.y1 + gap < prev_y0:
                    result.append(LTAnon(' '))
            result.append(item)
            prev_y0 = item.y0
        result.append(LTAnon('\n'))
        self.objs = result
        return
@ -461,27 +459,29 @@ class LTTextLine(LayoutContainer):
##
class LTTextBox(LayoutContainer):
def __init__(self, id, objs, direction):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = direction
def __init__(self, objs):
LayoutContainer.__init__(self, (0,0,0,0), objs)
return
def __repr__(self):
return ('<textbox %s(%s) %r...>' % (bbox2str(self.bbox), self.direction, self.get_text()[:20]))
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
class LTTextBoxHorizontal(LTTextBox):
def fixate(self):
LayoutContainer.fixate(self)
if self.direction == 'V':
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
LTTextBox.fixate(self)
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return
def get_direction(self):
return self.direction
class LTTextBoxVertical(LTTextBox):
    """A text box whose children are kept in order of descending x1."""

    def fixate(self):
        """Run LTTextBox.fixate(), then re-sort the children by descending x1."""
        LTTextBox.fixate(self)

        def neg_x1(child):
            return -child.x1

        self.objs = sorted(self.objs, key=neg_x1)
        return
def tsort(objs, f):
@ -518,7 +518,8 @@ def tsort(objs, f):
class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0):
LayoutContainer.__init__(self, id, bbox)
LayoutContainer.__init__(self, bbox)
self.id = id
self.rotate = rotate
return
@ -534,24 +535,40 @@ class LTPage(LayoutContainer):
else:
otherobjs.append(obj)
if laparams.direction == 'V':
def vline(obj1, obj2):
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
def vorder(obj1, obj2):
if obj1.is_voverlap(obj2):
return obj2.x1 < obj1.x0
elif obj1.is_hoverlap(obj2):
return obj2.y1 < obj1.y0
textobjs = self.analyze_layout_vertical(textobjs, laparams)
else:
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
vline)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
boxes = tsort(boxes, vorder)
else:
def hline(obj1, obj2):
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
textobjs = self.analyze_layout_horizontal(textobjs, laparams)
self.objs = textobjs + otherobjs
return
def analyze_layout_horizontal(self, objs, laparams):
def halign(obj1, obj2):
# +------+ - - -
# | obj1 | - - +------+ -
# | | | obj2 | | (line_overlap)
# +------+ - - | | -
# - - - +------+
#
# |<--->|
# (char_margin)
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
lines = []
line = []
prev = None
for cur in objs:
if prev is not None and not halign(prev, cur):
if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
line = []
line.append(cur)
prev = cur
if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
boxes = build_clusters(LTTextBoxHorizontal, lines, (0, laparams.line_margin))
def horder(obj1, obj2):
if obj1.is_hoverlap(obj2):
return obj2.y1 < obj1.y0
@ -559,11 +576,46 @@ class LTPage(LayoutContainer):
return obj1.x1 < obj2.x0
else:
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return
return tsort(boxes, horder)
def analyze_layout_vertical(self, objs, laparams):
    """Group text objects into vertical lines and boxes and order the boxes.

    objs: text objects, consumed in the order given; only *consecutive*
        objects are tested for belonging to the same line.
    laparams: supplies line_overlap, char_margin, word_margin and
        line_margin.

    Returns whatever tsort() produces for the LTTextBoxVertical clusters
    built from the detected lines.
    """
    def valign(obj1, obj2):
        # +------+
        # | obj1 |
        # |      |
        # +------+ - - -
        #   |    |       | (char_margin)
        # +------+ - -
        # | obj2 |
        # |      |
        # +------+
        #
        # |<--->|
        # (line_overlap)
        #
        return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
                (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))

    lines = []
    line = []
    for cur in objs:
        # The last member of the current line is the "previous" object;
        # start a new line as soon as cur no longer aligns with it.
        if line and not valign(line[-1], cur):
            lines.append(LTTextLineVertical(line, laparams.word_margin))
            line = []
        line.append(cur)
    if line:
        lines.append(LTTextLineVertical(line, laparams.word_margin))

    boxes = build_clusters(LTTextBoxVertical, lines, (laparams.line_margin, 0))

    def vorder(obj1, obj2):
        # Ordering predicate handed to tsort(); assumed, as with horder, to
        # mean "obj1 comes before obj2" -- TODO confirm against tsort().
        #
        # Each overlap test must be paired with a comparison on the *other*
        # axis: when the boxes overlap vertically, obj2.y1 < obj1.y0 can
        # never hold (and likewise for x when they overlap horizontally),
        # so comparing on the same axis would make the branch constant False.
        # The horizontal comparison puts the box with the larger x first,
        # matching LTTextBoxVertical's descending-x1 ordering.
        if obj1.is_voverlap(obj2):
            return obj2.x1 < obj1.x0
        elif obj1.is_hoverlap(obj2):
            return obj2.y1 < obj1.y0
        else:
            return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0

    return tsort(boxes, vorder)