layout analysis changed drastically.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@186 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-03-20 05:43:34 +00:00
parent 85c5476623
commit ffaaea0bac
3 changed files with 204 additions and 152 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Fri Mar 12 13:45:59 UTC 2010 Last Modified: Sat Mar 20 05:43:04 UTC 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -131,7 +131,7 @@ In order to handle CJK languages,
additional data called <code>CMap</code> is required. additional data called <code>CMap</code> is required.
CMap files are not installed by default. CMap files are not installed by default.
<p> <p>
Here is the additional step you need: Here is the additional step you need to take:
<blockquote><pre> <blockquote><pre>
# <strong>make cmap</strong> # <strong>make cmap</strong>
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5 python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
@ -348,6 +348,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2010/03/xx: Improved layout analysis.
<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield. <li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
<li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -&gt; LTChar) <li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -&gt; LTChar)
<li> 2010/02/15: Several bugfixes. Thanks to Sean. <li> 2010/02/15: Several bugfixes. Thanks to Sean.

View File

@ -290,8 +290,7 @@ class XMLConverter(PDFConverter):
render(child) render(child)
self.outfp.write('</textline>\n') self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox))
(item.id, bbox2str(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')

View File

@ -19,6 +19,14 @@ def get_bounds(pts):
y1 = max(y1, y) y1 = max(y1, y)
return (x0,y0,x1,y1) return (x0,y0,x1,y1)
def uniq(objs):
done = set()
for obj in objs:
if obj in done: continue
done.add(obj)
yield obj
return
## LAParams ## LAParams
## ##
@ -27,7 +35,7 @@ class LAParams(object):
def __init__(self, def __init__(self,
direction=None, direction=None,
line_overlap=0.5, line_overlap=0.5,
char_margin=1.0, char_margin=3.0,
line_margin=0.5, line_margin=0.5,
word_margin=0.1): word_margin=0.1):
self.direction = direction self.direction = direction
@ -73,59 +81,52 @@ class Plane(object):
def find(self, (x0,y0,x1,y1)): def find(self, (x0,y0,x1,y1)):
i0 = bsearch(self.xobjs, x0)[0] i0 = bsearch(self.xobjs, x0)[0]
i1 = bsearch(self.xobjs, x1)[1] i1 = bsearch(self.xobjs, x1)[1]
xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] ) xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
i0 = bsearch(self.yobjs, y0)[0] i0 = bsearch(self.yobjs, y0)[0]
i1 = bsearch(self.yobjs, y1)[1] i1 = bsearch(self.yobjs, y1)[1]
yobjs = [pair[1] for pair in self.yobjs[i0:i1]] yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
xobjs.intersection_update(yobjs) xobjs.intersection_update(yobjs)
return xobjs return list(xobjs)
## ClusterSet ## ClusterBuilder
## ##
class ClusterSet(object): class ClusterBuilder(object):
def __init__(self, klass): def __init__(self, groupfunc):
self.clusters = {} self.clusters = {}
self.klass = klass self.groupfunc = groupfunc
self.i = 0
return return
def __repr__(self): # group(objs): groups given objects into one cluster.
return '<cset: %d>' % self.i def group(self, objs):
r = objs[:]
# add(objs): groups text objects if necessary. for obj1 in objs:
def add(self, objs): if obj1 in self.clusters:
group = self.klass(self.i, objs) r.extend(self.clusters.pop(obj1))
self.i += 1 cluster = self.groupfunc(list(uniq(r)))
for obj in objs: for obj in r:
if obj in self.clusters: self.clusters[obj] = cluster
group.merge(self.clusters[obj])
for obj in group:
self.clusters[obj] = group
return return
# finish(): returns all the LTTextBoxes in a page. # finish(): returns all the clusters.
def finish(self): def finish(self):
r = set(self.clusters.itervalues()) clusters = set(self.clusters.itervalues())
for group in r: for cluster in clusters:
group.fixate() cluster.fixate()
return list(r) return list(clusters)
@classmethod def build_clusters(groupfunc, objs, (hratio, vratio)):
def build(klass, objs, hratio, vratio, objtype, func=None): plane = Plane(objs)
plane = Plane(objs) builder = ClusterBuilder(groupfunc)
cset = ClusterSet(objtype) for obj in objs:
for obj in objs: margin = obj.get_margin()
margin = obj.get_margin() hmargin = hratio * margin
hmargin = hratio * margin vmargin = vratio * margin
vmargin = vratio * margin neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) assert obj in neighbors, obj
assert obj in neighbors, obj builder.group(neighbors)
if func: return builder.finish()
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
cset.add(neighbors)
return cset.finish()
## LayoutItem ## LayoutItem
@ -153,58 +154,62 @@ class LayoutItem(object):
def is_hoverlap(self, obj): def is_hoverlap(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0: return obj.x0 <= self.x1 and self.x0 <= obj.x1
return False
else:
return True
def hoverlap(self, obj): def hdistance(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0: if self.is_hoverlap(obj):
return 0 return 0
else: else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.is_hoverlap(obj):
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
else:
return 0
def is_voverlap(self, obj): def is_voverlap(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0: return obj.y0 <= self.y1 and self.y0 <= obj.y1
return False
else:
return True
def voverlap(self, obj): def vdistance(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0: if self.is_voverlap(obj):
return 0 return 0
else: else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def voverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.is_voverlap(obj):
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
else:
return 0
def get_margin(self): def get_margin(self):
return 0 return 0
def get_weight(self): def get_weight(self):
return 0 return 0
def get_direction(self):
return None
## LayoutContainer ## LayoutContainer
## ##
class LayoutContainer(LayoutItem): class LayoutContainer(LayoutItem):
def __init__(self, id, bbox, objs=None): def __init__(self, bbox, objs=None):
LayoutItem.__init__(self, bbox) LayoutItem.__init__(self, bbox)
self.id = id
if objs: if objs:
self.objs = set(objs) self.objs = objs[:]
else: else:
self.objs = set() self.objs = []
self.weight = None self.weight = None
return return
def __repr__(self): def __repr__(self):
return ('<group %s>' % bbox2str(self.bbox)) return ('<container %s>' % bbox2str(self.bbox))
def __iter__(self): def __iter__(self):
return iter(self.objs) return iter(self.objs)
@ -213,11 +218,11 @@ class LayoutContainer(LayoutItem):
return len(self.objs) return len(self.objs)
def add(self, obj): def add(self, obj):
self.objs.add(obj) self.objs.append(obj)
return return
def merge(self, group): def merge(self, container):
self.objs.update(iter(group)) self.objs.extend(container.objs)
return return
# fixate(): determines its boundary and writing direction. # fixate(): determines its boundary and writing direction.
@ -236,9 +241,6 @@ class LayoutContainer(LayoutItem):
def get_weight(self): def get_weight(self):
return self.weight return self.weight
def get_direction(self):
return None
## LTPolygon ## LTPolygon
## ##
@ -259,15 +261,6 @@ class LTPolygon(LayoutItem):
class LTLine(LTPolygon): class LTLine(LTPolygon):
def __init__(self, linewidth, p0, p1): def __init__(self, linewidth, p0, p1):
(x0,y0) = p0
(x1,y1) = p0
self.direction = None
if y0 == y1:
# horizontal ruler
self.direction = 'H'
elif x0 == x1:
# vertical ruler
self.direction = 'V'
LTPolygon.__init__(self, linewidth, [p0, p1]) LTPolygon.__init__(self, linewidth, [p0, p1])
return return
@ -397,8 +390,9 @@ class LTFigure(LayoutContainer):
(x,y,w,h) = bbox (x,y,w,h) = bbox
bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
self.id = id
self.matrix = matrix self.matrix = matrix
LayoutContainer.__init__(self, id, bbox) LayoutContainer.__init__(self, bbox)
return return
def __repr__(self): def __repr__(self):
@ -410,47 +404,51 @@ class LTFigure(LayoutContainer):
## ##
class LTTextLine(LayoutContainer): class LTTextLine(LayoutContainer):
def __init__(self, id, objs, direction, word_margin): def __init__(self, objs):
LayoutContainer.__init__(self, id, (0,0,0,0), objs) LayoutContainer.__init__(self, (0,0,0,0), objs)
self.direction = direction
self.word_margin = word_margin
return return
def __repr__(self): def __repr__(self):
return ('<textline %s(%s)>' % (bbox2str(self.bbox), self.direction)) return ('<textline %s>' % bbox2str(self.bbox))
def get_margin(self): def get_margin(self):
return min(self.width, self.height) return min(self.width, self.height)
def get_direction(self):
return self.direction
def get_text(self): def get_text(self):
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) ) return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
def fixate(self): class LTTextLineHorizontal(LTTextLine):
def __init__(self, objs, word_margin):
LTTextLine.__init__(self, objs)
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
objs = [] objs = []
if self.direction == 'V': x1 = INF
y0 = -INF for obj in sorted(self.objs, key=lambda obj: obj.x0):
for obj in sorted(self.objs, key=lambda obj: -obj.y1): if isinstance(obj, LTChar) and word_margin:
if isinstance(obj, LTChar) and self.word_margin: margin = word_margin * obj.get_margin()
margin = self.word_margin * obj.get_margin() if x1 < obj.x0-margin:
if obj.y1+margin < y0: objs.append(LTAnon(' '))
objs.append(LTAnon(' ')) objs.append(obj)
objs.append(obj) x1 = obj.x1
y0 = obj.y0 self.objs = objs + [LTAnon('\n')]
else: return
x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0): class LTTextLineVertical(LTTextLine):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin() def __init__(self, objs, word_margin):
if x1 < obj.x0-margin: LTTextLine.__init__(self, objs)
objs.append(LTAnon(' ')) LayoutContainer.fixate(self)
objs.append(obj) objs = []
x1 = obj.x1 y0 = -INF
objs.append(LTAnon('\n')) for obj in sorted(self.objs, key=lambda obj: -obj.y1):
self.objs = objs if isinstance(obj, LTChar) and word_margin:
margin = word_margin * obj.get_margin()
if obj.y1+margin < y0:
objs.append(LTAnon(' '))
objs.append(obj)
y0 = obj.y0
self.objs = objs + [LTAnon('\n')]
return return
@ -461,27 +459,29 @@ class LTTextLine(LayoutContainer):
## ##
class LTTextBox(LayoutContainer): class LTTextBox(LayoutContainer):
def __init__(self, id, objs, direction): def __init__(self, objs):
LayoutContainer.__init__(self, id, (0,0,0,0), objs) LayoutContainer.__init__(self, (0,0,0,0), objs)
self.direction = direction
return return
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s) %r...>' % (bbox2str(self.bbox), self.direction, self.get_text()[:20])) return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
class LTTextBoxHorizontal(LTTextBox):
def fixate(self): def fixate(self):
LayoutContainer.fixate(self) LTTextBox.fixate(self)
if self.direction == 'V': self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return return
def get_direction(self): class LTTextBoxVertical(LTTextBox):
return self.direction
def fixate(self):
LTTextBox.fixate(self)
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
return
def tsort(objs, f): def tsort(objs, f):
@ -518,7 +518,8 @@ def tsort(objs, f):
class LTPage(LayoutContainer): class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0): def __init__(self, id, bbox, rotate=0):
LayoutContainer.__init__(self, id, bbox) LayoutContainer.__init__(self, bbox)
self.id = id
self.rotate = rotate self.rotate = rotate
return return
@ -534,36 +535,87 @@ class LTPage(LayoutContainer):
else: else:
otherobjs.append(obj) otherobjs.append(obj)
if laparams.direction == 'V': if laparams.direction == 'V':
def vline(obj1, obj2): textobjs = self.analyze_layout_vertical(textobjs, laparams)
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
def vorder(obj1, obj2):
if obj1.is_voverlap(obj2):
return obj2.x1 < obj1.x0
elif obj1.is_hoverlap(obj2):
return obj2.y1 < obj1.y0
else:
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
vline)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
boxes = tsort(boxes, vorder)
else: else:
def hline(obj1, obj2): textobjs = self.analyze_layout_horizontal(textobjs, laparams)
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2) self.objs = textobjs + otherobjs
def horder(obj1, obj2):
if obj1.is_hoverlap(obj2):
return obj2.y1 < obj1.y0
elif obj1.is_voverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return return
def analyze_layout_horizontal(self, objs, laparams):
def halign(obj1, obj2):
# +------+ - - -
# | obj1 | - - +------+ -
# | | | obj2 | | (line_overlap)
# +------+ - - | | -
# - - - +------+
#
# |<--->|
# (char_margin)
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
lines = []
line = []
prev = None
for cur in objs:
if prev is not None and not halign(prev, cur):
if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
line = []
line.append(cur)
prev = cur
if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
boxes = build_clusters(LTTextBoxHorizontal, lines, (0, laparams.line_margin))
def horder(obj1, obj2):
if obj1.is_hoverlap(obj2):
return obj2.y1 < obj1.y0
elif obj1.is_voverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
return tsort(boxes, horder)
def analyze_layout_vertical(self, objs, laparams):
def valign(obj1, obj2):
# +------+
# | obj1 |
# | |
# +------+ - - -
# | | | (char_margin)
# +------+ - -
# | obj2 |
# | |
# +------+
#
# |<--->|
# (line_overlap)
#
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
lines = []
line = []
prev = None
for cur in objs:
if prev is not None and not valign(prev, cur):
if line:
lines.append(LTTextLineVertical(line, laparams.word_margin))
line = []
line.append(cur)
prev = cur
if line:
lines.append(LTTextLineVertical(line, laparams.word_margin))
boxes = build_clusters(LTTextBoxVertical, lines, (laparams.line_margin, 0))
def vorder(obj1, obj2):
if obj1.is_voverlap(obj2):
return obj2.y1 < obj1.y0
elif obj1.is_hoverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
return tsort(boxes, vorder)