layout analysis improved

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@245 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-10-17 05:13:39 +00:00
parent bc1303e901
commit 3305c07ba2
5 changed files with 97 additions and 80 deletions

View File

@ -121,7 +121,7 @@ class FileCMap(CMap):
return '<CMap: %s>' % self.attrs.get('CMapName') return '<CMap: %s>' % self.attrs.get('CMapName')
def is_vertical(self): def is_vertical(self):
return self.attrs.get('WMode', 0) return self.attrs.get('WMode', 0) != 0
def set_attr(self, k, v): def set_attr(self, k, v):
self.attrs[k] = v self.attrs[k] = v

View File

@ -170,11 +170,11 @@ class TextConverter(PDFConverter):
def receive_layout(self, ltpage): def receive_layout(self, ltpage):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTContainer):
self.write(item.text)
elif isinstance(item, LTContainer):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTText):
self.write(item.get_text())
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write('\n') self.write('\n')
if self.showpageno: if self.showpageno:
@ -231,20 +231,21 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.write_text(item.text, item.x0, item.y1, item.get_size()) self.write_text(item.text, item.x0, item.y1, item.get_size())
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTPolygon): elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height) self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
if self.debug: if self.debug:
self.write_text(str(item.index+1), item.x0, item.y1, 20) self.write_text(str(item.index+1), item.x0, item.y1, 20)
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height) self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):

View File

@ -1,24 +1,10 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from sys import maxint as INF from utils import apply_matrix_pt, get_bound, INF
from utils import apply_matrix_pt
from utils import bsearch, bbox2str, matrix2str from utils import bsearch, bbox2str, matrix2str
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
## get_bounds
##
def get_bounds(pts):
    """Return the tightest axis-aligned box (x0, y0, x1, y1) around pts.

    pts is any iterable of (x, y) pairs; it is consumed exactly once.
    Every bound starts from the INF / -INF sentinel, so an empty iterable
    yields (INF, INF, -INF, -INF).
    """
    pairs = list(pts)
    xs = [x for (x, _) in pairs]
    ys = [y for (_, y) in pairs]
    # The sentinel is listed first so that it wins ties, which is what the
    # pairwise min(bound, value) / max(bound, value) form does as well.
    return (min([INF] + xs), min([INF] + ys),
            max([-INF] + xs), max([-INF] + ys))
def uniq(objs): def uniq(objs):
done = set() done = set()
for obj in objs: for obj in objs:
@ -39,7 +25,7 @@ class LAParams(object):
def __init__(self, def __init__(self,
writing_mode='lr-tb', writing_mode='lr-tb',
line_overlap=0.5, line_overlap=0.5,
char_margin=3.0, char_margin=2.0,
line_margin=0.5, line_margin=0.5,
word_margin=0.1, word_margin=0.1,
all_texts=False): all_texts=False):
@ -52,8 +38,8 @@ class LAParams(object):
return return
def __repr__(self): def __repr__(self):
return ('<LAParams: writing_mode=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' % return ('<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
(self.writing_mode, self.char_margin, self.line_margin, self.word_margin, self.all_texts)) (self.char_margin, self.line_margin, self.word_margin, self.all_texts))
## LTItem ## LTItem
@ -65,7 +51,8 @@ class LTItem(object):
return return
def __repr__(self): def __repr__(self):
return ('<item bbox=%s>' % bbox2str(self.bbox)) return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0) if x1 < x0: (x0,x1) = (x1,x0)
@ -123,7 +110,7 @@ class LTPolygon(LTItem):
def __init__(self, linewidth, pts): def __init__(self, linewidth, pts):
self.pts = pts self.pts = pts
self.linewidth = linewidth self.linewidth = linewidth
LTItem.__init__(self, get_bounds(pts)) LTItem.__init__(self, get_bound(pts))
return return
def get_pts(self): def get_pts(self):
@ -167,7 +154,9 @@ class LTImage(LTItem):
def __repr__(self): def __repr__(self):
(w,h) = self.srcsize (w,h) = self.srcsize
return '<image %s %dx%d>' % (self.name, w, h) return ('<%s(%s) %s %dx%d>' %
(self.__class__.__name__, self.name,
bbox2str(self.bbox), w, h))
## LTText ## LTText
@ -179,10 +168,11 @@ class LTText(object):
return return
def __repr__(self): def __repr__(self):
return '<text %r>' % self.text return ('<%s %r>' %
(self.__class__.__name__, self.get_text()))
def is_upright(self): def get_text(self):
return True return self.text
## LTAnon ## LTAnon
@ -239,20 +229,18 @@ class LTChar(LTItem, LTText):
def __repr__(self): def __repr__(self):
if self.debug: if self.debug:
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' %
(matrix2str(self.matrix), self.font, self.fontsize, (self.__class__.__name__, bbox2str(self.bbox),
bbox2str(self.bbox), self.adv, self.text)) matrix2str(self.matrix), self.font, self.fontsize,
self.adv, self.get_text()))
else: else:
return '<char %r>' % self.text return '<char %r>' % self.text
def get_size(self): def get_size(self):
return max(self.width, self.height) return max(self.width, self.height)
def is_vertical(self): def is_compatible(self, obj):
return self.font.is_vertical() return True
def is_upright(self):
return self.upright
## LTContainer ## LTContainer
@ -267,9 +255,6 @@ class LTContainer(LTItem):
self._objs = [] self._objs = []
return return
def __repr__(self):
return ('<container %s>' % bbox2str(self.bbox))
def __iter__(self): def __iter__(self):
return iter(self.get_objs()) return iter(self.get_objs())
@ -302,15 +287,17 @@ class LTContainer(LTItem):
## LTTextLine ## LTTextLine
## ##
class LTTextLine(LTContainer): class LTTextLine(LTContainer, LTText):
def __init__(self, word_margin=0): def __init__(self, laparams=None):
self.word_margin = word_margin self.laparams = laparams
LTContainer.__init__(self) LTContainer.__init__(self)
return return
def __repr__(self): def __repr__(self):
return ('<textline %s>' % bbox2str(self.bbox)) return ('<%s %s %r>' %
(self.__class__.__name__, bbox2str(self.bbox),
self.get_text()))
def get_text(self): def get_text(self):
return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) ) return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) )
@ -320,14 +307,16 @@ class LTTextLine(LTContainer):
class LTTextLineHorizontal(LTTextLine): class LTTextLineHorizontal(LTTextLine):
def __repr__(self):
return ('<textline-h %s>' % bbox2str(self.bbox))
def get_objs(self): def get_objs(self):
if self.laparams is None:
for obj in self._objs:
yield obj
return
word_margin = self.laparams.word_margin
x1 = INF x1 = INF
for obj in csort(self._objs, key=lambda obj: obj.x0): for obj in csort(self._objs, key=lambda obj: obj.x0):
if isinstance(obj, LTChar) and self.word_margin: if isinstance(obj, LTChar) and word_margin:
margin = self.word_margin * obj.width margin = word_margin * obj.width
if x1 < obj.x0-margin: if x1 < obj.x0-margin:
yield LTAnon(' ') yield LTAnon(' ')
yield obj yield obj
@ -342,14 +331,16 @@ class LTTextLineHorizontal(LTTextLine):
class LTTextLineVertical(LTTextLine): class LTTextLineVertical(LTTextLine):
def __repr__(self):
return ('<textline-v %s>' % bbox2str(self.bbox))
def get_objs(self): def get_objs(self):
if self.laparams is None:
for obj in self._objs:
yield obj
return
word_margin = self.laparams.word_margin
y0 = -INF y0 = -INF
for obj in csort(self._objs, key=lambda obj: -obj.y1): for obj in csort(self._objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTChar) and self.word_margin: if isinstance(obj, LTChar) and word_margin:
margin = self.word_margin * obj.height margin = word_margin * obj.height
if obj.y1+margin < y0: if obj.y1+margin < y0:
yield LTAnon(' ') yield LTAnon(' ')
yield obj yield obj
@ -376,7 +367,9 @@ class LTTextBox(LTContainer):
return return
def __repr__(self): def __repr__(self):
return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20])) return ('<%s(%s) %s %r...>' %
(self.__class__.__name__, self.index,
bbox2str(self.bbox), self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) )
@ -489,7 +482,7 @@ class LTAnalyzer(LTContainer):
textobjs = [] textobjs = []
otherobjs = [] otherobjs = []
for obj in objs: for obj in objs:
if isinstance(obj, LTText) and obj.is_upright(): if isinstance(obj, LTChar):
textobjs.append(obj) textobjs.append(obj)
else: else:
otherobjs.append(obj) otherobjs.append(obj)
@ -499,11 +492,9 @@ class LTAnalyzer(LTContainer):
obj0 = None obj0 = None
line = None line = None
for obj1 in objs: for obj1 in objs:
if obj0 is None: if obj0 is not None:
obj0 = obj1
else:
k = 0 k = 0
if (obj0.is_voverlap(obj1) and if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin): obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin):
# obj0 and obj1 are horizontally aligned: # obj0 and obj1 are horizontally aligned:
@ -517,7 +508,7 @@ class LTAnalyzer(LTContainer):
# |<--->| # |<--->|
# (char_margin) # (char_margin)
k |= 1 k |= 1
if (obj0.is_hoverlap(obj1) and if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin): obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin):
# obj0 and obj1 are vertically aligned: # obj0 and obj1 are vertically aligned:
@ -538,21 +529,28 @@ class LTAnalyzer(LTContainer):
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical)) ): (k & 2 and isinstance(line, LTTextLineVertical)) ):
line.add(obj1) line.add(obj1)
elif line is None: elif line is not None:
if k == 2: line.fixate()
line = LTTextLineVertical(laparams.word_margin) yield line
line = None
else: else:
line = LTTextLineHorizontal(laparams.word_margin) if k == 2:
line = LTTextLineVertical(laparams)
line.add(obj0)
line.add(obj1)
elif k == 1:
line = LTTextLineHorizontal(laparams)
line.add(obj0) line.add(obj0)
line.add(obj1) line.add(obj1)
else: else:
line = LTTextLineHorizontal(laparams)
line.add(obj0)
line.fixate() line.fixate()
yield line yield line
line = None line = None
obj0 = obj1 obj0 = obj1
if line is None: if line is None:
line = LTTextLineHorizontal(laparams.word_margin) line = LTTextLineHorizontal(laparams)
if obj0 is not None:
line.add(obj0) line.add(obj0)
line.fixate() line.fixate()
yield line yield line
@ -633,14 +631,15 @@ class LTFigure(LTAnalyzer):
self.name = name self.name = name
self.matrix = matrix self.matrix = matrix
(x,y,w,h) = bbox (x,y,w,h) = bbox
bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) bbox = get_bound( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
LTAnalyzer.__init__(self, bbox=bbox) LTAnalyzer.__init__(self, bbox=bbox)
return return
def __repr__(self): def __repr__(self):
return ('<figure %r bbox=%s matrix=%s>' % return ('<%s(%s) %s matrix=%s>' %
(self.name, bbox2str(self.bbox), matrix2str(self.matrix))) (self.__class__.__name__, self.name,
bbox2str(self.bbox), matrix2str(self.matrix)))
def analyze(self, laparams=None): def analyze(self, laparams=None):
if laparams is not None and laparams.all_texts: if laparams is not None and laparams.all_texts:
@ -660,4 +659,6 @@ class LTPage(LTAnalyzer):
return return
def __repr__(self): def __repr__(self):
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate)) return ('<%s(%r) %s rotate=%r>' %
(self.__class__.__name__, self.pageid,
bbox2str(self.bbox), self.rotate))

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
from sys import maxint as INF
from struct import pack, unpack from struct import pack, unpack
@ -28,6 +29,17 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
## Utility functions ## Utility functions
## ##
# get_bound
def get_bound(pts):
    '''Compute the minimal rectangle (x0, y0, x1, y1) covering all points.

    pts is any iterable of (x, y) pairs and is walked exactly once, so a
    generator is acceptable.  With no points at all the untouched
    sentinels are returned: (INF, INF, -INF, -INF).
    '''
    left = bottom = INF
    right = top = -INF
    for (px, py) in pts:
        # Strict comparisons keep the current bound on ties.
        if px < left:
            left = px
        if py < bottom:
            bottom = py
        if px > right:
            right = px
        if py > top:
            top = py
    return (left, bottom, right, top)
# pick # pick
def pick(seq, func, maxobj=None): def pick(seq, func, maxobj=None):
'''Picks the object obj where func(obj) has the highest value.''' '''Picks the object obj where func(obj) has the highest value.'''

View File

@ -48,6 +48,9 @@ XMLS_NONFREE= \
nonfree/naacl06-shinyama.xml \ nonfree/naacl06-shinyama.xml \
nonfree/nlp2004slides.xml nonfree/nlp2004slides.xml
# all: re-invoke make on the `test` target with CMP overridden to `cmp`.
# NOTE(review): presumably CMP selects the command used to compare generated
# output with the reference files -- confirm against the rules that use $(CMP).
all:
$(MAKE) test CMP=cmp
test: htmls texts xmls test: htmls texts xmls
clean: clean: