git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@122 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-07-21 16:06:50 +00:00
parent 9093c340af
commit 57025ee632
3 changed files with 33 additions and 15 deletions

View File

@ -224,7 +224,7 @@ class SGMLConverter(PDFConverter):
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n', item.text) self.outfp.write('<text>%s</text>\n' % item.text)
else: else:
assert 0, item assert 0, item
return return

View File

@ -10,10 +10,12 @@ class LAParams(object):
def __init__(self, def __init__(self,
direction=None, direction=None,
line_overlap=0.5,
char_margin=1.0, char_margin=1.0,
line_margin=0.5, line_margin=0.5,
word_margin=0.1): word_margin=0.1):
self.direction = direction self.direction = direction
self.line_overlap = line_overlap
self.char_margin = char_margin self.char_margin = char_margin
self.line_margin = line_margin self.line_margin = line_margin
self.word_margin = word_margin self.word_margin = word_margin
@ -92,7 +94,7 @@ class ClusterSet(object):
return list(r) return list(r)
@classmethod @classmethod
def build(klass, objs, hratio, vratio, objtype): def build(klass, objs, hratio, vratio, objtype, func=None):
plane = Plane(objs) plane = Plane(objs)
cset = ClusterSet(objtype) cset = ClusterSet(objtype)
for obj in objs: for obj in objs:
@ -101,6 +103,8 @@ class ClusterSet(object):
vmargin = vratio * margin vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
assert obj in neighbors, obj assert obj in neighbors, obj
if func:
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
cset.add(neighbors) cset.add(neighbors)
return cset.finish() return cset.finish()
@ -450,10 +454,8 @@ class LTPage(LayoutContainer):
else: else:
otherobjs.append(obj) otherobjs.append(obj)
if laparams.direction == 'V': if laparams.direction == 'V':
lines = ClusterSet.build(textobjs, 0, laparams.char_margin, def vline(obj1, obj2):
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin))) return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
def vorder(obj1, obj2): def vorder(obj1, obj2):
if obj1.voverlap(obj2): if obj1.voverlap(obj2):
return obj2.x1 < obj1.x1 return obj2.x1 < obj1.x1
@ -461,12 +463,15 @@ class LTPage(LayoutContainer):
return obj2.y1 < obj1.y1 return obj2.y1 < obj1.y1
else: else:
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1 return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
vline)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
boxes = tsort(boxes, vorder) boxes = tsort(boxes, vorder)
else: else:
lines = ClusterSet.build(textobjs, laparams.char_margin, 0, def hline(obj1, obj2):
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin))) return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
def horder(obj1, obj2): def horder(obj1, obj2):
if obj1.hoverlap(obj2): if obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1 return obj2.y1 < obj1.y1
@ -474,6 +479,11 @@ class LTPage(LayoutContainer):
return obj1.x1 < obj2.x0 return obj1.x1 < obj2.x0
else: else:
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0 return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
boxes = tsort(boxes, horder) boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes self.objs = otherobjs + boxes
return return

View File

@ -25,17 +25,25 @@ TEXTS= \
naacl06-shinyama.txt \ naacl06-shinyama.txt \
nlp2004slides.txt nlp2004slides.txt
SGMLS= \
simple1.sgml \
simple2.sgml \
dmca.sgml \
f1040nr.sgml \
i1040nr.sgml \
jo.sgml \
kampo.sgml \
naacl06-shinyama.sgml \
nlp2004slides.sgml
all: all:
clean: clean:
-rm $(HTMLS) -rm $(HTMLS)
-rm $(TEXTS) -rm $(TEXTS)
-rm $(SGMLS)
test: htmls texts test: $(HTMLS) $(TEXTS) $(SGMLS)
htmls: $(HTMLS)
texts: $(TEXTS)
.SUFFIXES: .pdf .html .sgml .txt .SUFFIXES: .pdf .html .sgml .txt
.pdf.html: .pdf.html: