git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@122 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-07-21 16:06:50 +00:00
parent 9093c340af
commit 57025ee632
3 changed files with 33 additions and 15 deletions

View File

@ -224,7 +224,7 @@ class SGMLConverter(PDFConverter):
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n', item.text)
self.outfp.write('<text>%s</text>\n' % item.text)
else:
assert 0, item
return

View File

@ -10,10 +10,12 @@ class LAParams(object):
def __init__(self,
direction=None,
line_overlap=0.5,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
self.line_overlap = line_overlap
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
@ -92,7 +94,7 @@ class ClusterSet(object):
return list(r)
@classmethod
def build(klass, objs, hratio, vratio, objtype):
def build(klass, objs, hratio, vratio, objtype, func=None):
plane = Plane(objs)
cset = ClusterSet(objtype)
for obj in objs:
@ -101,6 +103,8 @@ class ClusterSet(object):
vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
assert obj in neighbors, obj
if func:
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
cset.add(neighbors)
return cset.finish()
@ -450,10 +454,8 @@ class LTPage(LayoutContainer):
else:
otherobjs.append(obj)
if laparams.direction == 'V':
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
def vline(obj1, obj2):
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
def vorder(obj1, obj2):
if obj1.voverlap(obj2):
return obj2.x1 < obj1.x1
@ -461,12 +463,15 @@ class LTPage(LayoutContainer):
return obj2.y1 < obj1.y1
else:
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
vline)
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
boxes = tsort(boxes, vorder)
else:
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
def hline(obj1, obj2):
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
def horder(obj1, obj2):
if obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1
@ -474,6 +479,11 @@ class LTPage(LayoutContainer):
return obj1.x1 < obj2.x0
else:
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return

View File

@ -25,17 +25,25 @@ TEXTS= \
naacl06-shinyama.txt \
nlp2004slides.txt
SGMLS= \
simple1.sgml \
simple2.sgml \
dmca.sgml \
f1040nr.sgml \
i1040nr.sgml \
jo.sgml \
kampo.sgml \
naacl06-shinyama.sgml \
nlp2004slides.sgml
all:
clean:
-rm $(HTMLS)
-rm $(TEXTS)
-rm $(SGMLS)
test: htmls texts
htmls: $(HTMLS)
texts: $(TEXTS)
test: $(HTMLS) $(TEXTS) $(SGMLS)
.SUFFIXES: .pdf .html .sgml .txt
.pdf.html: