git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@122 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
9093c340af
commit
57025ee632
|
@ -224,7 +224,7 @@ class SGMLConverter(PDFConverter):
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n', item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
|
|
|
@ -10,10 +10,12 @@ class LAParams(object):
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
direction=None,
|
direction=None,
|
||||||
|
line_overlap=0.5,
|
||||||
char_margin=1.0,
|
char_margin=1.0,
|
||||||
line_margin=0.5,
|
line_margin=0.5,
|
||||||
word_margin=0.1):
|
word_margin=0.1):
|
||||||
self.direction = direction
|
self.direction = direction
|
||||||
|
self.line_overlap = line_overlap
|
||||||
self.char_margin = char_margin
|
self.char_margin = char_margin
|
||||||
self.line_margin = line_margin
|
self.line_margin = line_margin
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
|
@ -92,7 +94,7 @@ class ClusterSet(object):
|
||||||
return list(r)
|
return list(r)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def build(klass, objs, hratio, vratio, objtype):
|
def build(klass, objs, hratio, vratio, objtype, func=None):
|
||||||
plane = Plane(objs)
|
plane = Plane(objs)
|
||||||
cset = ClusterSet(objtype)
|
cset = ClusterSet(objtype)
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
|
@ -101,6 +103,8 @@ class ClusterSet(object):
|
||||||
vmargin = vratio * margin
|
vmargin = vratio * margin
|
||||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||||
assert obj in neighbors, obj
|
assert obj in neighbors, obj
|
||||||
|
if func:
|
||||||
|
neighbors = [ x for x in neighbors if x is obj or func(obj, x) ]
|
||||||
cset.add(neighbors)
|
cset.add(neighbors)
|
||||||
return cset.finish()
|
return cset.finish()
|
||||||
|
|
||||||
|
@ -450,10 +454,8 @@ class LTPage(LayoutContainer):
|
||||||
else:
|
else:
|
||||||
otherobjs.append(obj)
|
otherobjs.append(obj)
|
||||||
if laparams.direction == 'V':
|
if laparams.direction == 'V':
|
||||||
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
def vline(obj1, obj2):
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
|
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
||||||
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
|
||||||
def vorder(obj1, obj2):
|
def vorder(obj1, obj2):
|
||||||
if obj1.voverlap(obj2):
|
if obj1.voverlap(obj2):
|
||||||
return obj2.x1 < obj1.x1
|
return obj2.x1 < obj1.x1
|
||||||
|
@ -461,12 +463,15 @@ class LTPage(LayoutContainer):
|
||||||
return obj2.y1 < obj1.y1
|
return obj2.y1 < obj1.y1
|
||||||
else:
|
else:
|
||||||
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
|
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
|
||||||
|
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
||||||
|
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)),
|
||||||
|
vline)
|
||||||
|
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
||||||
|
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
||||||
boxes = tsort(boxes, vorder)
|
boxes = tsort(boxes, vorder)
|
||||||
else:
|
else:
|
||||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
def hline(obj1, obj2):
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
|
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
||||||
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
|
||||||
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
|
||||||
def horder(obj1, obj2):
|
def horder(obj1, obj2):
|
||||||
if obj1.hoverlap(obj2):
|
if obj1.hoverlap(obj2):
|
||||||
return obj2.y1 < obj1.y1
|
return obj2.y1 < obj1.y1
|
||||||
|
@ -474,6 +479,11 @@ class LTPage(LayoutContainer):
|
||||||
return obj1.x1 < obj2.x0
|
return obj1.x1 < obj2.x0
|
||||||
else:
|
else:
|
||||||
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
||||||
|
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||||
|
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
||||||
|
hline)
|
||||||
|
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
||||||
|
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
||||||
boxes = tsort(boxes, horder)
|
boxes = tsort(boxes, horder)
|
||||||
self.objs = otherobjs + boxes
|
self.objs = otherobjs + boxes
|
||||||
return
|
return
|
||||||
|
|
|
@ -25,17 +25,25 @@ TEXTS= \
|
||||||
naacl06-shinyama.txt \
|
naacl06-shinyama.txt \
|
||||||
nlp2004slides.txt
|
nlp2004slides.txt
|
||||||
|
|
||||||
|
SGMLS= \
|
||||||
|
simple1.sgml \
|
||||||
|
simple2.sgml \
|
||||||
|
dmca.sgml \
|
||||||
|
f1040nr.sgml \
|
||||||
|
i1040nr.sgml \
|
||||||
|
jo.sgml \
|
||||||
|
kampo.sgml \
|
||||||
|
naacl06-shinyama.sgml \
|
||||||
|
nlp2004slides.sgml
|
||||||
|
|
||||||
all:
|
all:
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm $(HTMLS)
|
-rm $(HTMLS)
|
||||||
-rm $(TEXTS)
|
-rm $(TEXTS)
|
||||||
|
-rm $(SGMLS)
|
||||||
|
|
||||||
test: htmls texts
|
test: $(HTMLS) $(TEXTS) $(SGMLS)
|
||||||
|
|
||||||
htmls: $(HTMLS)
|
|
||||||
|
|
||||||
texts: $(TEXTS)
|
|
||||||
|
|
||||||
.SUFFIXES: .pdf .html .sgml .txt
|
.SUFFIXES: .pdf .html .sgml .txt
|
||||||
.pdf.html:
|
.pdf.html:
|
||||||
|
|
Loading…
Reference in New Issue