From 57025ee632f5878bce044e782d28cd095a23843b Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Tue, 21 Jul 2009 16:06:50 +0000 Subject: [PATCH] git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@122 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 2 +- pdfminer/layout.py | 28 +++++++++++++++++++--------- samples/Makefile | 18 +++++++++++++----- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index b811001..9df3ace 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -224,7 +224,7 @@ class SGMLConverter(PDFConverter): self.write(item.text) self.outfp.write('\n') elif isinstance(item, LTText): - self.outfp.write('%s\n', item.text) + self.outfp.write('%s\n' % item.text) else: assert 0, item return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index e0de3af..375142c 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -10,10 +10,12 @@ class LAParams(object): def __init__(self, direction=None, + line_overlap=0.5, char_margin=1.0, line_margin=0.5, word_margin=0.1): self.direction = direction + self.line_overlap = line_overlap self.char_margin = char_margin self.line_margin = line_margin self.word_margin = word_margin @@ -92,7 +94,7 @@ class ClusterSet(object): return list(r) @classmethod - def build(klass, objs, hratio, vratio, objtype): + def build(klass, objs, hratio, vratio, objtype, func=None): plane = Plane(objs) cset = ClusterSet(objtype) for obj in objs: @@ -101,6 +103,8 @@ class ClusterSet(object): vmargin = vratio * margin neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) assert obj in neighbors, obj + if func: + neighbors = [ x for x in neighbors if x is obj or func(obj, x) ] cset.add(neighbors) return cset.finish() @@ -450,10 +454,8 @@ class LTPage(LayoutContainer): else: otherobjs.append(obj) if laparams.direction == 'V': - lines = ClusterSet.build(textobjs, 0, laparams.char_margin, - (lambda 
id,objs: LTTextLine(id, objs, 'V', laparams.word_margin))) - boxes = ClusterSet.build(lines, laparams.line_margin, 0, - (lambda id,objs: LTTextBox(id, objs, 'V'))) + def vline(obj1, obj2): + return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2) def vorder(obj1, obj2): if obj1.voverlap(obj2): return obj2.x1 < obj1.x1 @@ -461,12 +463,15 @@ class LTPage(LayoutContainer): return obj2.y1 < obj1.y1 else: return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1 + lines = ClusterSet.build(textobjs, 0, laparams.char_margin, + (lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)), + vline) + boxes = ClusterSet.build(lines, laparams.line_margin, 0, + (lambda id,objs: LTTextBox(id, objs, 'V'))) boxes = tsort(boxes, vorder) else: - lines = ClusterSet.build(textobjs, laparams.char_margin, 0, - (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin))) - boxes = ClusterSet.build(lines, 0, laparams.line_margin, - (lambda id,objs: LTTextBox(id, objs, 'H'))) + def hline(obj1, obj2): + return obj1.height * laparams.line_overlap < obj1.voverlap(obj2) def horder(obj1, obj2): if obj1.hoverlap(obj2): return obj2.y1 < obj1.y1 @@ -474,6 +479,11 @@ class LTPage(LayoutContainer): return obj1.x1 < obj2.x0 else: return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0 + lines = ClusterSet.build(textobjs, laparams.char_margin, 0, + (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)), + hline) + boxes = ClusterSet.build(lines, 0, laparams.line_margin, + (lambda id,objs: LTTextBox(id, objs, 'H'))) boxes = tsort(boxes, horder) self.objs = otherobjs + boxes return diff --git a/samples/Makefile b/samples/Makefile index a358227..08fc43b 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -25,17 +25,25 @@ TEXTS= \ naacl06-shinyama.txt \ nlp2004slides.txt +SGMLS= \ + simple1.sgml \ + simple2.sgml \ + dmca.sgml \ + f1040nr.sgml \ + i1040nr.sgml \ + jo.sgml \ + kampo.sgml \ + naacl06-shinyama.sgml \ + nlp2004slides.sgml + all: clean: -rm $(HTMLS) -rm $(TEXTS) + -rm $(SGMLS) 
-test: htmls texts - -htmls: $(HTMLS) - -texts: $(TEXTS) +test: $(HTMLS) $(TEXTS) $(SGMLS) .SUFFIXES: .pdf .html .sgml .txt .pdf.html: