From 97dd4dda5e5e8de1c91b81c4723dd3605e23fba9 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 20 Jun 2009 10:44:00 +0000 Subject: [PATCH] improved clustering git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@116 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 31 ++++++++++++++++++------------- pdfminer/layout.py | 40 ++++++++++++++++++++++++++-------------- tools/pdf2txt.py | 14 ++++++++------ 3 files changed, 52 insertions(+), 33 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index ad2a47f..666c10d 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -2,7 +2,7 @@ import sys from pdfminer.pdfdevice import PDFDevice from pdfminer.pdffont import PDFUnicodeNotDefined -from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox +from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc @@ -115,10 +115,11 @@ class PDFPageAggregator(PDFDevice): ## class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'): + def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'): PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin) self.outfp = outfp self.codec = codec + self.word_margin = word_margin return def write(self, text): @@ -204,6 +205,11 @@ class SGMLConverter(PDFConverter): item.get_bbox(), item.fontsize)) self.write(item.text) self.outfp.write('\n') + elif isinstance(item, LTAnon): + if item.text == ' ': + self.outfp.write('\n') + elif item.text == '\n': + self.outfp.write('\n') elif isinstance(item, LTLine): self.outfp.write('' % (item.linewidth, item.direction, item.get_bbox())) elif isinstance(item, LTRect): @@ -215,7 +221,7 @@ class SGMLConverter(PDFConverter): self.outfp.write('\n') elif isinstance(item, LTTextBox): self.outfp.write('\n' % (item.id, item.get_bbox())) - for child in item: + for child in item.get_lines(self.word_margin): render(child) self.outfp.write('\n') return @@ -228,9 +234,9 @@ class SGMLConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', + def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8', scale=1, showpageno=True, pagepad=50): - PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec) + PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) self.showpageno = showpageno self.pagepad = pagepad self.scale = scale @@ -270,11 +276,13 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n') if self.debug: self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + elif isinstance(item, LTAnon): + self.write(item.text) elif isinstance(item, LTLine) or isinstance(item, LTRect): self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTTextBox): self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) - for child in item: + for child in item.get_lines(self.word_margin): render(child) return page = PDFConverter.end_page(self, page) @@ -295,12 +303,9 @@ class HTMLConverter(PDFConverter): class TextConverter(PDFConverter): def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', - showpageno=False, word_margin=0.2): - if cluster_margin == None: - cluster_margin = 0.5 - PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec) + showpageno=False, word_margin=None): + PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) self.showpageno = showpageno - self.word_margin = word_margin return def write(self, text): @@ -312,8 +317,8 @@ class TextConverter(PDFConverter): if isinstance(item, LTText): self.write(item.text+'\n') elif isinstance(item, LTTextBox): - for line in item.get_lines(self.word_margin): - self.write(line+'\n') + for obj in item.get_lines(self.word_margin): + self.write(obj.text) self.write('\n') elif isinstance(item, LayoutContainer): for child in item: diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 680cbc4..cb2598c 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -195,7 +195,7 @@ class LayoutItem(object): return def __repr__(self): - return ('' % (self.get_bbox())) + return ('' % (self.get_bbox())) def hoverlap(self, obj): assert isinstance(obj, LayoutItem) @@ -223,7 +223,7 @@ class LayoutItem(object): def get_direction(self): return None - + ## LayoutContainer ## class LayoutContainer(LayoutItem): @@ -300,6 +300,18 @@ class LTRect(LayoutItem): return +## LTAnon +## +class LTAnon(object): + + def __init__(self, text): + self.text = text + return + + def get_weight(self): + return 0 + + ## LTText ## class LTText(LayoutItem): @@ -409,28 +421,28 @@ class LTTextBox(LayoutContainer): def get_lines(self, ratio): if self.get_direction() == 'H': for line in self.lines: - s = '' x1 = INF for obj in line: if not isinstance(obj, LTText): continue - margin = obj.get_margin(ratio) - if x1 < obj.x0-margin: - s += ' ' - s += obj.text + if ratio: + margin = obj.get_margin(ratio) + if x1 < obj.x0-margin: + yield LTAnon(' ') + yield obj x1 = obj.x1 - yield s + yield LTAnon('\n') else: for line in self.lines: - s = '' y0 = -INF for obj in line: if not isinstance(obj, LTText): continue - margin = obj.get_margin(ratio) - if obj.y1+margin < y0: - s += ' ' - s += obj.text + if ratio: + margin = obj.get_margin(ratio) + if obj.y1+margin < y0: + yield LTAnon(' ') + yield obj y0 = obj.y0 - yield s + yield LTAnon('\n') return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 8a5086e..c44eb01 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -10,10 +10,10 @@ from pdfminer.cmap import CMapDB, find_cmap_path def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] + print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -29,7 +29,8 @@ def main(argv): outfile = None outtype = None codec = 'utf-8' - cluster_margin = None + cluster_margin = 0.5 + word_margin = 0.2 pageno = 1 scale = 1 showpageno = True @@ -44,6 +45,7 @@ def main(argv): elif k == '-o': outfile = v elif k == '-s': scale = float(v) elif k == '-T': cluster_margin = float(v) + elif k == '-W': word_margin = float(v) # CMapDB.debug = debug PDFResourceManager.debug = debug @@ -68,11 +70,11 @@ def main(argv): else: outfp = sys.stdout if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale) + device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale) elif outtype == 'text': - device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin) elif outtype == 'tag': device = TagExtractor(rsrc, outfp, codec=codec) else: