improved clustering

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@116 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-06-20 10:44:00 +00:00
parent c7a0894182
commit 97dd4dda5e
3 changed files with 52 additions and 33 deletions

View File

@ -2,7 +2,7 @@
import sys import sys
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
@ -115,10 +115,11 @@ class PDFPageAggregator(PDFDevice):
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'): def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin) PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.word_margin = word_margin
return return
def write(self, text): def write(self, text):
@ -204,6 +205,11 @@ class SGMLConverter(PDFConverter):
item.get_bbox(), item.fontsize)) item.get_bbox(), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTAnon):
if item.text == ' ':
self.outfp.write('<space>\n')
elif item.text == '\n':
self.outfp.write('<newline>\n')
elif isinstance(item, LTLine): elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
@ -215,7 +221,7 @@ class SGMLConverter(PDFConverter):
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item.get_lines(self.word_margin):
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
return return
@ -228,9 +234,9 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
scale=1, showpageno=True, pagepad=50): scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec) PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
@ -270,11 +276,13 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
self.write(item.text)
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item.get_lines(self.word_margin):
render(child) render(child)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
@ -295,12 +303,9 @@ class HTMLConverter(PDFConverter):
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
showpageno=False, word_margin=0.2): showpageno=False, word_margin=None):
if cluster_margin == None: PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
cluster_margin = 0.5
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
self.showpageno = showpageno self.showpageno = showpageno
self.word_margin = word_margin
return return
def write(self, text): def write(self, text):
@ -312,8 +317,8 @@ class TextConverter(PDFConverter):
if isinstance(item, LTText): if isinstance(item, LTText):
self.write(item.text+'\n') self.write(item.text+'\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
for line in item.get_lines(self.word_margin): for obj in item.get_lines(self.word_margin):
self.write(line+'\n') self.write(obj.text)
self.write('\n') self.write('\n')
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
for child in item: for child in item:

View File

@ -195,7 +195,7 @@ class LayoutItem(object):
return return
def __repr__(self): def __repr__(self):
return ('<pageitem bbox=%s>' % (self.get_bbox())) return ('<item bbox=%s>' % (self.get_bbox()))
def hoverlap(self, obj): def hoverlap(self, obj):
assert isinstance(obj, LayoutItem) assert isinstance(obj, LayoutItem)
@ -300,6 +300,18 @@ class LTRect(LayoutItem):
return return
## LTAnon
##
class LTAnon(object):
def __init__(self, text):
self.text = text
return
def get_weight(self):
return 0
## LTText ## LTText
## ##
class LTText(LayoutItem): class LTText(LayoutItem):
@ -409,28 +421,28 @@ class LTTextBox(LayoutContainer):
def get_lines(self, ratio): def get_lines(self, ratio):
if self.get_direction() == 'H': if self.get_direction() == 'H':
for line in self.lines: for line in self.lines:
s = ''
x1 = INF x1 = INF
for obj in line: for obj in line:
if not isinstance(obj, LTText): continue if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio) margin = obj.get_margin(ratio)
if x1 < obj.x0-margin: if x1 < obj.x0-margin:
s += ' ' yield LTAnon(' ')
s += obj.text yield obj
x1 = obj.x1 x1 = obj.x1
yield s yield LTAnon('\n')
else: else:
for line in self.lines: for line in self.lines:
s = ''
y0 = -INF y0 = -INF
for obj in line: for obj in line:
if not isinstance(obj, LTText): continue if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio) margin = obj.get_margin(ratio)
if obj.y1+margin < y0: if obj.y1+margin < y0:
s += ' ' yield LTAnon(' ')
s += obj.text yield obj
y0 = obj.y0 y0 = obj.y0
yield s yield LTAnon('\n')
return return

View File

@ -10,10 +10,10 @@ from pdfminer.cmap import CMapDB, find_cmap_path
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -29,7 +29,8 @@ def main(argv):
outfile = None outfile = None
outtype = None outtype = None
codec = 'utf-8' codec = 'utf-8'
cluster_margin = None cluster_margin = 0.5
word_margin = 0.2
pageno = 1 pageno = 1
scale = 1 scale = 1
showpageno = True showpageno = True
@ -44,6 +45,7 @@ def main(argv):
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v) elif k == '-T': cluster_margin = float(v)
elif k == '-W': word_margin = float(v)
# #
CMapDB.debug = debug CMapDB.debug = debug
PDFResourceManager.debug = debug PDFResourceManager.debug = debug
@ -68,11 +70,11 @@ def main(argv):
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale) device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
elif outtype == 'text': elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else: