improved clustering

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@116 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-06-20 10:44:00 +00:00
parent c7a0894182
commit 97dd4dda5e
3 changed files with 52 additions and 33 deletions

View File

@ -2,7 +2,7 @@
import sys
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
@ -115,10 +115,11 @@ class PDFPageAggregator(PDFDevice):
##
class PDFConverter(PDFPageAggregator):
# Set up a converter that writes its output to outfp.
# rsrc, pageno and cluster_margin are forwarded to PDFPageAggregator.__init__;
# outfp, codec and word_margin are stored on the instance.
# NOTE(review): two signatures appear back to back below, which looks like the
# removed and added lines of a diff shown together. The body assigns
# word_margin, so presumably only the second signature is live -- confirm.
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
self.outfp = outfp
self.codec = codec
# Stored here so the converters can pass it to get_lines() when rendering text boxes.
self.word_margin = word_margin
return
def write(self, text):
@ -204,6 +205,11 @@ class SGMLConverter(PDFConverter):
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTAnon):
if item.text == ' ':
self.outfp.write('<space>\n')
elif item.text == '\n':
self.outfp.write('<newline>\n')
elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
@ -215,7 +221,7 @@ class SGMLConverter(PDFConverter):
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item:
for child in item.get_lines(self.word_margin):
render(child)
self.outfp.write('</textbox>\n')
return
@ -228,9 +234,9 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
@ -270,11 +276,13 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n')
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
self.write(item.text)
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item:
for child in item.get_lines(self.word_margin):
render(child)
return
page = PDFConverter.end_page(self, page)
@ -295,12 +303,9 @@ class HTMLConverter(PDFConverter):
class TextConverter(PDFConverter):
# Set up a plain-text converter; showpageno is stored on the instance and the
# remaining arguments are forwarded to PDFConverter.__init__.
# NOTE(review): this span shows two signature continuation lines
# (word_margin=0.2 and word_margin=None) and two PDFConverter.__init__ calls,
# which looks like removed and added diff lines shown together. Presumably the
# word_margin=None signature and the call that forwards word_margin are the
# live ones, and the local cluster_margin default of 0.5 plus the trailing
# self.word_margin assignment are the removed ones -- confirm before relying on it.
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
showpageno=False, word_margin=0.2):
if cluster_margin == None:
cluster_margin = 0.5
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
showpageno=False, word_margin=None):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
self.showpageno = showpageno
self.word_margin = word_margin
return
def write(self, text):
@ -312,8 +317,8 @@ class TextConverter(PDFConverter):
if isinstance(item, LTText):
self.write(item.text+'\n')
elif isinstance(item, LTTextBox):
for line in item.get_lines(self.word_margin):
self.write(line+'\n')
for obj in item.get_lines(self.word_margin):
self.write(obj.text)
self.write('\n')
elif isinstance(item, LayoutContainer):
for child in item:

View File

@ -195,7 +195,7 @@ class LayoutItem(object):
return
def __repr__(self):
return ('<pageitem bbox=%s>' % (self.get_bbox()))
return ('<item bbox=%s>' % (self.get_bbox()))
def hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
@ -300,6 +300,18 @@ class LTRect(LayoutItem):
return
## LTAnon
##
class LTAnon(object):
    """Anonymous text item that carries nothing but a string.

    It derives from ``object`` rather than ``LayoutItem`` and stores no
    bounding box. ``LTTextBox.get_lines()`` yields instances holding a
    single space or a newline between the real ``LTText`` objects to mark
    word gaps and line ends.
    """

    def __init__(self, text):
        """Store *text*, the literal string this item stands for."""
        self.text = text
        return

    def __repr__(self):
        # Mirrors the '<item ...>' form LayoutItem uses, for easier debugging.
        return '<anon text=%r>' % (self.text,)

    def get_weight(self):
        """Return the weight of this item, which is always 0."""
        return 0
## LTText
##
class LTText(LayoutItem):
@ -409,28 +421,28 @@ class LTTextBox(LayoutContainer):
# Generate the contents of this text box, one line of self.lines at a time.
# ratio is passed to each object's get_margin(); when ratio is falsy no gap
# test is made. Objects in a line that are not LTText are skipped.
# The first branch handles get_direction() == 'H' and compares x coordinates;
# the other branch compares y coordinates.
# NOTE(review): both the string-accumulating statements (s = '', s += ...,
# yield s) and the object-yielding statements (yield LTAnon(...), yield obj)
# are present below. This looks like removed and added diff lines shown
# together; presumably the object-yielding set is the live one, since the
# converters read .text from what get_lines() yields -- confirm.
def get_lines(self, ratio):
if self.get_direction() == 'H':
for line in self.lines:
s = ''
# x1 holds the previous object's x1; starting at INF presumably keeps the
# first object in a line from triggering the gap test.
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio)
# A gap wider than margin between the previous x1 and this x0 gets a space.
if x1 < obj.x0-margin:
s += ' '
s += obj.text
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield s
# End of one line.
yield LTAnon('\n')
else:
for line in self.lines:
s = ''
# y0 holds the previous object's y0; starting at -INF presumably keeps the
# first object in a line from triggering the gap test.
y0 = -INF
for obj in line:
if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio)
# A gap wider than margin between this y1 and the previous y0 gets a space.
if obj.y1+margin < y0:
s += ' '
s += obj.text
yield LTAnon(' ')
yield obj
y0 = obj.y0
yield s
# End of one line.
yield LTAnon('\n')
return

View File

@ -10,10 +10,10 @@ from pdfminer.cmap import CMapDB, find_cmap_path
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -29,7 +29,8 @@ def main(argv):
outfile = None
outtype = None
codec = 'utf-8'
cluster_margin = None
cluster_margin = 0.5
word_margin = 0.2
pageno = 1
scale = 1
showpageno = True
@ -44,6 +45,7 @@ def main(argv):
elif k == '-o': outfile = v
elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v)
elif k == '-W': word_margin = float(v)
#
CMapDB.debug = debug
PDFResourceManager.debug = debug
@ -68,11 +70,11 @@ def main(argv):
else:
outfp = sys.stdout
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: