improved clustering
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@116 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
c7a0894182
commit
97dd4dda5e
|
@ -2,7 +2,7 @@
|
||||||
import sys
|
import sys
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
|
||||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||||
|
|
||||||
|
|
||||||
|
@ -115,10 +115,11 @@ class PDFPageAggregator(PDFDevice):
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
|
||||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
|
@ -204,6 +205,11 @@ class SGMLConverter(PDFConverter):
|
||||||
item.get_bbox(), item.fontsize))
|
item.get_bbox(), item.fontsize))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
|
elif isinstance(item, LTAnon):
|
||||||
|
if item.text == ' ':
|
||||||
|
self.outfp.write('<space>\n')
|
||||||
|
elif item.text == '\n':
|
||||||
|
self.outfp.write('<newline>\n')
|
||||||
elif isinstance(item, LTLine):
|
elif isinstance(item, LTLine):
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
|
@ -215,7 +221,7 @@ class SGMLConverter(PDFConverter):
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item.get_lines(self.word_margin):
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
return
|
return
|
||||||
|
@ -228,9 +234,9 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
|
||||||
scale=1, showpageno=True, pagepad=50):
|
scale=1, showpageno=True, pagepad=50):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -270,11 +276,13 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
|
elif isinstance(item, LTAnon):
|
||||||
|
self.write(item.text)
|
||||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item.get_lines(self.word_margin):
|
||||||
render(child)
|
render(child)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
|
@ -295,12 +303,9 @@ class HTMLConverter(PDFConverter):
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||||
showpageno=False, word_margin=0.2):
|
showpageno=False, word_margin=None):
|
||||||
if cluster_margin == None:
|
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||||
cluster_margin = 0.5
|
|
||||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.word_margin = word_margin
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
|
@ -312,8 +317,8 @@ class TextConverter(PDFConverter):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTText):
|
||||||
self.write(item.text+'\n')
|
self.write(item.text+'\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
for line in item.get_lines(self.word_margin):
|
for obj in item.get_lines(self.word_margin):
|
||||||
self.write(line+'\n')
|
self.write(obj.text)
|
||||||
self.write('\n')
|
self.write('\n')
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
for child in item:
|
for child in item:
|
||||||
|
|
|
@ -195,7 +195,7 @@ class LayoutItem(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<pageitem bbox=%s>' % (self.get_bbox()))
|
return ('<item bbox=%s>' % (self.get_bbox()))
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
def hoverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
|
@ -300,6 +300,18 @@ class LTRect(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTAnon
|
||||||
|
##
|
||||||
|
class LTAnon(object):
|
||||||
|
|
||||||
|
def __init__(self, text):
|
||||||
|
self.text = text
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_weight(self):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTText
|
||||||
##
|
##
|
||||||
class LTText(LayoutItem):
|
class LTText(LayoutItem):
|
||||||
|
@ -409,28 +421,28 @@ class LTTextBox(LayoutContainer):
|
||||||
def get_lines(self, ratio):
|
def get_lines(self, ratio):
|
||||||
if self.get_direction() == 'H':
|
if self.get_direction() == 'H':
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
s = ''
|
|
||||||
x1 = INF
|
x1 = INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
if not isinstance(obj, LTText): continue
|
if not isinstance(obj, LTText): continue
|
||||||
|
if ratio:
|
||||||
margin = obj.get_margin(ratio)
|
margin = obj.get_margin(ratio)
|
||||||
if x1 < obj.x0-margin:
|
if x1 < obj.x0-margin:
|
||||||
s += ' '
|
yield LTAnon(' ')
|
||||||
s += obj.text
|
yield obj
|
||||||
x1 = obj.x1
|
x1 = obj.x1
|
||||||
yield s
|
yield LTAnon('\n')
|
||||||
else:
|
else:
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
s = ''
|
|
||||||
y0 = -INF
|
y0 = -INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
if not isinstance(obj, LTText): continue
|
if not isinstance(obj, LTText): continue
|
||||||
|
if ratio:
|
||||||
margin = obj.get_margin(ratio)
|
margin = obj.get_margin(ratio)
|
||||||
if obj.y1+margin < y0:
|
if obj.y1+margin < y0:
|
||||||
s += ' '
|
yield LTAnon(' ')
|
||||||
s += obj.text
|
yield obj
|
||||||
y0 = obj.y0
|
y0 = obj.y0
|
||||||
yield s
|
yield LTAnon('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,10 +10,10 @@ from pdfminer.cmap import CMapDB, find_cmap_path
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -29,7 +29,8 @@ def main(argv):
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
cluster_margin = None
|
cluster_margin = 0.5
|
||||||
|
word_margin = 0.2
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
showpageno = True
|
showpageno = True
|
||||||
|
@ -44,6 +45,7 @@ def main(argv):
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
elif k == '-T': cluster_margin = float(v)
|
elif k == '-T': cluster_margin = float(v)
|
||||||
|
elif k == '-W': word_margin = float(v)
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
PDFResourceManager.debug = debug
|
PDFResourceManager.debug = debug
|
||||||
|
@ -68,11 +70,11 @@ def main(argv):
|
||||||
else:
|
else:
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
|
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
|
||||||
elif outtype == 'text':
|
elif outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue