improved clustering
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@116 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
c7a0894182
commit
97dd4dda5e
|
@ -2,7 +2,7 @@
|
|||
import sys
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
|
||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
|
||||
|
||||
|
@ -115,10 +115,11 @@ class PDFPageAggregator(PDFDevice):
|
|||
##
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8'):
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def write(self, text):
|
||||
|
@ -204,6 +205,11 @@ class SGMLConverter(PDFConverter):
|
|||
item.get_bbox(), item.fontsize))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</text>\n')
|
||||
elif isinstance(item, LTAnon):
|
||||
if item.text == ' ':
|
||||
self.outfp.write('<space>\n')
|
||||
elif item.text == '\n':
|
||||
self.outfp.write('<newline>\n')
|
||||
elif isinstance(item, LTLine):
|
||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||
elif isinstance(item, LTRect):
|
||||
|
@ -215,7 +221,7 @@ class SGMLConverter(PDFConverter):
|
|||
self.outfp.write('</figure>\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
for child in item:
|
||||
for child in item.get_lines(self.word_margin):
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
return
|
||||
|
@ -228,9 +234,9 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
|
||||
scale=1, showpageno=True, pagepad=50):
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
|
@ -270,11 +276,13 @@ class HTMLConverter(PDFConverter):
|
|||
self.outfp.write('</span>\n')
|
||||
if self.debug:
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTAnon):
|
||||
self.write(item.text)
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
for child in item.get_lines(self.word_margin):
|
||||
render(child)
|
||||
return
|
||||
page = PDFConverter.end_page(self, page)
|
||||
|
@ -295,12 +303,9 @@ class HTMLConverter(PDFConverter):
|
|||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||
showpageno=False, word_margin=0.2):
|
||||
if cluster_margin == None:
|
||||
cluster_margin = 0.5
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, codec=codec)
|
||||
showpageno=False, word_margin=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||
self.showpageno = showpageno
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def write(self, text):
|
||||
|
@ -312,8 +317,8 @@ class TextConverter(PDFConverter):
|
|||
if isinstance(item, LTText):
|
||||
self.write(item.text+'\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
for line in item.get_lines(self.word_margin):
|
||||
self.write(line+'\n')
|
||||
for obj in item.get_lines(self.word_margin):
|
||||
self.write(obj.text)
|
||||
self.write('\n')
|
||||
elif isinstance(item, LayoutContainer):
|
||||
for child in item:
|
||||
|
|
|
@ -195,7 +195,7 @@ class LayoutItem(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<pageitem bbox=%s>' % (self.get_bbox()))
|
||||
return ('<item bbox=%s>' % (self.get_bbox()))
|
||||
|
||||
def hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
|
@ -300,6 +300,18 @@ class LTRect(LayoutItem):
|
|||
return
|
||||
|
||||
|
||||
## LTAnon
|
||||
##
|
||||
class LTAnon(object):
    """Text-only layout object with no bounding box.

    Unlike the other LT* classes visible here, this one does not derive
    from LayoutItem: it stores nothing but a text string. Consumers read
    the ``text`` attribute directly.
    """

    def __init__(self, text):
        """Store *text* as the ``text`` attribute."""
        self.text = text
        return

    def __repr__(self):
        """Return a debug representation showing the escaped text."""
        # %r keeps whitespace-only text (e.g. ' ' or '\n') visible.
        return '<anon text=%r>' % (self.text,)

    def get_weight(self):
        """Return the weight of this object, which is always 0."""
        return 0
|
||||
|
||||
|
||||
## LTText
|
||||
##
|
||||
class LTText(LayoutItem):
|
||||
|
@ -409,28 +421,28 @@ class LTTextBox(LayoutContainer):
|
|||
def get_lines(self, ratio):
|
||||
if self.get_direction() == 'H':
|
||||
for line in self.lines:
|
||||
s = ''
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if ratio:
|
||||
margin = obj.get_margin(ratio)
|
||||
if x1 < obj.x0-margin:
|
||||
s += ' '
|
||||
s += obj.text
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
x1 = obj.x1
|
||||
yield s
|
||||
yield LTAnon('\n')
|
||||
else:
|
||||
for line in self.lines:
|
||||
s = ''
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if ratio:
|
||||
margin = obj.get_margin(ratio)
|
||||
if obj.y1+margin < y0:
|
||||
s += ' '
|
||||
s += obj.text
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
y0 = obj.y0
|
||||
yield s
|
||||
yield LTAnon('\n')
|
||||
return
|
||||
|
||||
|
||||
|
|
|
@ -10,10 +10,10 @@ from pdfminer.cmap import CMapDB, find_cmap_path
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -29,7 +29,8 @@ def main(argv):
|
|||
outfile = None
|
||||
outtype = None
|
||||
codec = 'utf-8'
|
||||
cluster_margin = None
|
||||
cluster_margin = 0.5
|
||||
word_margin = 0.2
|
||||
pageno = 1
|
||||
scale = 1
|
||||
showpageno = True
|
||||
|
@ -44,6 +45,7 @@ def main(argv):
|
|||
elif k == '-o': outfile = v
|
||||
elif k == '-s': scale = float(v)
|
||||
elif k == '-T': cluster_margin = float(v)
|
||||
elif k == '-W': word_margin = float(v)
|
||||
#
|
||||
CMapDB.debug = debug
|
||||
PDFResourceManager.debug = debug
|
||||
|
@ -68,11 +70,11 @@ def main(argv):
|
|||
else:
|
||||
outfp = sys.stdout
|
||||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue