clustering fix

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@98 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-15 10:10:30 +00:00
parent 9095738056
commit dfac85360b
3 changed files with 77 additions and 48 deletions

View File

@ -49,6 +49,7 @@ def bsearch(objs, v0, v1):
## reorder_hv, reorder_vh
## chop_hv, chop_vh
##
## Reorders objects according to their writing direction.
##
@ -210,7 +211,7 @@ class LayoutItem(object):
return 0
def get_direction(self):
return False
return None
## LayoutContainer
@ -227,7 +228,7 @@ class LayoutContainer(LayoutItem):
return
def __repr__(self):
return ('<group %s(%d)>' % (self.get_bbox(), len(self.objs)))
return ('<group %s>' % (self.get_bbox()))
def __iter__(self):
return iter(self.objs)
@ -267,8 +268,14 @@ class LayoutContainer(LayoutItem):
return self.weight
def get_direction(self):
return ((sum( obj.get_weight() for obj in self.objs )/2) <
sum( obj.get_weight() for obj in self.objs if obj.get_direction() ))
if not self.objs: return None
d = {}
for obj in self.objs:
k = obj.get_direction()
if k not in d: d[k] = 0
d[k] += 1
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
return direction
## FigureItem
@ -327,7 +334,7 @@ class TextItem(LayoutItem):
def get_weight(self):
return len(self.text)
def get_direction(self):
def is_vertical(self):
return self.vertical
@ -340,40 +347,41 @@ class TextBox(LayoutContainer):
def __init__(self, objs):
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
self.vertical = False
self.direction = None
return
def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
def fixate(self):
LayoutContainer.fixate(self)
self.direction = 'H'
for obj in self.objs:
self.vertical = bool(obj.get_direction())
if obj.is_vertical():
self.direction = 'V'
break
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
self.vertical = (h < v)
if h < v:
self.direction = 'V'
if self.direction == 'H':
self.lines = reorder_vh(self.objs, +1)
else:
self.lines = reorder_hv(self.objs, -1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
return
def get_direction(self):
return self.vertical
return self.direction
def get_lines(self, ratio):
if self.get_direction():
for line in reorder_hv(self.objs, -1):
s = ''
y0 = -INF
for obj in line:
if not isinstance(obj, TextItem): continue
margin = obj.get_margin(ratio)
if obj.y1+margin < y0:
s += ' '
s += obj.text
y0 = obj.y0
yield s
else:
for line in reorder_vh(self.objs, +1):
if self.get_direction() == 'H':
for line in self.lines:
s = ''
x1 = INF
for obj in line:
@ -384,6 +392,18 @@ class TextBox(LayoutContainer):
s += obj.text
x1 = obj.x1
yield s
else:
for line in self.lines:
s = ''
y0 = -INF
for obj in line:
if not isinstance(obj, TextItem): continue
margin = obj.get_margin(ratio)
if obj.y1+margin < y0:
s += ' '
s += obj.text
y0 = obj.y0
yield s
return
@ -404,10 +424,10 @@ class Page(LayoutContainer):
def group_text(self, ratio):
self.group_objs(ratio, TextBox)
if self.get_direction():
lines = reorder_hv(self.objs, -1)
else:
if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1)
else:
lines = reorder_hv(self.objs, -1)
self.objs = []
for line in lines:
self.objs.extend(line)

View File

@ -3,8 +3,8 @@ import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, LayoutContainer, TextItem, TextBox
from pdffont import PDFUnicodeNotDefined
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
from cmap import CMapDB
@ -104,24 +104,33 @@ class TagExtractor(PDFDevice):
class SGMLConverter(PDFConverter):
def end_page(self, page):
def draw(item):
if isinstance(item, TextItem):
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
(e(item.font.fontname), item.get_direction(),
def render(item):
if isinstance(item, Page):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, item.get_bbox(), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, TextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(e(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LayoutContainer):
self.outfp.write('<group id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
elif isinstance(item, FigureItem):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item:
draw(child)
self.outfp.write('</group>\n')
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, TextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
print item
for child in item:
render(child)
self.outfp.write('</textbox>\n')
return
page = PDFConverter.end_page(self, page)
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(page.id, page.get_bbox(), page.rotate))
draw(page)
self.outfp.write('</page>\n')
render(page)
return
@ -150,7 +159,7 @@ class HTMLConverter(PDFConverter):
return
def end_page(self, page):
def draw(item):
def render(item):
if isinstance(item, Page):
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
if self.pagenum:
@ -158,7 +167,7 @@ class HTMLConverter(PDFConverter):
((self.yoffset-page.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item:
draw(child)
render(child)
elif isinstance(item, TextItem):
if item.vertical:
wmode = 'tb-rl'
@ -175,11 +184,11 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LayoutContainer):
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item:
draw(child)
render(child)
return
page = PDFConverter.end_page(self, page)
self.yoffset += page.y1
draw(page)
render(page)
self.yoffset += self.pagepad
return
@ -204,7 +213,7 @@ class TextConverter(PDFConverter):
return
def end_page(self, page):
def draw(item):
def render(item):
if isinstance(item, TextItem):
self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
@ -214,11 +223,11 @@ class TextConverter(PDFConverter):
self.outfp.write('\n')
elif isinstance(item, LayoutContainer):
for child in item:
draw(child)
render(child)
page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)
draw(page)
render(page)
self.outfp.write('\f')
return
@ -294,7 +303,7 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager()
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec)
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'text':

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from layout import Page, FigureItem, TextItem
from page import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix