clustering fix

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@98 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-15 10:10:30 +00:00
parent 9095738056
commit dfac85360b
3 changed files with 77 additions and 48 deletions

View File

@ -49,6 +49,7 @@ def bsearch(objs, v0, v1):
## reorder_hv, reorder_vh ## reorder_hv, reorder_vh
## chop_hv, chop_vh
## ##
## Reorders objects according to its writing direction. ## Reorders objects according to its writing direction.
## ##
@ -210,7 +211,7 @@ class LayoutItem(object):
return 0 return 0
def get_direction(self): def get_direction(self):
return False return None
## LayoutContainer ## LayoutContainer
@ -227,7 +228,7 @@ class LayoutContainer(LayoutItem):
return return
def __repr__(self): def __repr__(self):
return ('<group %s(%d)>' % (self.get_bbox(), len(self.objs))) return ('<group %s>' % (self.get_bbox()))
def __iter__(self): def __iter__(self):
return iter(self.objs) return iter(self.objs)
@ -267,8 +268,14 @@ class LayoutContainer(LayoutItem):
return self.weight return self.weight
def get_direction(self): def get_direction(self):
return ((sum( obj.get_weight() for obj in self.objs )/2) < if not self.objs: return None
sum( obj.get_weight() for obj in self.objs if obj.get_direction() )) d = {}
for obj in self.objs:
k = obj.get_direction()
if k not in d: d[k] = 0
d[k] += 1
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
return direction
## FigureItem ## FigureItem
@ -327,7 +334,7 @@ class TextItem(LayoutItem):
def get_weight(self): def get_weight(self):
return len(self.text) return len(self.text)
def get_direction(self): def is_vertical(self):
return self.vertical return self.vertical
@ -340,40 +347,41 @@ class TextBox(LayoutContainer):
def __init__(self, objs): def __init__(self, objs):
LayoutContainer.__init__(self, None, (0,0,0,0), objs) LayoutContainer.__init__(self, None, (0,0,0,0), objs)
self.vertical = False self.direction = None
return return
def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
def fixate(self): def fixate(self):
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
self.direction = 'H'
for obj in self.objs: for obj in self.objs:
self.vertical = bool(obj.get_direction()) if obj.is_vertical():
self.direction = 'V'
break break
if 2 <= len(self.objs): if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1: if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1]) h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1]) v = objs[0].hoverlap(objs[1])
self.vertical = (h < v) if h < v:
self.direction = 'V'
if self.direction == 'H':
self.lines = reorder_vh(self.objs, +1)
else:
self.lines = reorder_hv(self.objs, -1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
return return
def get_direction(self): def get_direction(self):
return self.vertical return self.direction
def get_lines(self, ratio): def get_lines(self, ratio):
if self.get_direction(): if self.get_direction() == 'H':
for line in reorder_hv(self.objs, -1): for line in self.lines:
s = ''
y0 = -INF
for obj in line:
if not isinstance(obj, TextItem): continue
margin = obj.get_margin(ratio)
if obj.y1+margin < y0:
s += ' '
s += obj.text
y0 = obj.y0
yield s
else:
for line in reorder_vh(self.objs, +1):
s = '' s = ''
x1 = INF x1 = INF
for obj in line: for obj in line:
@ -384,6 +392,18 @@ class TextBox(LayoutContainer):
s += obj.text s += obj.text
x1 = obj.x1 x1 = obj.x1
yield s yield s
else:
for line in self.lines:
s = ''
y0 = -INF
for obj in line:
if not isinstance(obj, TextItem): continue
margin = obj.get_margin(ratio)
if obj.y1+margin < y0:
s += ' '
s += obj.text
y0 = obj.y0
yield s
return return
@ -404,10 +424,10 @@ class Page(LayoutContainer):
def group_text(self, ratio): def group_text(self, ratio):
self.group_objs(ratio, TextBox) self.group_objs(ratio, TextBox)
if self.get_direction(): if self.get_direction() == 'H':
lines = reorder_hv(self.objs, -1)
else:
lines = reorder_vh(self.objs, +1) lines = reorder_vh(self.objs, +1)
else:
lines = reorder_hv(self.objs, -1)
self.objs = [] self.objs = []
for line in lines: for line in lines:
self.objs.extend(line) self.objs.extend(line)

View File

@ -3,8 +3,8 @@ import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, LayoutContainer, TextItem, TextBox
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
from cmap import CMapDB from cmap import CMapDB
@ -104,24 +104,33 @@ class TagExtractor(PDFDevice):
class SGMLConverter(PDFConverter): class SGMLConverter(PDFConverter):
def end_page(self, page): def end_page(self, page):
def draw(item): def render(item):
if isinstance(item, TextItem): if isinstance(item, Page):
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(e(item.font.fontname), item.get_direction(), (item.id, item.get_bbox(), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, TextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(e(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize)) item.get_bbox(), item.fontsize))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LayoutContainer): elif isinstance(item, FigureItem):
self.outfp.write('<group id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
draw(child) render(child)
self.outfp.write('</group>\n') self.outfp.write('</figure>\n')
elif isinstance(item, TextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
print item
for child in item:
render(child)
self.outfp.write('</textbox>\n')
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % render(page)
(page.id, page.get_bbox(), page.rotate))
draw(page)
self.outfp.write('</page>\n')
return return
@ -150,7 +159,7 @@ class HTMLConverter(PDFConverter):
return return
def end_page(self, page): def end_page(self, page):
def draw(item): def render(item):
if isinstance(item, Page): if isinstance(item, Page):
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
if self.pagenum: if self.pagenum:
@ -158,7 +167,7 @@ class HTMLConverter(PDFConverter):
((self.yoffset-page.y1)*self.scale)) ((self.yoffset-page.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item: for child in item:
draw(child) render(child)
elif isinstance(item, TextItem): elif isinstance(item, TextItem):
if item.vertical: if item.vertical:
wmode = 'tb-rl' wmode = 'tb-rl'
@ -175,11 +184,11 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
draw(child) render(child)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
self.yoffset += page.y1 self.yoffset += page.y1
draw(page) render(page)
self.yoffset += self.pagepad self.yoffset += self.pagepad
return return
@ -204,7 +213,7 @@ class TextConverter(PDFConverter):
return return
def end_page(self, page): def end_page(self, page):
def draw(item): def render(item):
if isinstance(item, TextItem): if isinstance(item, TextItem):
self.outfp.write(obj.text.encode(self.codec, 'replace')) self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n') self.outfp.write('\n')
@ -214,11 +223,11 @@ class TextConverter(PDFConverter):
self.outfp.write('\n') self.outfp.write('\n')
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
for child in item: for child in item:
draw(child) render(child)
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.pagenum: if self.pagenum:
self.outfp.write('Page %d\n' % page.id) self.outfp.write('Page %d\n' % page.id)
draw(page) render(page)
self.outfp.write('\f') self.outfp.write('\f')
return return
@ -294,7 +303,7 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir) CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager()
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec) device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'text': elif outtype == 'text':

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import Page, FigureItem, TextItem from page import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix from utils import mult_matrix, translate_matrix