clustering fix
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@98 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
9095738056
commit
dfac85360b
|
@ -49,6 +49,7 @@ def bsearch(objs, v0, v1):
|
|||
|
||||
|
||||
## reorder_hv, reorder_vh
|
||||
## chop_hv, chop_vh
|
||||
##
|
||||
## Reorders objects according to their writing direction.
|
||||
##
|
||||
|
@ -210,7 +211,7 @@ class LayoutItem(object):
|
|||
return 0
|
||||
|
||||
def get_direction(self):
    """Return the writing direction of this item.

    The base implementation reports no direction at all (None).  The
    earlier ``return False`` that preceded this statement made it
    unreachable and has been dropped.
    """
    return None
|
||||
|
||||
|
||||
## LayoutContainer
|
||||
|
@ -227,7 +228,7 @@ class LayoutContainer(LayoutItem):
|
|||
return
|
||||
|
||||
def __repr__(self):
    """Return a short debug representation: ``<group BBOX>``.

    The bounding box is wrapped in an explicit 1-tuple so that the
    ``%`` operator formats it as a single value even if ``get_bbox()``
    hands back a tuple rather than a string.
    """
    return '<group %s>' % (self.get_bbox(),)
|
||||
|
||||
def __iter__(self):
    """Iterate over the objects held in ``self.objs``."""
    members = self.objs
    return iter(members)
|
||||
|
@ -267,8 +268,14 @@ class LayoutContainer(LayoutItem):
|
|||
return self.weight
|
||||
|
||||
def get_direction(self):
    """Return the direction reported by the majority of child objects.

    Every object in ``self.objs`` is asked for its own direction via
    ``get_direction()`` and the answers are tallied.

    Returns:
        The direction value with the highest tally, or None when the
        container holds no objects.  When several directions tie, which
        one wins depends on dictionary iteration order.
    """
    if not self.objs:
        return None
    counts = {}
    for obj in self.objs:
        k = obj.get_direction()
        counts[k] = counts.get(k, 0) + 1
    # Take the entry with the LARGEST count.  The previous
    # ``sorted(..., key=count)[0]`` sorted ascending and therefore picked
    # the least common direction.  ``max`` with ``counts.get`` also avoids
    # the tuple-parameter lambda, which only Python 2 accepts.
    return max(counts, key=counts.get)
|
||||
|
||||
|
||||
## FigureItem
|
||||
|
@ -327,7 +334,7 @@ class TextItem(LayoutItem):
|
|||
def get_weight(self):
    """Return the weight of this item: the length of ``self.text``."""
    text = self.text
    return len(text)
|
||||
|
||||
def is_vertical(self):
    """Return the ``vertical`` attribute of this text item.

    This accessor was previously named ``get_direction``; only the
    renamed header is kept so the body belongs to exactly one method.
    """
    return self.vertical
|
||||
|
||||
|
||||
|
@ -340,40 +347,41 @@ class TextBox(LayoutContainer):
|
|||
|
||||
def __init__(self, objs):
    """Create a text box holding ``objs``.

    The container is initialised with an id of None and an all-zero
    bounding box.  ``self.direction`` starts out as None; fixate()
    later sets it to 'H' or 'V'.  The former ``self.vertical`` flag is
    no longer initialised because ``self.direction`` replaces it.
    """
    LayoutContainer.__init__(self, None, (0, 0, 0, 0), objs)
    self.direction = None
    return
|
||||
|
||||
def __repr__(self):
    """Debug representation: the bounding box followed by the direction."""
    fields = (self.get_bbox(), self.direction)
    return '<textbox %s(%s)>' % fields
|
||||
|
||||
def fixate(self):
    """Finalise the box: pick a writing direction and order the objects.

    After ``LayoutContainer.fixate`` has run, this method

    1. sets ``self.direction`` to 'H' or 'V',
    2. groups ``self.objs`` into lines with ``reorder_vh`` (for 'H') or
       ``reorder_hv`` (for 'V') and stores them in ``self.lines``,
    3. rebuilds ``self.objs`` as the concatenation of those lines.
    """
    LayoutContainer.fixate(self)
    # Horizontal unless shown otherwise.
    self.direction = 'H'
    for obj in self.objs:
        # Only the first object is consulted: the loop always stops
        # after one iteration.
        # NOTE(review): this assumes the first object provides
        # is_vertical(); confirm that non-text items cannot come first.
        if obj.is_vertical():
            self.direction = 'V'
        break
    if 2 <= len(self.objs):
        # Look at the two objects that sort first by -(x1 + y1).  If both
        # have weight 1, compare their mutual overlaps and switch to 'V'
        # when voverlap is smaller than hoverlap.
        objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
        if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
            h = objs[0].voverlap(objs[1])
            v = objs[0].hoverlap(objs[1])
            if h < v:
                self.direction = 'V'
    if self.direction == 'H':
        self.lines = reorder_vh(self.objs, +1)
    else:
        self.lines = reorder_hv(self.objs, -1)
    # Flatten the lines back into a single list, line by line.
    self.objs = []
    for line in self.lines:
        self.objs.extend(line)
    return
|
||||
|
||||
def get_direction(self):
    """Return the value stored in ``self.direction``.

    The stale ``return self.vertical`` that used to precede this
    statement made it unreachable and has been removed.
    """
    return self.direction
|
||||
|
||||
def get_lines(self, ratio):
|
||||
if self.get_direction():
|
||||
for line in reorder_hv(self.objs, -1):
|
||||
s = ''
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, TextItem): continue
|
||||
margin = obj.get_margin(ratio)
|
||||
if obj.y1+margin < y0:
|
||||
s += ' '
|
||||
s += obj.text
|
||||
y0 = obj.y0
|
||||
yield s
|
||||
else:
|
||||
for line in reorder_vh(self.objs, +1):
|
||||
if self.get_direction() == 'H':
|
||||
for line in self.lines:
|
||||
s = ''
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
|
@ -384,6 +392,18 @@ class TextBox(LayoutContainer):
|
|||
s += obj.text
|
||||
x1 = obj.x1
|
||||
yield s
|
||||
else:
|
||||
for line in self.lines:
|
||||
s = ''
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, TextItem): continue
|
||||
margin = obj.get_margin(ratio)
|
||||
if obj.y1+margin < y0:
|
||||
s += ' '
|
||||
s += obj.text
|
||||
y0 = obj.y0
|
||||
yield s
|
||||
return
|
||||
|
||||
|
||||
|
@ -404,10 +424,10 @@ class Page(LayoutContainer):
|
|||
|
||||
def group_text(self, ratio):
|
||||
self.group_objs(ratio, TextBox)
|
||||
if self.get_direction():
|
||||
lines = reorder_hv(self.objs, -1)
|
||||
else:
|
||||
if self.get_direction() == 'H':
|
||||
lines = reorder_vh(self.objs, +1)
|
||||
else:
|
||||
lines = reorder_hv(self.objs, -1)
|
||||
self.objs = []
|
||||
for line in lines:
|
||||
self.objs.extend(line)
|
|
@ -3,8 +3,8 @@ import sys
|
|||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, PDFPageAggregator
|
||||
from layout import Page, LayoutContainer, TextItem, TextBox
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
|
||||
from cmap import CMapDB
|
||||
|
||||
|
||||
|
@ -104,24 +104,33 @@ class TagExtractor(PDFDevice):
|
|||
class SGMLConverter(PDFConverter):
|
||||
|
||||
def end_page(self, page):
    """Write the finished page to ``self.outfp`` as nested markup.

    ``PDFConverter.end_page`` is called first and the page object it
    returns is rendered recursively:

    * Page      -> ``<page id bbox rotate>`` wrapping its children
    * TextItem  -> ``<text font vertical bbox fontsize>`` with the text
                   emitted through ``self.write``
    * FigureItem -> ``<figure id bbox>`` wrapping its children
    * TextBox   -> ``<textbox id bbox>`` wrapping its children

    Items of any other type produce no output.
    """
    def render(item):
        if isinstance(item, Page):
            self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                             (item.id, item.get_bbox(), item.rotate))
            for child in item:
                render(child)
            self.outfp.write('</page>\n')
        elif isinstance(item, TextItem):
            self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                             (e(item.font.fontname), item.is_vertical(),
                              item.get_bbox(), item.fontsize))
            self.write(item.text)
            self.outfp.write('</text>\n')
        elif isinstance(item, FigureItem):
            self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
            for child in item:
                render(child)
            self.outfp.write('</figure>\n')
        elif isinstance(item, TextBox):
            # A leftover debug ``print item`` was removed here: it wrote
            # the box's repr to stdout alongside the real output.
            self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
            for child in item:
                render(child)
            self.outfp.write('</textbox>\n')
        return
    page = PDFConverter.end_page(self, page)
    render(page)
    return
|
||||
|
||||
|
||||
|
@ -150,7 +159,7 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def end_page(self, page):
|
||||
def draw(item):
|
||||
def render(item):
|
||||
if isinstance(item, Page):
|
||||
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
if self.pagenum:
|
||||
|
@ -158,7 +167,7 @@ class HTMLConverter(PDFConverter):
|
|||
((self.yoffset-page.y1)*self.scale))
|
||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||
for child in item:
|
||||
draw(child)
|
||||
render(child)
|
||||
elif isinstance(item, TextItem):
|
||||
if item.vertical:
|
||||
wmode = 'tb-rl'
|
||||
|
@ -175,11 +184,11 @@ class HTMLConverter(PDFConverter):
|
|||
elif isinstance(item, LayoutContainer):
|
||||
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
draw(child)
|
||||
render(child)
|
||||
return
|
||||
page = PDFConverter.end_page(self, page)
|
||||
self.yoffset += page.y1
|
||||
draw(page)
|
||||
render(page)
|
||||
self.yoffset += self.pagepad
|
||||
return
|
||||
|
||||
|
@ -204,7 +213,7 @@ class TextConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def end_page(self, page):
|
||||
def draw(item):
|
||||
def render(item):
|
||||
if isinstance(item, TextItem):
|
||||
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
|
@ -214,11 +223,11 @@ class TextConverter(PDFConverter):
|
|||
self.outfp.write('\n')
|
||||
elif isinstance(item, LayoutContainer):
|
||||
for child in item:
|
||||
draw(child)
|
||||
render(child)
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.pagenum:
|
||||
self.outfp.write('Page %d\n' % page.id)
|
||||
draw(page)
|
||||
render(page)
|
||||
self.outfp.write('\f')
|
||||
return
|
||||
|
||||
|
@ -294,7 +303,7 @@ def main(argv):
|
|||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||
rsrc = PDFResourceManager()
|
||||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec)
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'text':
|
||||
|
|
|
@ -3,7 +3,7 @@ import sys
|
|||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from layout import Page, FigureItem, TextItem
|
||||
from page import Page, FigureItem, TextItem
|
||||
from utils import mult_matrix, translate_matrix
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue