clustering fix
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@98 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
9095738056
commit
dfac85360b
|
@ -49,6 +49,7 @@ def bsearch(objs, v0, v1):
|
||||||
|
|
||||||
|
|
||||||
## reorder_hv, reorder_vh
|
## reorder_hv, reorder_vh
|
||||||
|
## chop_hv, chop_vh
|
||||||
##
|
##
|
||||||
## Reorders objects according to its writing direction.
|
## Reorders objects according to its writing direction.
|
||||||
##
|
##
|
||||||
|
@ -210,7 +211,7 @@ class LayoutItem(object):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return False
|
return None
|
||||||
|
|
||||||
|
|
||||||
## LayoutContainer
|
## LayoutContainer
|
||||||
|
@ -227,7 +228,7 @@ class LayoutContainer(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<group %s(%d)>' % (self.get_bbox(), len(self.objs)))
|
return ('<group %s>' % (self.get_bbox()))
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter(self.objs)
|
return iter(self.objs)
|
||||||
|
@ -267,8 +268,14 @@ class LayoutContainer(LayoutItem):
|
||||||
return self.weight
|
return self.weight
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return ((sum( obj.get_weight() for obj in self.objs )/2) <
|
if not self.objs: return None
|
||||||
sum( obj.get_weight() for obj in self.objs if obj.get_direction() ))
|
d = {}
|
||||||
|
for obj in self.objs:
|
||||||
|
k = obj.get_direction()
|
||||||
|
if k not in d: d[k] = 0
|
||||||
|
d[k] += 1
|
||||||
|
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
|
||||||
|
return direction
|
||||||
|
|
||||||
|
|
||||||
## FigureItem
|
## FigureItem
|
||||||
|
@ -327,7 +334,7 @@ class TextItem(LayoutItem):
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return len(self.text)
|
return len(self.text)
|
||||||
|
|
||||||
def get_direction(self):
|
def is_vertical(self):
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
|
|
||||||
|
@ -340,40 +347,41 @@ class TextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, None, (0,0,0,0), objs)
|
||||||
self.vertical = False
|
self.direction = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
||||||
|
|
||||||
def fixate(self):
|
def fixate(self):
|
||||||
LayoutContainer.fixate(self)
|
LayoutContainer.fixate(self)
|
||||||
|
self.direction = 'H'
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
self.vertical = bool(obj.get_direction())
|
if obj.is_vertical():
|
||||||
|
self.direction = 'V'
|
||||||
break
|
break
|
||||||
if 2 <= len(self.objs):
|
if 2 <= len(self.objs):
|
||||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||||
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
||||||
h = objs[0].voverlap(objs[1])
|
h = objs[0].voverlap(objs[1])
|
||||||
v = objs[0].hoverlap(objs[1])
|
v = objs[0].hoverlap(objs[1])
|
||||||
self.vertical = (h < v)
|
if h < v:
|
||||||
|
self.direction = 'V'
|
||||||
|
if self.direction == 'H':
|
||||||
|
self.lines = reorder_vh(self.objs, +1)
|
||||||
|
else:
|
||||||
|
self.lines = reorder_hv(self.objs, -1)
|
||||||
|
self.objs = []
|
||||||
|
for line in self.lines:
|
||||||
|
self.objs.extend(line)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return self.vertical
|
return self.direction
|
||||||
|
|
||||||
def get_lines(self, ratio):
|
def get_lines(self, ratio):
|
||||||
if self.get_direction():
|
if self.get_direction() == 'H':
|
||||||
for line in reorder_hv(self.objs, -1):
|
for line in self.lines:
|
||||||
s = ''
|
|
||||||
y0 = -INF
|
|
||||||
for obj in line:
|
|
||||||
if not isinstance(obj, TextItem): continue
|
|
||||||
margin = obj.get_margin(ratio)
|
|
||||||
if obj.y1+margin < y0:
|
|
||||||
s += ' '
|
|
||||||
s += obj.text
|
|
||||||
y0 = obj.y0
|
|
||||||
yield s
|
|
||||||
else:
|
|
||||||
for line in reorder_vh(self.objs, +1):
|
|
||||||
s = ''
|
s = ''
|
||||||
x1 = INF
|
x1 = INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
|
@ -384,6 +392,18 @@ class TextBox(LayoutContainer):
|
||||||
s += obj.text
|
s += obj.text
|
||||||
x1 = obj.x1
|
x1 = obj.x1
|
||||||
yield s
|
yield s
|
||||||
|
else:
|
||||||
|
for line in self.lines:
|
||||||
|
s = ''
|
||||||
|
y0 = -INF
|
||||||
|
for obj in line:
|
||||||
|
if not isinstance(obj, TextItem): continue
|
||||||
|
margin = obj.get_margin(ratio)
|
||||||
|
if obj.y1+margin < y0:
|
||||||
|
s += ' '
|
||||||
|
s += obj.text
|
||||||
|
y0 = obj.y0
|
||||||
|
yield s
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -404,10 +424,10 @@ class Page(LayoutContainer):
|
||||||
|
|
||||||
def group_text(self, ratio):
|
def group_text(self, ratio):
|
||||||
self.group_objs(ratio, TextBox)
|
self.group_objs(ratio, TextBox)
|
||||||
if self.get_direction():
|
if self.get_direction() == 'H':
|
||||||
lines = reorder_hv(self.objs, -1)
|
|
||||||
else:
|
|
||||||
lines = reorder_vh(self.objs, +1)
|
lines = reorder_vh(self.objs, +1)
|
||||||
|
else:
|
||||||
|
lines = reorder_hv(self.objs, -1)
|
||||||
self.objs = []
|
self.objs = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
self.objs.extend(line)
|
self.objs.extend(line)
|
|
@ -3,8 +3,8 @@ import sys
|
||||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
from pdfdevice import PDFDevice, PDFPageAggregator
|
from pdfdevice import PDFDevice, PDFPageAggregator
|
||||||
from layout import Page, LayoutContainer, TextItem, TextBox
|
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
|
from page import Page, LayoutContainer, TextItem, FigureItem, TextBox
|
||||||
from cmap import CMapDB
|
from cmap import CMapDB
|
||||||
|
|
||||||
|
|
||||||
|
@ -104,24 +104,33 @@ class TagExtractor(PDFDevice):
|
||||||
class SGMLConverter(PDFConverter):
|
class SGMLConverter(PDFConverter):
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def draw(item):
|
def render(item):
|
||||||
if isinstance(item, TextItem):
|
if isinstance(item, Page):
|
||||||
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
(e(item.font.fontname), item.get_direction(),
|
(item.id, item.get_bbox(), item.rotate))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</page>\n')
|
||||||
|
elif isinstance(item, TextItem):
|
||||||
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
|
(e(item.font.fontname), item.is_vertical(),
|
||||||
item.get_bbox(), item.fontsize))
|
item.get_bbox(), item.fontsize))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, FigureItem):
|
||||||
self.outfp.write('<group id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
draw(child)
|
render(child)
|
||||||
self.outfp.write('</group>\n')
|
self.outfp.write('</figure>\n')
|
||||||
|
elif isinstance(item, TextBox):
|
||||||
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
|
print item
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</textbox>\n')
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
render(page)
|
||||||
(page.id, page.get_bbox(), page.rotate))
|
|
||||||
draw(page)
|
|
||||||
self.outfp.write('</page>\n')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -150,7 +159,7 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def draw(item):
|
def render(item):
|
||||||
if isinstance(item, Page):
|
if isinstance(item, Page):
|
||||||
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
if self.pagenum:
|
if self.pagenum:
|
||||||
|
@ -158,7 +167,7 @@ class HTMLConverter(PDFConverter):
|
||||||
((self.yoffset-page.y1)*self.scale))
|
((self.yoffset-page.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||||
for child in item:
|
for child in item:
|
||||||
draw(child)
|
render(child)
|
||||||
elif isinstance(item, TextItem):
|
elif isinstance(item, TextItem):
|
||||||
if item.vertical:
|
if item.vertical:
|
||||||
wmode = 'tb-rl'
|
wmode = 'tb-rl'
|
||||||
|
@ -175,11 +184,11 @@ class HTMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
draw(child)
|
render(child)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
self.yoffset += page.y1
|
self.yoffset += page.y1
|
||||||
draw(page)
|
render(page)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagepad
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -204,7 +213,7 @@ class TextConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def draw(item):
|
def render(item):
|
||||||
if isinstance(item, TextItem):
|
if isinstance(item, TextItem):
|
||||||
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
|
@ -214,11 +223,11 @@ class TextConverter(PDFConverter):
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
for child in item:
|
for child in item:
|
||||||
draw(child)
|
render(child)
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
if self.pagenum:
|
if self.pagenum:
|
||||||
self.outfp.write('Page %d\n' % page.id)
|
self.outfp.write('Page %d\n' % page.id)
|
||||||
draw(page)
|
render(page)
|
||||||
self.outfp.write('\f')
|
self.outfp.write('\f')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -294,7 +303,7 @@ def main(argv):
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec)
|
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'text':
|
elif outtype == 'text':
|
||||||
|
|
|
@ -3,7 +3,7 @@ import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from layout import Page, FigureItem, TextItem
|
from page import Page, FigureItem, TextItem
|
||||||
from utils import mult_matrix, translate_matrix
|
from utils import mult_matrix, translate_matrix
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue