layout analysis improved.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@93 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-04 08:29:36 +00:00
parent 43e5c05307
commit fd27d16acc
5 changed files with 285 additions and 120 deletions

View File

@ -1,11 +1,18 @@
#!/usr/bin/env python
import sys
from pdfdevice import PageItem
from utils import pick
INF = sys.maxint
## binary search
## bsearch
##
## Finds objects whose coordinates overlap with [v0,v1].
## It performs binary search so that the processing time
## should be around O(log n).
##
def bsearch(objs, v0, v1):
assert v0 <= v1
if v1 <= v0: return []
i0 = 0
i1 = len(objs)-1
while i0 <= i1:
@ -31,27 +38,79 @@ def bsearch(objs, v0, v1):
return []
## reorder_hv, reorder_vh
##
## Reorders objects according to their writing direction.
##
def reorder_hv(objs, hdir):
    """Group objects into horizontal lines, listed from top to bottom.

    Objects are visited in order of descending y1.  An object joins the
    current line unless the most recently added object's voverlap() with
    it is falsy, in which case a new line is started.  Each finished line
    is sorted left-to-right (by x0) when hdir is positive, otherwise
    right-to-left (by descending x1).

    Returns a list of lines, each a list of objects.  An empty input
    yields an empty list rather than a single empty line.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    lines = []
    line = []
    for obj in sorted(objs, key=lambda obj: -obj.y1):
        # Only the last object added to the line is tested for overlap.
        if line and not line[-1].voverlap(obj):
            lines.append(sorted(line, key=hkey))
            line = []
        line.append(obj)
    # Flush the trailing line, but never emit an empty one.
    if line:
        lines.append(sorted(line, key=hkey))
    return lines
def reorder_vh(objs, hdir):
    """Group objects into vertical columns, each listed top to bottom.

    Objects are visited left-to-right (by x0) when hdir is positive,
    otherwise right-to-left (by descending x1).  An object joins the
    current column unless the most recently added object's hoverlap()
    with it is falsy, in which case a new column is started.  Each
    finished column is sorted by descending y1.

    Returns a list of columns, each a list of objects.  An empty input
    yields an empty list rather than a single empty column.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    vkey = (lambda obj: -obj.y1)
    columns = []
    column = []
    for obj in sorted(objs, key=hkey):
        # Only the last object added to the column is tested for overlap.
        if column and not column[-1].hoverlap(obj):
            columns.append(sorted(column, key=vkey))
            column = []
        column.append(obj)
    # Flush the trailing column, but never emit an empty one.
    if column:
        columns.append(sorted(column, key=vkey))
    return columns
## Plane
##
## A data structure for objects placed on a plane.
## Can efficiently find objects in a certain rectangular area.
## It maintains two parallel lists of objects, each of
## which is sorted by its x or y coordinate.
##
class Plane(object):
def __init__(self):
def __init__(self, objs):
self.xobjs = []
self.yobjs = []
for obj in objs:
self.place(obj)
self.fixate()
return
def add(self, (x0,y0,x1,y1), obj):
self.xobjs.append((x0, obj))
self.xobjs.append((x1, obj))
self.yobjs.append((y0, obj))
self.yobjs.append((y1, obj))
# place(obj): place an object in a certain area.
def place(self, obj):
self.xobjs.append((obj.x0, obj))
self.xobjs.append((obj.x1, obj))
self.yobjs.append((obj.y0, obj))
self.yobjs.append((obj.y1, obj))
return
def finish(self):
# fixate(): you must call this after adding all objects.
def fixate(self):
self.xobjs.sort()
self.yobjs.sort()
return
# find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)):
xobjs = set(bsearch(self.xobjs, x0, x1))
yobjs = set(bsearch(self.yobjs, y0, y1))
@ -59,68 +118,127 @@ class Plane(object):
return objs
## TextBox
##
## A set of text objects that are clustered in
## a certain rectangular area.
##
class TextBox(PageItem):
    """A cluster of text objects occupying one rectangular area.

    The bounding box and the writing direction are only established by
    finish(), which is also the only place PageItem.__init__ is called.
    """

    def __init__(self, objs):
        self.objs = set(objs)
        self.vertical = False
        # Cached total text length; stays None until finish() fills it in.
        self.length = None
        return

    def __repr__(self):
        return ('<textbox %s %s items=%d>' % (self.bbox(), self.vertical, len(self.objs)))

    def __len__(self):
        # __len__ must return an integer, so count on demand while the
        # cached value has not been computed by finish() yet.
        if self.length is None:
            return sum( len(obj) for obj in self.objs )
        return self.length

    # merge(box): absorbs the objects of another textbox.
    def merge(self, box):
        self.objs.update(box.objs)
        return

    # finish(): determines its boundary and writing direction.
    def finish(self):
        assert self.objs
        (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
        for obj in self.objs:
            bx0 = min(bx0, obj.x0)
            by0 = min(by0, obj.y0)
            bx1 = max(bx1, obj.x1)
            by1 = max(by1, obj.y1)
        PageItem.__init__(self, (bx0, by0, bx1, by1))
        self.length = sum( len(obj) for obj in self.objs )
        # Start from the direction of one member (whichever the set
        # yields first) ...
        for obj in self.objs:
            self.vertical = obj.vertical
            break
        # ... then, if the two leading objects are both of length 1,
        # decide from how they overlap each other instead.
        if 2 <= len(self.objs):
            objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            if len(objs[0]) == 1 and len(objs[1]) == 1:
                h = objs[0].voverlap(objs[1])
                v = objs[0].hoverlap(objs[1])
                self.vertical = (h < v)
        return

    # lines(ratio): generates the text of each line in this box.
    # A single space is inserted between two consecutive objects when
    # the gap between them exceeds abs(fontsize * ratio).
    def lines(self, ratio):
        if self.vertical:
            objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            for line in reorder_vh(objs, -1):
                parts = []
                # -INF guarantees no space is emitted before the first object.
                y0 = -INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if obj.y1 < y0-margin:
                        parts.append(' ')
                    parts.append(obj.text)
                    y0 = obj.y0
                yield ''.join(parts)
        else:
            objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
            for line in reorder_hv(objs, +1):
                parts = []
                # INF guarantees no space is emitted before the first object.
                x1 = INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if x1+margin < obj.x0:
                        parts.append(' ')
                    parts.append(obj.text)
                    x1 = obj.x1
                yield ''.join(parts)
        return
## ClusterSet
##
## Maintains a set of TextBox objects.
## It incrementally constructs TextBox objects
## and groups them when necessary. It gives
## a sequence of TextBox objects that represent
## the text stream of that page.
##
class ClusterSet(object):
def __init__(self):
self.clusters = {}
return
def add(self, obj):
self.clusters[obj] = (obj,)
return
def merge(self, objs):
allobjs = set(objs)
# add(objs): groups text objects if necessary.
def add(self, objs):
c = TextBox(objs)
for obj in objs:
if obj in self.clusters:
allobjs.update(self.clusters[obj])
c = tuple(allobjs)
for obj in allobjs:
c.merge(self.clusters[obj])
for obj in c.objs:
self.clusters[obj] = c
return
# finish(): returns all the TextBoxes in a page.
def finish(self):
return set(self.clusters.itervalues())
r = set(self.clusters.itervalues())
for textbox in r:
textbox.finish()
return r
def cluster_pageobjs(objs, ratio):
idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
plane = Plane()
for obj in objs:
plane.add(obj.bbox, obj)
plane.finish()
# cluster_textobjs
def cluster_textobjs(objs, ratio):
plane = Plane(objs)
cset = ClusterSet()
for obj in objs:
(bx0,by0,bx1,by1) = obj.bbox
margin = abs(obj.fontsize * ratio)
x0 = min(bx0,bx1)
y0 = min(by0,by1)
x1 = max(bx0,bx1)
y1 = max(by0,by1)
found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
if len(found) == 1:
cset.add(found.pop())
else:
cset.merge(found)
clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
cset.add(neighbors)
clusters = cset.finish()
vertical = ((sum( len(textbox) for textbox in clusters )/2) <
sum( len(textbox) for textbox in clusters if textbox.vertical ))
if vertical:
lines = reorder_hv(clusters, -1)
else:
lines = reorder_vh(clusters, +1)
r = []
for objs in clusters:
objs = sorted(objs, key=lambda obj: idx[obj])
h = v = 0
(bx0,by0,bx1,by1) = objs[0].bbox
(lx0,ly0,_,_) = objs[0].bbox
for obj in objs[1:]:
(x0,y0,x1,y1) = obj.bbox
if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
v += 1
else:
h += 1
(lx0,ly0) = (x0,y0)
bx0 = min(bx0, x0)
bx1 = max(bx1, x1)
by0 = min(by0, y0)
by1 = max(by1, y1)
r.append(((bx0,by0,bx1,by1), h < v, objs))
for line in lines:
r.extend(line)
return r

View File

@ -2,7 +2,7 @@
import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
@ -19,7 +19,7 @@ def get_textobjs(item, r=None):
if r == None: r = []
if isinstance(item, TextItem):
r.append(item)
elif isinstance(item, PageItem):
elif isinstance(item, Page):
for child in item.objs:
get_textobjs(child, r)
return r
@ -49,8 +49,8 @@ class SGMLConverter(PDFConverter):
f(child)
self.outfp.write('</figure>\n')
elif isinstance(item, TextItem):
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname, self.codec), item.direction, bbox, item.fontsize))
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</text>\n')
bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox
@ -79,42 +79,45 @@ class HTMLConverter(PDFConverter):
return
def end_page(self, page):
from cluster import cluster_pageobjs
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page)
(x0,y0,x1,y1) = page.bbox
self.yoffset += y1
self.yoffset += page.y1
if self.pagenum:
self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
((self.yoffset-y1)*self.scale, page.id, page.id))
((self.yoffset-page.y1)*self.scale, page.id, page.id))
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
(page.x0*self.scale, (self.yoffset-page.y1)*self.scale,
page.width*self.scale, page.height*self.scale))
def draw(item):
if isinstance(item, FigureItem):
for child in item.objs:
draw(child)
elif isinstance(item, TextItem):
if item.direction == 2:
if item.vertical:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
(x0,y0,x1,y1) = item.bbox
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
if self.show_text_border:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.width*self.scale, self.height*self.scale))
for child in page.objs:
draw(child)
if self.cluster_margin:
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
for ((x0,y0,x1,y1),_,objs) in clusters:
clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin)
for textbox in clusters:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
(textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale,
textbox.width*self.scale, textbox.height*self.scale))
self.yoffset += self.pagepad
return
@ -135,30 +138,25 @@ class TextConverter(PDFConverter):
if cluster_margin == None:
cluster_margin = 0.5
self.cluster_margin = cluster_margin
self.word_margin = 0.2
return
def end_page(self, page):
from cluster import cluster_pageobjs
from cluster import cluster_textobjs
page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)
if self.cluster_margin:
textobjs = get_textobjs(page)
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
for (_,vertical,objs) in clusters:
for (i,item) in enumerate(objs):
(x0,y0,x1,y1) = item.bbox
if (i and
((not vertical and (y1 < ly0 or ly1 < y0)) or
(vertical and (x1 < lx0 or lx1 < x0)))):
self.outfp.write('\n')
(lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
self.outfp.write(item.text.encode(self.codec, 'replace'))
self.outfp.write('\n\n')
clusters = cluster_textobjs(textobjs, self.cluster_margin)
for textbox in clusters:
for line in textbox.lines(self.word_margin):
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
self.outfp.write('\n')
else:
for item in page.objs:
if isinstance(item, TextItem):
self.outfp.write(item.text.encode(self.codec, 'replace'))
for obj in page.objs:
if isinstance(obj, TextItem):
self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
self.outfp.write('\f')
return

View File

@ -52,74 +52,119 @@ class PDFDevice(object):
return
## PageItem
## Page
##
class PageItem(object):
def __init__(self, (x0,y0,x1,y1)):
#assert x0 <= x1 and y0 <= y1
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.width = x1-x0
self.height = y1-y0
return
def __repr__(self):
return ('<pageitem bbox=%s>' % (self.bbox()))
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
self.id = id
self.bbox = (x0, y0, x1, y1)
self.rotate = rotate
def bbox(self):
return rect2str((self.x0, self.y0, self.x1, self.y1))
def hoverlap(self, obj):
assert isinstance(obj, PageItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return 0
else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def voverlap(self, obj):
assert isinstance(obj, PageItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
return 0
else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
class PageContainer(PageItem):
def __init__(self, bbox):
PageItem.__init__(self, bbox)
self.objs = []
return
def __repr__(self):
return ('<page id=%r bbox=%r rotate=%r>' % (self.id, self.bbox, self.rotate))
def add(self, obj):
self.objs.append(obj)
return
class Page(PageContainer):
def __init__(self, id, bbox, rotate=0):
PageContainer.__init__(self, bbox)
self.id = id
self.rotate = rotate
return
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.bbox(), self.rotate))
## FigureItem
##
class FigureItem(PageItem):
class FigureItem(PageContainer):
def __init__(self, id, bbox):
PageContainer.__init__(self, bbox)
self.id = id
return
def __repr__(self):
return ('<figure id=%r bbox=%r>' % (self.id, self.bbox))
return ('<figure id=%r bbox=%s>' % (self.id, self.bbox()))
## TextItem
##
class TextItem(object):
class TextItem(PageItem):
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
self.matrix = matrix
self.font = font
(_,_,_,_,tx,ty) = self.matrix
self.direction = 0
self.text = ''
adv = 0
for (char,cid) in chars:
self.text += char
adv += font.char_width(cid)
self.vertical = self.font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.font.is_vertical():
if not self.vertical:
# horizontal text
self.direction = 1
self.vertical = False
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
self.adv = (dx, 0)
self.bbox = (tx, ty, tx+dx, ty+dy)
bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.direction = 2
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
self.adv = (0, dy)
self.bbox = (tx, ty+dy, tx+dx, ty)
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
PageItem.__init__(self, bbox)
return
def __len__(self):
return len(self.text)
def __repr__(self):
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
(matrix2str(self.matrix), self.font, self.fontsize,
rect2str(self.bbox), self.text, point2str(self.adv)))
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
(matrix2str(self.matrix), self.font, self.fontsize, self.bbox(),
point2str(self.adv), self.text))
## PDFPageAggregator
@ -133,7 +178,7 @@ class PDFPageAggregator(PDFDevice):
return
def begin_page(self, page):
self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
return
def end_page(self, _):
@ -177,7 +222,8 @@ class PDFPageAggregator(PDFDevice):
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
@ -199,7 +245,7 @@ class PDFPageAggregator(PDFDevice):
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if cid == 32 and not font.is_multibyte():
if textstate.wordspace and not font.is_multibyte() and cid == 32:
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))

View File

@ -359,9 +359,6 @@ class PDFFont(object):
def string_width(self, s):
return sum( self.char_width(cid) for cid in self.decode(s) )
def space_width(self):
return max(self.char_width(32), self.char_width(44), self.char_width(46)) * 0.5
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
@ -572,9 +569,6 @@ class PDFCIDFont(PDFFont):
chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars )
def space_width(self):
return 0
# main
def main(argv):

View File

@ -98,3 +98,12 @@ def decode_text(s):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
##
def pick(seq, func, maxobj=None):
    """Return the element of seq for which func(element) is largest.

    On ties the earliest such element wins, because a later element
    only replaces the current best when its score is strictly greater.
    If seq is empty, maxobj is returned unchanged.
    """
    maxscore = None
    for obj in seq:
        score = func(obj)
        # Identity test: avoids invoking the score type's own __eq__.
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj