add novel layout analysis
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@187 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
ffaaea0bac
commit
e01cb43e31
|
@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined
|
|||
from pdftypes import LITERALS_DCT_DECODE
|
||||
from layout import LayoutContainer
|
||||
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||
from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextLine
|
||||
from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine
|
||||
from utils import apply_matrix_pt, mult_matrix
|
||||
from utils import enc, bbox2str
|
||||
|
||||
|
@ -218,6 +218,11 @@ class HTMLConverter(PDFConverter):
|
|||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTTextFlow):
|
||||
for child in item:
|
||||
render(child)
|
||||
if self.debug:
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTFigure):
|
||||
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
|
@ -294,6 +299,11 @@ class XMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
elif isinstance(item, LTTextFlow):
|
||||
self.outfp.write('<textflow bbox="%s">\n' % bbox2str(item.bbox))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textflow>\n')
|
||||
elif isinstance(item, LTChar):
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
|
||||
(enc(item.font.fontname), item.is_vertical(),
|
||||
|
|
|
@ -50,6 +50,375 @@ class LAParams(object):
|
|||
(self.direction, self.char_margin, self.line_margin, self.word_margin))
|
||||
|
||||
|
||||
## LayoutItem
|
||||
##
|
||||
class LayoutItem(object):
|
||||
|
||||
def __init__(self, bbox):
|
||||
self.set_bbox(bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<item bbox=%s>' % bbox2str(self.bbox))
|
||||
|
||||
def set_bbox(self, (x0,y0,x1,y1)):
|
||||
if x1 < x0: (x0,x1) = (x1,x0)
|
||||
if y1 < y0: (y0,y1) = (y1,y0)
|
||||
self.x0 = x0
|
||||
self.y0 = y0
|
||||
self.x1 = x1
|
||||
self.y1 = y1
|
||||
self.width = x1-x0
|
||||
self.height = y1-y0
|
||||
self.bbox = (x0, y0, x1, y1)
|
||||
return
|
||||
|
||||
def is_hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||
|
||||
def hdistance(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_hoverlap(obj):
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||
|
||||
def hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_hoverlap(obj):
|
||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||
else:
|
||||
return 0
|
||||
|
||||
def is_voverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
||||
|
||||
def vdistance(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_voverlap(obj):
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
def voverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_voverlap(obj):
|
||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
## LayoutContainer
|
||||
##
|
||||
class LayoutContainer(LayoutItem):
|
||||
|
||||
def __init__(self, bbox, objs=None):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
if objs:
|
||||
self.objs = objs[:]
|
||||
else:
|
||||
self.objs = []
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<container %s>' % bbox2str(self.bbox))
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.objs)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.objs)
|
||||
|
||||
def add(self, obj):
|
||||
self.objs.append(obj)
|
||||
return
|
||||
|
||||
def merge(self, container):
|
||||
self.objs.extend(container.objs)
|
||||
return
|
||||
|
||||
# fixate(): determines its boundery and writing direction.
|
||||
def fixate(self):
|
||||
if not self.width and self.objs:
|
||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||
for obj in self.objs:
|
||||
bx0 = min(bx0, obj.x0)
|
||||
by0 = min(by0, obj.y0)
|
||||
bx1 = max(bx1, obj.x1)
|
||||
by1 = max(by1, obj.y1)
|
||||
self.set_bbox((bx0, by0, bx1, by1))
|
||||
return
|
||||
|
||||
|
||||
## LTPolygon
|
||||
##
|
||||
class LTPolygon(LayoutItem):
|
||||
|
||||
def __init__(self, linewidth, pts):
|
||||
LayoutItem.__init__(self, get_bounds(pts))
|
||||
self.pts = pts
|
||||
self.linewidth = linewidth
|
||||
return
|
||||
|
||||
def get_pts(self):
|
||||
return ','.join( '%.3f,%.3f' % p for p in self.pts )
|
||||
|
||||
|
||||
## LTLine
|
||||
##
|
||||
class LTLine(LTPolygon):
|
||||
|
||||
def __init__(self, linewidth, p0, p1):
|
||||
LTPolygon.__init__(self, linewidth, [p0, p1])
|
||||
return
|
||||
|
||||
|
||||
## LTRect
|
||||
##
|
||||
class LTRect(LTPolygon):
|
||||
|
||||
def __init__(self, linewidth, (x0,y0,x1,y1)):
|
||||
LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
|
||||
return
|
||||
|
||||
|
||||
## LTImage
|
||||
##
|
||||
class LTImage(LayoutItem):
|
||||
|
||||
def __init__(self, name, type, srcsize, bbox, data):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.name = name
|
||||
self.type = type
|
||||
self.srcsize = srcsize
|
||||
self.data = data
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
(w,h) = self.srcsize
|
||||
return '<image %s %s %dx%d>' % (self.name, self.type, w, h)
|
||||
|
||||
|
||||
## LTText
|
||||
##
|
||||
class LTText(object):
|
||||
|
||||
def __init__(self, text):
|
||||
self.text = text
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<text %r>' % self.text
|
||||
|
||||
def is_upright(self):
|
||||
return True
|
||||
|
||||
|
||||
## LTAnon
|
||||
##
|
||||
class LTAnon(LTText):
|
||||
|
||||
pass
|
||||
|
||||
|
||||
## LTChar
|
||||
##
|
||||
class LTChar(LayoutItem, LTText):
|
||||
|
||||
debug = 1
|
||||
|
||||
def __init__(self, matrix, font, fontsize, scaling, cid):
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
self.fontsize = fontsize
|
||||
self.vertical = font.is_vertical()
|
||||
self.adv = font.char_width(cid) * fontsize * scaling
|
||||
try:
|
||||
text = font.to_unichr(cid)
|
||||
except PDFUnicodeNotDefined:
|
||||
text = '?'
|
||||
LTText.__init__(self, text)
|
||||
# compute the boundary rectangle.
|
||||
if self.vertical:
|
||||
# vertical
|
||||
size = font.get_size() * fontsize
|
||||
displacement = (1000 - font.char_disp(cid)) * fontsize * .001
|
||||
(_,displacement) = apply_matrix_norm(self.matrix, (0, displacement))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
tx -= dx/2
|
||||
ty += displacement
|
||||
bbox = (tx, ty+dy, tx+dx, ty)
|
||||
else:
|
||||
# horizontal
|
||||
size = font.get_size() * fontsize
|
||||
descent = font.get_descent() * fontsize
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
ty += descent
|
||||
bbox = (tx, ty, tx+dx, ty+dy)
|
||||
LayoutItem.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
if self.debug:
|
||||
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||
(matrix2str(self.matrix), self.font, self.fontsize,
|
||||
bbox2str(self.bbox), self.adv, self.text))
|
||||
else:
|
||||
return '<char %r>' % self.text
|
||||
|
||||
def get_size(self):
|
||||
return max(self.width, self.height)
|
||||
|
||||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
||||
def is_upright(self):
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
return 0 < a*d and b*c <= 0
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
class LTFigure(LayoutContainer):
|
||||
|
||||
def __init__(self, id, bbox, matrix):
|
||||
(x,y,w,h) = bbox
|
||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||
self.id = id
|
||||
self.matrix = matrix
|
||||
LayoutContainer.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%s matrix=%s>' %
|
||||
(self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||
|
||||
|
||||
## LTTextLine
|
||||
##
|
||||
class LTTextLine(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textline %s>' % bbox2str(self.bbox))
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
raise NotImplementedError
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
|
||||
|
||||
def __init__(self, objs, word_margin):
|
||||
LTTextLine.__init__(self, objs)
|
||||
LayoutContainer.fixate(self)
|
||||
objs = []
|
||||
x1 = INF
|
||||
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||
if isinstance(obj, LTChar) and word_margin:
|
||||
margin = word_margin * obj.width
|
||||
if x1 < obj.x0-margin:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
x1 = obj.x1
|
||||
self.objs = objs + [LTAnon('\n')]
|
||||
return
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
h = ratio*self.height
|
||||
return plane.find((self.x0, self.y0-h, self.x1, self.y1+h))
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
||||
def __init__(self, objs, word_margin):
|
||||
LTTextLine.__init__(self, objs)
|
||||
LayoutContainer.fixate(self)
|
||||
objs = []
|
||||
y0 = -INF
|
||||
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||
if isinstance(obj, LTChar) and word_margin:
|
||||
margin = word_margin * obj.height
|
||||
if obj.y1+margin < y0:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
y0 = obj.y0
|
||||
self.objs = objs + [LTAnon('\n')]
|
||||
return
|
||||
|
||||
def find_neighbors(self, plane, ratio):
|
||||
w = ratio*self.width
|
||||
return plane.find((self.x0-w, self.y0, self.x1+w, self.y1))
|
||||
|
||||
|
||||
## LTTextBox
|
||||
##
|
||||
## A set of text objects that are grouped within
|
||||
## a certain rectangular area.
|
||||
##
|
||||
class LTTextBox(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
|
||||
|
||||
def fixate(self):
|
||||
LTTextBox.fixate(self)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||
return
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
|
||||
|
||||
def fixate(self):
|
||||
LTTextBox.fixate(self)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||
return
|
||||
|
||||
|
||||
## LTTextFlow
|
||||
##
|
||||
class LTTextFlow(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
assert objs
|
||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||
LayoutContainer.fixate(self)
|
||||
return
|
||||
|
||||
class LTTextFlowHorizontal(LTTextFlow):
|
||||
|
||||
def __init__(self, objs):
|
||||
LTTextFlow.__init__(self, objs)
|
||||
# reorder the objects from top-left to bottom-right.
|
||||
self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
|
||||
return
|
||||
|
||||
class LTTextFlowVertical(LTTextFlow):
|
||||
|
||||
def __init__(self, objs):
|
||||
LTTextFlow.__init__(self, objs)
|
||||
# reorder the objects from top-right to bottom-left.
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||
return
|
||||
|
||||
|
||||
## Plane
|
||||
##
|
||||
## A data structure for objects placed on a plane.
|
||||
|
@ -116,401 +485,34 @@ class ClusterBuilder(object):
|
|||
cluster.fixate()
|
||||
return list(clusters)
|
||||
|
||||
def build_clusters(groupfunc, objs, (hratio, vratio)):
|
||||
def build_boxes(groupfunc, objs, *args):
|
||||
plane = Plane(objs)
|
||||
builder = ClusterBuilder(groupfunc)
|
||||
for obj in objs:
|
||||
margin = obj.get_margin()
|
||||
hmargin = hratio * margin
|
||||
vmargin = vratio * margin
|
||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||
neighbors = obj.find_neighbors(plane, *args)
|
||||
assert obj in neighbors, obj
|
||||
builder.group(neighbors)
|
||||
return builder.finish()
|
||||
|
||||
|
||||
## LayoutItem
|
||||
##
|
||||
class LayoutItem(object):
|
||||
|
||||
def __init__(self, bbox):
|
||||
self.set_bbox(bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<item bbox=%s>' % bbox2str(self.bbox))
|
||||
|
||||
def set_bbox(self, (x0,y0,x1,y1)):
|
||||
if x1 < x0: (x0,x1) = (x1,x0)
|
||||
if y1 < y0: (y0,y1) = (y1,y0)
|
||||
self.x0 = x0
|
||||
self.y0 = y0
|
||||
self.x1 = x1
|
||||
self.y1 = y1
|
||||
self.width = x1-x0
|
||||
self.height = y1-y0
|
||||
self.bbox = (x0, y0, x1, y1)
|
||||
return
|
||||
|
||||
def is_hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||
|
||||
def hdistance(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_hoverlap(obj):
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||
|
||||
def hoverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_hoverlap(obj):
|
||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||
else:
|
||||
return 0
|
||||
|
||||
def is_voverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
||||
|
||||
def vdistance(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_voverlap(obj):
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
def voverlap(self, obj):
|
||||
assert isinstance(obj, LayoutItem)
|
||||
if self.is_voverlap(obj):
|
||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
else:
|
||||
return 0
|
||||
|
||||
def get_margin(self):
|
||||
return 0
|
||||
|
||||
def get_weight(self):
|
||||
return 0
|
||||
|
||||
|
||||
## LayoutContainer
|
||||
##
|
||||
class LayoutContainer(LayoutItem):
|
||||
|
||||
def __init__(self, bbox, objs=None):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
if objs:
|
||||
self.objs = objs[:]
|
||||
else:
|
||||
self.objs = []
|
||||
self.weight = None
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<container %s>' % bbox2str(self.bbox))
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.objs)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.objs)
|
||||
|
||||
def add(self, obj):
|
||||
self.objs.append(obj)
|
||||
return
|
||||
|
||||
def merge(self, container):
|
||||
self.objs.extend(container.objs)
|
||||
return
|
||||
|
||||
# fixate(): determines its boundery and writing direction.
|
||||
def fixate(self):
|
||||
if not self.width and self.objs:
|
||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||
for obj in self.objs:
|
||||
bx0 = min(bx0, obj.x0)
|
||||
by0 = min(by0, obj.y0)
|
||||
bx1 = max(bx1, obj.x1)
|
||||
by1 = max(by1, obj.y1)
|
||||
self.set_bbox((bx0, by0, bx1, by1))
|
||||
self.weight = sum( obj.get_weight() for obj in self.objs )
|
||||
return
|
||||
|
||||
def get_weight(self):
|
||||
return self.weight
|
||||
|
||||
|
||||
## LTPolygon
|
||||
##
|
||||
class LTPolygon(LayoutItem):
|
||||
|
||||
def __init__(self, linewidth, pts):
|
||||
LayoutItem.__init__(self, get_bounds(pts))
|
||||
self.pts = pts
|
||||
self.linewidth = linewidth
|
||||
return
|
||||
|
||||
def get_pts(self):
|
||||
return ','.join( '%.3f,%.3f' % p for p in self.pts )
|
||||
|
||||
|
||||
## LTLine
|
||||
##
|
||||
class LTLine(LTPolygon):
|
||||
|
||||
def __init__(self, linewidth, p0, p1):
|
||||
LTPolygon.__init__(self, linewidth, [p0, p1])
|
||||
return
|
||||
|
||||
|
||||
## LTRect
|
||||
##
|
||||
class LTRect(LTPolygon):
|
||||
|
||||
def __init__(self, linewidth, (x0,y0,x1,y1)):
|
||||
LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
|
||||
return
|
||||
|
||||
|
||||
## LTImage
|
||||
##
|
||||
class LTImage(LayoutItem):
|
||||
|
||||
def __init__(self, name, type, srcsize, bbox, data):
|
||||
LayoutItem.__init__(self, bbox)
|
||||
self.name = name
|
||||
self.type = type
|
||||
self.srcsize = srcsize
|
||||
self.data = data
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
(w,h) = self.srcsize
|
||||
return '<image %s %s %dx%d>' % (self.name, self.type, w, h)
|
||||
|
||||
def get_weight(self):
|
||||
return 0
|
||||
|
||||
|
||||
## LTText
|
||||
##
|
||||
class LTText(object):
|
||||
|
||||
def __init__(self, text):
|
||||
self.text = text
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<text %r>' % self.text
|
||||
|
||||
def get_weight(self):
|
||||
return len(self.text)
|
||||
|
||||
def is_upright(self):
|
||||
return True
|
||||
|
||||
|
||||
## LTAnon
|
||||
##
|
||||
class LTAnon(LTText):
|
||||
|
||||
def get_weight(self):
|
||||
return 0
|
||||
|
||||
|
||||
## LTChar
|
||||
##
|
||||
class LTChar(LayoutItem, LTText):
|
||||
|
||||
debug = 1
|
||||
|
||||
def __init__(self, matrix, font, fontsize, scaling, cid):
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
self.fontsize = fontsize
|
||||
self.vertical = font.is_vertical()
|
||||
self.adv = font.char_width(cid) * fontsize * scaling
|
||||
try:
|
||||
text = font.to_unichr(cid)
|
||||
except PDFUnicodeNotDefined:
|
||||
text = '?'
|
||||
LTText.__init__(self, text)
|
||||
# compute the boundary rectangle.
|
||||
if self.vertical:
|
||||
# vertical
|
||||
size = font.get_size() * fontsize
|
||||
displacement = (1000 - font.char_disp(cid)) * fontsize * .001
|
||||
(_,displacement) = apply_matrix_norm(self.matrix, (0, displacement))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
tx -= dx/2
|
||||
ty += displacement
|
||||
bbox = (tx, ty+dy, tx+dx, ty)
|
||||
else:
|
||||
# horizontal
|
||||
size = font.get_size() * fontsize
|
||||
descent = font.get_descent() * fontsize
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
ty += descent
|
||||
bbox = (tx, ty, tx+dx, ty+dy)
|
||||
LayoutItem.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
if self.debug:
|
||||
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||
(matrix2str(self.matrix), self.font, self.fontsize,
|
||||
bbox2str(self.bbox), self.adv, self.text))
|
||||
else:
|
||||
return '<char %r>' % self.text
|
||||
|
||||
def get_margin(self):
|
||||
return min(self.width, self.height)
|
||||
|
||||
def get_size(self):
|
||||
return max(self.width, self.height)
|
||||
|
||||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
||||
def is_upright(self):
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
return 0 < a*d and b*c <= 0
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
class LTFigure(LayoutContainer):
|
||||
|
||||
def __init__(self, id, bbox, matrix):
|
||||
(x,y,w,h) = bbox
|
||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||
self.id = id
|
||||
self.matrix = matrix
|
||||
LayoutContainer.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%s matrix=%s>' %
|
||||
(self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||
|
||||
|
||||
## LTTextLine
|
||||
##
|
||||
class LTTextLine(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textline %s>' % bbox2str(self.bbox))
|
||||
|
||||
def get_margin(self):
|
||||
return min(self.width, self.height)
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
|
||||
|
||||
def __init__(self, objs, word_margin):
|
||||
LTTextLine.__init__(self, objs)
|
||||
LayoutContainer.fixate(self)
|
||||
objs = []
|
||||
x1 = INF
|
||||
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||
if isinstance(obj, LTChar) and word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if x1 < obj.x0-margin:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
x1 = obj.x1
|
||||
self.objs = objs + [LTAnon('\n')]
|
||||
return
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
||||
def __init__(self, objs, word_margin):
|
||||
LTTextLine.__init__(self, objs)
|
||||
LayoutContainer.fixate(self)
|
||||
objs = []
|
||||
y0 = -INF
|
||||
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||
if isinstance(obj, LTChar) and word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if obj.y1+margin < y0:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
y0 = obj.y0
|
||||
self.objs = objs + [LTAnon('\n')]
|
||||
return
|
||||
|
||||
|
||||
## LTTextBox
|
||||
##
|
||||
## A set of text objects that are grouped within
|
||||
## a certain rectangular area.
|
||||
##
|
||||
class LTTextBox(LayoutContainer):
|
||||
|
||||
def __init__(self, objs):
|
||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
|
||||
|
||||
def fixate(self):
|
||||
LTTextBox.fixate(self)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||
return
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
|
||||
|
||||
def fixate(self):
|
||||
LTTextBox.fixate(self)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||
return
|
||||
|
||||
|
||||
def tsort(objs, f):
|
||||
gi = dict( (obj,[]) for obj in objs )
|
||||
go = dict( (obj,[]) for obj in objs )
|
||||
for obj1 in objs:
|
||||
for obj2 in objs:
|
||||
if obj1 is obj2: continue
|
||||
if f(obj1, obj2): # obj1 -> obj2
|
||||
go[obj1].append(obj2)
|
||||
gi[obj2].append(obj1)
|
||||
r = objs[:]
|
||||
s = []
|
||||
while r:
|
||||
for obj in r:
|
||||
if not go[obj] or gi[obj]: continue
|
||||
for c in go[obj]:
|
||||
gi[c].remove(obj)
|
||||
del gi[obj]
|
||||
del go[obj]
|
||||
r.remove(obj)
|
||||
s.append(obj)
|
||||
break
|
||||
else:
|
||||
obj = r.pop()
|
||||
del gi[obj]
|
||||
del go[obj]
|
||||
s.append(obj)
|
||||
return s
|
||||
def group_hier(groupfunc, objs, distfunc):
|
||||
assert objs
|
||||
while 2 <= len(objs):
|
||||
mindist = INF
|
||||
minpair = None
|
||||
objs.sort(key=lambda obj: obj.width*obj.height)
|
||||
for (i,obj0) in enumerate(objs):
|
||||
for obj1 in objs[i+1:]:
|
||||
d = distfunc(obj0, obj1)
|
||||
if d < mindist:
|
||||
mindist = d
|
||||
minpair = (obj0, obj1)
|
||||
assert minpair
|
||||
(obj0, obj1) = minpair
|
||||
objs.remove(obj0)
|
||||
objs.remove(obj1)
|
||||
objs.append(groupfunc([obj0, obj1]))
|
||||
assert len(objs) == 1
|
||||
return objs.pop()
|
||||
|
||||
|
||||
## LTPage
|
||||
|
@ -538,7 +540,7 @@ class LTPage(LayoutContainer):
|
|||
textobjs = self.analyze_layout_vertical(textobjs, laparams)
|
||||
else:
|
||||
textobjs = self.analyze_layout_horizontal(textobjs, laparams)
|
||||
self.objs = textobjs + otherobjs
|
||||
self.objs = [textobjs] + otherobjs
|
||||
return
|
||||
|
||||
def analyze_layout_horizontal(self, objs, laparams):
|
||||
|
@ -567,16 +569,14 @@ class LTPage(LayoutContainer):
|
|||
prev = cur
|
||||
if line:
|
||||
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
||||
boxes = build_clusters(LTTextBoxHorizontal, lines, (0, laparams.line_margin))
|
||||
boxes = build_boxes(LTTextBoxHorizontal, lines, laparams.line_margin)
|
||||
|
||||
def dist(obj1, obj2):
|
||||
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
||||
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
||||
obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
|
||||
def horder(obj1, obj2):
|
||||
if obj1.is_hoverlap(obj2):
|
||||
return obj2.y1 < obj1.y0
|
||||
elif obj1.is_voverlap(obj2):
|
||||
return obj1.x1 < obj2.x0
|
||||
else:
|
||||
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||
return tsort(boxes, horder)
|
||||
return group_hier(LTTextFlowHorizontal, boxes, dist)
|
||||
|
||||
def analyze_layout_vertical(self, objs, laparams):
|
||||
|
||||
|
@ -593,7 +593,6 @@ class LTPage(LayoutContainer):
|
|||
#
|
||||
# |<--->|
|
||||
# (line_overlap)
|
||||
#
|
||||
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
|
||||
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
|
||||
|
||||
|
@ -609,13 +608,11 @@ class LTPage(LayoutContainer):
|
|||
prev = cur
|
||||
if line:
|
||||
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
||||
boxes = build_clusters(LTTextBoxVertical, lines, (laparams.line_margin, 0))
|
||||
boxes = build_boxes(LTTextBoxVertical, lines, laparams.line_margin)
|
||||
|
||||
def vorder(obj1, obj2):
|
||||
if obj1.is_voverlap(obj2):
|
||||
return obj2.y1 < obj1.y0
|
||||
elif obj1.is_hoverlap(obj2):
|
||||
return obj1.x1 < obj2.x0
|
||||
else:
|
||||
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||
return tsort(boxes, vorder)
|
||||
def dist(obj1, obj2):
|
||||
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
||||
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
||||
obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
|
||||
return group_hier(LTTextFlowVertical, boxes, dist)
|
||||
|
|
Loading…
Reference in New Issue