code cleanup
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@188 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
e01cb43e31
commit
cd39642abe
2
Makefile
2
Makefile
|
@ -50,6 +50,6 @@ $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
|
||||||
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
|
||||||
|
|
||||||
test: cmap
|
test: cmap
|
||||||
cd samples && $(MAKE) all
|
cd samples && $(MAKE) test
|
||||||
test_clean:
|
test_clean:
|
||||||
-cd samples && $(MAKE) clean
|
-cd samples && $(MAKE) clean
|
||||||
|
|
|
@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined
|
||||||
from pdftypes import LITERALS_DCT_DECODE
|
from pdftypes import LITERALS_DCT_DECODE
|
||||||
from layout import LayoutContainer
|
from layout import LayoutContainer
|
||||||
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||||
from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine
|
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
|
||||||
from utils import apply_matrix_pt, mult_matrix
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
from utils import enc, bbox2str
|
from utils import enc, bbox2str
|
||||||
|
|
||||||
|
@ -32,9 +32,7 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
assert isinstance(self.cur_item, LTPage)
|
assert isinstance(self.cur_item, LTPage)
|
||||||
self.cur_item.fixate()
|
self.cur_item.fixate(self.laparams)
|
||||||
if self.laparams:
|
|
||||||
self.cur_item.analyze_layout(self.laparams)
|
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return self.cur_item
|
return self.cur_item
|
||||||
|
|
||||||
|
@ -143,7 +141,7 @@ class TextConverter(PDFConverter):
|
||||||
self.write('\n')
|
self.write('\n')
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write('Page %d\n' % page.id)
|
self.write('Page %s\n' % page.pageid)
|
||||||
render(page)
|
render(page)
|
||||||
self.write('\f')
|
self.write('\f')
|
||||||
return
|
return
|
||||||
|
@ -170,7 +168,16 @@ class HTMLConverter(PDFConverter):
|
||||||
def write_rect(self, color, width, x, y, w, h):
|
def write_rect(self, color, width, x, y, w, h):
|
||||||
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
(color, width,
|
||||||
|
x*self.scale, (self.yoffset-y)*self.scale,
|
||||||
|
w*self.scale, h*self.scale))
|
||||||
|
return
|
||||||
|
|
||||||
|
def write_text(self, text, x, y, size):
|
||||||
|
self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||||
|
(x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
|
||||||
|
self.write(text)
|
||||||
|
self.outfp.write('</span>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_image(self, image):
|
def write_image(self, image):
|
||||||
|
@ -194,37 +201,30 @@ class HTMLConverter(PDFConverter):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.yoffset += item.y1
|
self.yoffset += item.y1
|
||||||
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-item.y1)*self.scale))
|
((self.yoffset-item.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.pageid, page.pageid))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
self.write_text(item.text, item.x0, item.y1, item.get_size())
|
||||||
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
|
||||||
item.get_size()*self.scale))
|
|
||||||
self.write(item.text)
|
|
||||||
self.outfp.write('</span>\n')
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTPolygon):
|
elif isinstance(item, LTPolygon):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextLine):
|
elif isinstance(item, LTTextLine):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
elif isinstance(item, LTTextFlow):
|
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_text(str(item.index+1), item.x0, item.y1, 20)
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
|
@ -233,6 +233,14 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
if page.layout:
|
||||||
|
def show_layout(item):
|
||||||
|
if isinstance(item, LTTextGroup):
|
||||||
|
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
|
||||||
|
for child in item:
|
||||||
|
show_layout(child)
|
||||||
|
return
|
||||||
|
show_layout(page.layout)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagepad
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -270,13 +278,13 @@ class XMLConverter(PDFConverter):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
(item.id, bbox2str(item.bbox), item.rotate))
|
(item.pageid, bbox2str(item.bbox), item.rotate))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, LTLine) and item.direction:
|
elif isinstance(item, LTLine):
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' %
|
self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
|
||||||
(item.linewidth, item.direction, bbox2str(item.bbox)))
|
(item.linewidth, bbox2str(item.bbox)))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
|
||||||
(item.linewidth, bbox2str(item.bbox)))
|
(item.linewidth, bbox2str(item.bbox)))
|
||||||
|
@ -284,8 +292,8 @@ class XMLConverter(PDFConverter):
|
||||||
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
|
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
|
||||||
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
|
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' %
|
self.outfp.write('<figure name="%s" bbox="%s">\n' %
|
||||||
(item.id, bbox2str(item.bbox)))
|
(item.name, bbox2str(item.bbox)))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
|
@ -295,15 +303,10 @@ class XMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textline>\n')
|
self.outfp.write('</textline>\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox))
|
self.outfp.write('<textbox id="%d" bbox="%s">\n' % (item.index, bbox2str(item.bbox)))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
elif isinstance(item, LTTextFlow):
|
|
||||||
self.outfp.write('<textflow bbox="%s">\n' % bbox2str(item.bbox))
|
|
||||||
for child in item:
|
|
||||||
render(child)
|
|
||||||
self.outfp.write('</textflow>\n')
|
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
|
||||||
(enc(item.font.fontname), item.is_vertical(),
|
(enc(item.font.fontname), item.is_vertical(),
|
||||||
|
@ -325,6 +328,19 @@ class XMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
if page.layout:
|
||||||
|
def show_layout(item):
|
||||||
|
if isinstance(item, LTTextBox):
|
||||||
|
self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
|
||||||
|
elif isinstance(item, LTTextGroup):
|
||||||
|
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
|
||||||
|
for child in item:
|
||||||
|
show_layout(child)
|
||||||
|
self.outfp.write('</textgroup>\n')
|
||||||
|
return
|
||||||
|
self.outfp.write('<layout>\n')
|
||||||
|
show_layout(page.layout)
|
||||||
|
self.outfp.write('</layout>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
|
|
@ -286,18 +286,18 @@ class LTChar(LayoutItem, LTText):
|
||||||
##
|
##
|
||||||
class LTFigure(LayoutContainer):
|
class LTFigure(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, matrix):
|
def __init__(self, name, bbox, matrix):
|
||||||
(x,y,w,h) = bbox
|
(x,y,w,h) = bbox
|
||||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||||
self.id = id
|
self.name = name
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
LayoutContainer.__init__(self, bbox)
|
LayoutContainer.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<figure id=%r bbox=%s matrix=%s>' %
|
return ('<figure %r bbox=%s matrix=%s>' %
|
||||||
(self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
|
(self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||||
|
|
||||||
|
|
||||||
## LTTextLine
|
## LTTextLine
|
||||||
|
@ -369,10 +369,11 @@ class LTTextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, (0,0,0,0), objs)
|
||||||
|
self.index = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
|
return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20]))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||||
|
@ -392,9 +393,9 @@ class LTTextBoxVertical(LTTextBox):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTTextFlow
|
## LTTextGroup
|
||||||
##
|
##
|
||||||
class LTTextFlow(LayoutContainer):
|
class LTTextGroup(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
assert objs
|
assert objs
|
||||||
|
@ -402,18 +403,18 @@ class LTTextFlow(LayoutContainer):
|
||||||
LayoutContainer.fixate(self)
|
LayoutContainer.fixate(self)
|
||||||
return
|
return
|
||||||
|
|
||||||
class LTTextFlowHorizontal(LTTextFlow):
|
class LTTextGroupHorizontal(LTTextGroup):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
LTTextFlow.__init__(self, objs)
|
LTTextGroup.__init__(self, objs)
|
||||||
# reorder the objects from top-left to bottom-right.
|
# reorder the objects from top-left to bottom-right.
|
||||||
self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
|
self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
|
||||||
return
|
return
|
||||||
|
|
||||||
class LTTextFlowVertical(LTTextFlow):
|
class LTTextGroupVertical(LTTextGroup):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
LTTextFlow.__init__(self, objs)
|
LTTextGroup.__init__(self, objs)
|
||||||
# reorder the objects from top-right to bottom-left.
|
# reorder the objects from top-right to bottom-left.
|
||||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||||
return
|
return
|
||||||
|
@ -458,44 +459,32 @@ class Plane(object):
|
||||||
return list(xobjs)
|
return list(xobjs)
|
||||||
|
|
||||||
|
|
||||||
## ClusterBuilder
|
## group_lines
|
||||||
##
|
##
|
||||||
class ClusterBuilder(object):
|
def group_lines(groupfunc, objs, *args):
|
||||||
|
|
||||||
def __init__(self, groupfunc):
|
|
||||||
self.clusters = {}
|
|
||||||
self.groupfunc = groupfunc
|
|
||||||
return
|
|
||||||
|
|
||||||
# group(objs): groups given objects into one cluster.
|
|
||||||
def group(self, objs):
|
|
||||||
r = objs[:]
|
|
||||||
for obj1 in objs:
|
|
||||||
if obj1 in self.clusters:
|
|
||||||
r.extend(self.clusters.pop(obj1))
|
|
||||||
cluster = self.groupfunc(list(uniq(r)))
|
|
||||||
for obj in r:
|
|
||||||
self.clusters[obj] = cluster
|
|
||||||
return
|
|
||||||
|
|
||||||
# finish(): returns all the clusters.
|
|
||||||
def finish(self):
|
|
||||||
clusters = set(self.clusters.itervalues())
|
|
||||||
for cluster in clusters:
|
|
||||||
cluster.fixate()
|
|
||||||
return list(clusters)
|
|
||||||
|
|
||||||
def build_boxes(groupfunc, objs, *args):
|
|
||||||
plane = Plane(objs)
|
plane = Plane(objs)
|
||||||
builder = ClusterBuilder(groupfunc)
|
groups = {}
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
neighbors = obj.find_neighbors(plane, *args)
|
neighbors = obj.find_neighbors(plane, *args)
|
||||||
assert obj in neighbors, obj
|
assert obj in neighbors, obj
|
||||||
builder.group(neighbors)
|
members = neighbors[:]
|
||||||
return builder.finish()
|
for obj1 in neighbors:
|
||||||
|
if obj1 in groups:
|
||||||
|
members.extend(groups.pop(obj1))
|
||||||
|
group = groupfunc(list(uniq(members)))
|
||||||
|
for obj in members:
|
||||||
|
groups[obj] = group
|
||||||
|
groups = set(groups.values())
|
||||||
|
for group in groups:
|
||||||
|
group.fixate()
|
||||||
|
return list(groups)
|
||||||
|
|
||||||
def group_hier(groupfunc, objs, distfunc):
|
|
||||||
|
## group_boxes
|
||||||
|
##
|
||||||
|
def group_boxes(groupfunc, objs, distfunc):
|
||||||
assert objs
|
assert objs
|
||||||
|
objs = objs[:]
|
||||||
while 2 <= len(objs):
|
while 2 <= len(objs):
|
||||||
mindist = INF
|
mindist = INF
|
||||||
minpair = None
|
minpair = None
|
||||||
|
@ -519,16 +508,43 @@ def group_hier(groupfunc, objs, distfunc):
|
||||||
##
|
##
|
||||||
class LTPage(LayoutContainer):
|
class LTPage(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, bbox, rotate=0):
|
def __init__(self, pageid, bbox, rotate=0):
|
||||||
LayoutContainer.__init__(self, bbox)
|
LayoutContainer.__init__(self, bbox)
|
||||||
self.id = id
|
self.pageid = pageid
|
||||||
self.rotate = rotate
|
self.rotate = rotate
|
||||||
|
self.layout = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, bbox2str(self.bbox), self.rotate))
|
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
|
||||||
|
|
||||||
def analyze_layout(self, laparams):
|
def fixate(self, laparams):
|
||||||
|
"""Perform the layout analysis."""
|
||||||
|
LayoutContainer.fixate(self)
|
||||||
|
(textobjs, otherobjs) = self.get_textobjs()
|
||||||
|
if not laparams or not textobjs: return
|
||||||
|
if laparams.direction == 'V':
|
||||||
|
textboxes = self.build_textbox_vertical(textobjs, laparams)
|
||||||
|
top = self.group_textbox_vertical(textboxes, laparams)
|
||||||
|
else:
|
||||||
|
textboxes = self.build_textbox_horizontal(textobjs, laparams)
|
||||||
|
top = self.group_textbox_horizontal(textboxes, laparams)
|
||||||
|
def assign_index(obj, i):
|
||||||
|
if isinstance(obj, LTTextBox):
|
||||||
|
obj.index = i
|
||||||
|
i += 1
|
||||||
|
elif isinstance(obj, LTTextGroup):
|
||||||
|
for x in obj:
|
||||||
|
i = assign_index(x, i)
|
||||||
|
return i
|
||||||
|
assign_index(top, 0)
|
||||||
|
textboxes.sort(key=lambda box:box.index)
|
||||||
|
self.objs = textboxes + otherobjs
|
||||||
|
self.layout = top
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_textobjs(self):
|
||||||
|
"""Split all the objects in the page into text-related objects and others."""
|
||||||
textobjs = []
|
textobjs = []
|
||||||
otherobjs = []
|
otherobjs = []
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
|
@ -536,16 +552,11 @@ class LTPage(LayoutContainer):
|
||||||
textobjs.append(obj)
|
textobjs.append(obj)
|
||||||
else:
|
else:
|
||||||
otherobjs.append(obj)
|
otherobjs.append(obj)
|
||||||
if laparams.direction == 'V':
|
return (textobjs, otherobjs)
|
||||||
textobjs = self.analyze_layout_vertical(textobjs, laparams)
|
|
||||||
else:
|
|
||||||
textobjs = self.analyze_layout_horizontal(textobjs, laparams)
|
|
||||||
self.objs = [textobjs] + otherobjs
|
|
||||||
return
|
|
||||||
|
|
||||||
def analyze_layout_horizontal(self, objs, laparams):
|
def build_textbox_horizontal(self, objs, laparams):
|
||||||
|
"""Identify horizontal text regions in the page."""
|
||||||
def halign(obj1, obj2):
|
def aligned(obj1, obj2):
|
||||||
# +------+ - - -
|
# +------+ - - -
|
||||||
# | obj1 | - - +------+ -
|
# | obj1 | - - +------+ -
|
||||||
# | | | obj2 | | (line_overlap)
|
# | | | obj2 | | (line_overlap)
|
||||||
|
@ -556,12 +567,11 @@ class LTPage(LayoutContainer):
|
||||||
# (char_margin)
|
# (char_margin)
|
||||||
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
|
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
|
||||||
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
|
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
line = []
|
line = []
|
||||||
prev = None
|
prev = None
|
||||||
for cur in objs:
|
for cur in objs:
|
||||||
if prev is not None and not halign(prev, cur):
|
if prev is not None and not aligned(prev, cur):
|
||||||
if line:
|
if line:
|
||||||
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
||||||
line = []
|
line = []
|
||||||
|
@ -569,18 +579,11 @@ class LTPage(LayoutContainer):
|
||||||
prev = cur
|
prev = cur
|
||||||
if line:
|
if line:
|
||||||
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
lines.append(LTTextLineHorizontal(line, laparams.word_margin))
|
||||||
boxes = build_boxes(LTTextBoxHorizontal, lines, laparams.line_margin)
|
return group_lines(LTTextBoxHorizontal, lines, laparams.line_margin)
|
||||||
|
|
||||||
def dist(obj1, obj2):
|
def build_textbox_vertical(self, objs, laparams):
|
||||||
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
"""Identify vertical text regions in the page."""
|
||||||
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
def aligned(obj1, obj2):
|
||||||
obj1.width*obj1.height - obj2.width*obj2.height)
|
|
||||||
|
|
||||||
return group_hier(LTTextFlowHorizontal, boxes, dist)
|
|
||||||
|
|
||||||
def analyze_layout_vertical(self, objs, laparams):
|
|
||||||
|
|
||||||
def valign(obj1, obj2):
|
|
||||||
# +------+
|
# +------+
|
||||||
# | obj1 |
|
# | obj1 |
|
||||||
# | |
|
# | |
|
||||||
|
@ -595,12 +598,11 @@ class LTPage(LayoutContainer):
|
||||||
# (line_overlap)
|
# (line_overlap)
|
||||||
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
|
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
|
||||||
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
|
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
line = []
|
line = []
|
||||||
prev = None
|
prev = None
|
||||||
for cur in objs:
|
for cur in objs:
|
||||||
if prev is not None and not valign(prev, cur):
|
if prev is not None and not aligned(prev, cur):
|
||||||
if line:
|
if line:
|
||||||
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
||||||
line = []
|
line = []
|
||||||
|
@ -608,11 +610,18 @@ class LTPage(LayoutContainer):
|
||||||
prev = cur
|
prev = cur
|
||||||
if line:
|
if line:
|
||||||
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
lines.append(LTTextLineVertical(line, laparams.word_margin))
|
||||||
boxes = build_boxes(LTTextBoxVertical, lines, laparams.line_margin)
|
return group_lines(LTTextBoxVertical, lines, laparams.line_margin)
|
||||||
|
|
||||||
|
def group_textbox_horizontal(self, boxes, laparams):
|
||||||
def dist(obj1, obj2):
|
def dist(obj1, obj2):
|
||||||
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
||||||
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
||||||
obj1.width*obj1.height - obj2.width*obj2.height)
|
obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
|
return group_boxes(LTTextGroupHorizontal, boxes, dist)
|
||||||
|
|
||||||
return group_hier(LTTextFlowVertical, boxes, dist)
|
def group_textbox_vertical(self, boxes, laparams):
|
||||||
|
def dist(obj1, obj2):
|
||||||
|
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
|
||||||
|
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
||||||
|
obj1.width*obj1.height - obj2.width*obj2.height)
|
||||||
|
return group_boxes(LTTextGroupVertical, boxes, dist)
|
||||||
|
|
|
@ -38,7 +38,7 @@ XMLS= \
|
||||||
naacl06-shinyama.xml \
|
naacl06-shinyama.xml \
|
||||||
nlp2004slides.xml
|
nlp2004slides.xml
|
||||||
|
|
||||||
all: htmls texts xmls
|
test: htmls texts xmls
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-$(RM) $(HTMLS)
|
-$(RM) $(HTMLS)
|
||||||
|
@ -53,9 +53,12 @@ xmls: $(XMLS)
|
||||||
|
|
||||||
.pdf.html:
|
.pdf.html:
|
||||||
$(PDF2TXT) -t html $< > $@
|
$(PDF2TXT) -t html $< > $@
|
||||||
|
# $(CMP) $@ $@.ref
|
||||||
|
|
||||||
.pdf.xml:
|
.pdf.xml:
|
||||||
$(PDF2TXT) -t xml $< > $@
|
$(PDF2TXT) -t xml $< > $@
|
||||||
|
# $(CMP) $@ $@.ref
|
||||||
|
|
||||||
.pdf.txt:
|
.pdf.txt:
|
||||||
$(PDF2TXT) -t text $< > $@
|
$(PDF2TXT) -t text $< > $@
|
||||||
|
# $(CMP) $@ $@.ref
|
||||||
|
|
Loading…
Reference in New Issue