code cleanup

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@188 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-03-22 04:00:18 +00:00
parent e01cb43e31
commit cd39642abe
4 changed files with 135 additions and 107 deletions

View File

@ -50,6 +50,6 @@ $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap test: cmap
cd samples && $(MAKE) all cd samples && $(MAKE) test
test_clean: test_clean:
-cd samples && $(MAKE) clean -cd samples && $(MAKE) clean

View File

@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str from utils import enc, bbox2str
@ -32,9 +32,7 @@ class PDFPageAggregator(PDFTextDevice):
def end_page(self, _): def end_page(self, _):
assert not self.stack assert not self.stack
assert isinstance(self.cur_item, LTPage) assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate() self.cur_item.fixate(self.laparams)
if self.laparams:
self.cur_item.analyze_layout(self.laparams)
self.pageno += 1 self.pageno += 1
return self.cur_item return self.cur_item
@ -143,7 +141,7 @@ class TextConverter(PDFConverter):
self.write('\n') self.write('\n')
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.showpageno: if self.showpageno:
self.write('Page %d\n' % page.id) self.write('Page %s\n' % page.pageid)
render(page) render(page)
self.write('\f') self.write('\f')
return return
@ -170,7 +168,16 @@ class HTMLConverter(PDFConverter):
def write_rect(self, color, width, x, y, w, h): def write_rect(self, color, width, x, y, w, h):
self.outfp.write('<span style="position:absolute; border: %s %dpx solid; ' self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) (color, width,
x*self.scale, (self.yoffset-y)*self.scale,
w*self.scale, h*self.scale))
return
def write_text(self, text, x, y, size):
self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
(x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
self.write(text)
self.outfp.write('</span>\n')
return return
def write_image(self, image): def write_image(self, image):
@ -194,37 +201,30 @@ class HTMLConverter(PDFConverter):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.yoffset += item.y1 self.yoffset += item.y1
self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height)
if self.showpageno: if self.showpageno:
self.outfp.write('<div style="position:absolute; top:%dpx;">' % self.outfp.write('<div style="position:absolute; top:%dpx;">' %
((self.yoffset-item.y1)*self.scale)) ((self.yoffset-item.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.pageid, page.pageid))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' % self.write_text(item.text, item.x0, item.y1, item.get_size())
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.get_size()*self.scale))
self.write(item.text)
self.outfp.write('</span>\n')
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTPolygon): elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTTextFlow):
for child in item: for child in item:
render(child) render(child)
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_text(str(item.index+1), item.x0, item.y1, 20)
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
@ -233,6 +233,14 @@ class HTMLConverter(PDFConverter):
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
if page.layout:
def show_layout(item):
if isinstance(item, LTTextGroup):
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
for child in item:
show_layout(child)
return
show_layout(page.layout)
self.yoffset += self.pagepad self.yoffset += self.pagepad
return return
@ -270,13 +278,13 @@ class XMLConverter(PDFConverter):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, bbox2str(item.bbox), item.rotate)) (item.pageid, bbox2str(item.bbox), item.rotate))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction: elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
(item.linewidth, item.direction, bbox2str(item.bbox))) (item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
(item.linewidth, bbox2str(item.bbox))) (item.linewidth, bbox2str(item.bbox)))
@ -284,8 +292,8 @@ class XMLConverter(PDFConverter):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts())) (item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % self.outfp.write('<figure name="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox))) (item.name, bbox2str(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
@ -295,15 +303,10 @@ class XMLConverter(PDFConverter):
render(child) render(child)
self.outfp.write('</textline>\n') self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox)) self.outfp.write('<textbox id="%d" bbox="%s">\n' % (item.index, bbox2str(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextFlow):
self.outfp.write('<textflow bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
render(child)
self.outfp.write('</textflow>\n')
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' % self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
(enc(item.font.fontname), item.is_vertical(), (enc(item.font.fontname), item.is_vertical(),
@ -325,6 +328,19 @@ class XMLConverter(PDFConverter):
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
if page.layout:
def show_layout(item):
if isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
elif isinstance(item, LTTextGroup):
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
show_layout(child)
self.outfp.write('</textgroup>\n')
return
self.outfp.write('<layout>\n')
show_layout(page.layout)
self.outfp.write('</layout>\n')
return return
def close(self): def close(self):

View File

@ -286,18 +286,18 @@ class LTChar(LayoutItem, LTText):
## ##
class LTFigure(LayoutContainer): class LTFigure(LayoutContainer):
def __init__(self, id, bbox, matrix): def __init__(self, name, bbox, matrix):
(x,y,w,h) = bbox (x,y,w,h) = bbox
bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
self.id = id self.name = name
self.matrix = matrix self.matrix = matrix
LayoutContainer.__init__(self, bbox) LayoutContainer.__init__(self, bbox)
return return
def __repr__(self): def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%s>' % return ('<figure %r bbox=%s matrix=%s>' %
(self.id, bbox2str(self.bbox), matrix2str(self.matrix))) (self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
## LTTextLine ## LTTextLine
@ -369,10 +369,11 @@ class LTTextBox(LayoutContainer):
def __init__(self, objs): def __init__(self, objs):
LayoutContainer.__init__(self, (0,0,0,0), objs) LayoutContainer.__init__(self, (0,0,0,0), objs)
self.index = None
return return
def __repr__(self): def __repr__(self):
return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20])) return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -392,9 +393,9 @@ class LTTextBoxVertical(LTTextBox):
return return
## LTTextFlow ## LTTextGroup
## ##
class LTTextFlow(LayoutContainer): class LTTextGroup(LayoutContainer):
def __init__(self, objs): def __init__(self, objs):
assert objs assert objs
@ -402,18 +403,18 @@ class LTTextFlow(LayoutContainer):
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
return return
class LTTextFlowHorizontal(LTTextFlow): class LTTextGroupHorizontal(LTTextGroup):
def __init__(self, objs): def __init__(self, objs):
LTTextFlow.__init__(self, objs) LTTextGroup.__init__(self, objs)
# reorder the objects from top-left to bottom-right. # reorder the objects from top-left to bottom-right.
self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1) self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
return return
class LTTextFlowVertical(LTTextFlow): class LTTextGroupVertical(LTTextGroup):
def __init__(self, objs): def __init__(self, objs):
LTTextFlow.__init__(self, objs) LTTextGroup.__init__(self, objs)
# reorder the objects from top-right to bottom-left. # reorder the objects from top-right to bottom-left.
self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
return return
@ -458,44 +459,32 @@ class Plane(object):
return list(xobjs) return list(xobjs)
## ClusterBuilder ## group_lines
## ##
class ClusterBuilder(object): def group_lines(groupfunc, objs, *args):
def __init__(self, groupfunc):
self.clusters = {}
self.groupfunc = groupfunc
return
# group(objs): groups given objects into one cluster.
def group(self, objs):
r = objs[:]
for obj1 in objs:
if obj1 in self.clusters:
r.extend(self.clusters.pop(obj1))
cluster = self.groupfunc(list(uniq(r)))
for obj in r:
self.clusters[obj] = cluster
return
# finish(): returns all the clusters.
def finish(self):
clusters = set(self.clusters.itervalues())
for cluster in clusters:
cluster.fixate()
return list(clusters)
def build_boxes(groupfunc, objs, *args):
plane = Plane(objs) plane = Plane(objs)
builder = ClusterBuilder(groupfunc) groups = {}
for obj in objs: for obj in objs:
neighbors = obj.find_neighbors(plane, *args) neighbors = obj.find_neighbors(plane, *args)
assert obj in neighbors, obj assert obj in neighbors, obj
builder.group(neighbors) members = neighbors[:]
return builder.finish() for obj1 in neighbors:
if obj1 in groups:
members.extend(groups.pop(obj1))
group = groupfunc(list(uniq(members)))
for obj in members:
groups[obj] = group
groups = set(groups.values())
for group in groups:
group.fixate()
return list(groups)
def group_hier(groupfunc, objs, distfunc):
## group_boxes
##
def group_boxes(groupfunc, objs, distfunc):
assert objs assert objs
objs = objs[:]
while 2 <= len(objs): while 2 <= len(objs):
mindist = INF mindist = INF
minpair = None minpair = None
@ -519,16 +508,43 @@ def group_hier(groupfunc, objs, distfunc):
## ##
class LTPage(LayoutContainer): class LTPage(LayoutContainer):
def __init__(self, id, bbox, rotate=0): def __init__(self, pageid, bbox, rotate=0):
LayoutContainer.__init__(self, bbox) LayoutContainer.__init__(self, bbox)
self.id = id self.pageid = pageid
self.rotate = rotate self.rotate = rotate
self.layout = None
return return
def __repr__(self): def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, bbox2str(self.bbox), self.rotate)) return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
def analyze_layout(self, laparams): def fixate(self, laparams):
"""Perform the layout analysis."""
LayoutContainer.fixate(self)
(textobjs, otherobjs) = self.get_textobjs()
if not laparams or not textobjs: return
if laparams.direction == 'V':
textboxes = self.build_textbox_vertical(textobjs, laparams)
top = self.group_textbox_vertical(textboxes, laparams)
else:
textboxes = self.build_textbox_horizontal(textobjs, laparams)
top = self.group_textbox_horizontal(textboxes, laparams)
def assign_index(obj, i):
if isinstance(obj, LTTextBox):
obj.index = i
i += 1
elif isinstance(obj, LTTextGroup):
for x in obj:
i = assign_index(x, i)
return i
assign_index(top, 0)
textboxes.sort(key=lambda box:box.index)
self.objs = textboxes + otherobjs
self.layout = top
return
def get_textobjs(self):
"""Split all the objects in the page into text-related objects and others."""
textobjs = [] textobjs = []
otherobjs = [] otherobjs = []
for obj in self.objs: for obj in self.objs:
@ -536,16 +552,11 @@ class LTPage(LayoutContainer):
textobjs.append(obj) textobjs.append(obj)
else: else:
otherobjs.append(obj) otherobjs.append(obj)
if laparams.direction == 'V': return (textobjs, otherobjs)
textobjs = self.analyze_layout_vertical(textobjs, laparams)
else:
textobjs = self.analyze_layout_horizontal(textobjs, laparams)
self.objs = [textobjs] + otherobjs
return
def analyze_layout_horizontal(self, objs, laparams): def build_textbox_horizontal(self, objs, laparams):
"""Identify horizontal text regions in the page."""
def halign(obj1, obj2): def aligned(obj1, obj2):
# +------+ - - - # +------+ - - -
# | obj1 | - - +------+ - # | obj1 | - - +------+ -
# | | | obj2 | | (line_overlap) # | | | obj2 | | (line_overlap)
@ -556,12 +567,11 @@ class LTPage(LayoutContainer):
# (char_margin) # (char_margin)
return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
(obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin)) (obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
lines = [] lines = []
line = [] line = []
prev = None prev = None
for cur in objs: for cur in objs:
if prev is not None and not halign(prev, cur): if prev is not None and not aligned(prev, cur):
if line: if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin)) lines.append(LTTextLineHorizontal(line, laparams.word_margin))
line = [] line = []
@ -569,18 +579,11 @@ class LTPage(LayoutContainer):
prev = cur prev = cur
if line: if line:
lines.append(LTTextLineHorizontal(line, laparams.word_margin)) lines.append(LTTextLineHorizontal(line, laparams.word_margin))
boxes = build_boxes(LTTextBoxHorizontal, lines, laparams.line_margin) return group_lines(LTTextBoxHorizontal, lines, laparams.line_margin)
def dist(obj1, obj2): def build_textbox_vertical(self, objs, laparams):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * """Identify vertical text regions in the page."""
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - def aligned(obj1, obj2):
obj1.width*obj1.height - obj2.width*obj2.height)
return group_hier(LTTextFlowHorizontal, boxes, dist)
def analyze_layout_vertical(self, objs, laparams):
def valign(obj1, obj2):
# +------+ # +------+
# | obj1 | # | obj1 |
# | | # | |
@ -595,12 +598,11 @@ class LTPage(LayoutContainer):
# (line_overlap) # (line_overlap)
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin)) (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
lines = [] lines = []
line = [] line = []
prev = None prev = None
for cur in objs: for cur in objs:
if prev is not None and not valign(prev, cur): if prev is not None and not aligned(prev, cur):
if line: if line:
lines.append(LTTextLineVertical(line, laparams.word_margin)) lines.append(LTTextLineVertical(line, laparams.word_margin))
line = [] line = []
@ -608,11 +610,18 @@ class LTPage(LayoutContainer):
prev = cur prev = cur
if line: if line:
lines.append(LTTextLineVertical(line, laparams.word_margin)) lines.append(LTTextLineVertical(line, laparams.word_margin))
boxes = build_boxes(LTTextBoxVertical, lines, laparams.line_margin) return group_lines(LTTextBoxVertical, lines, laparams.line_margin)
def group_textbox_horizontal(self, boxes, laparams):
def dist(obj1, obj2): def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height) obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupHorizontal, boxes, dist)
return group_hier(LTTextFlowVertical, boxes, dist) def group_textbox_vertical(self, boxes, laparams):
def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupVertical, boxes, dist)

View File

@ -38,7 +38,7 @@ XMLS= \
naacl06-shinyama.xml \ naacl06-shinyama.xml \
nlp2004slides.xml nlp2004slides.xml
all: htmls texts xmls test: htmls texts xmls
clean: clean:
-$(RM) $(HTMLS) -$(RM) $(HTMLS)
@ -53,9 +53,12 @@ xmls: $(XMLS)
.pdf.html: .pdf.html:
$(PDF2TXT) -t html $< > $@ $(PDF2TXT) -t html $< > $@
# $(CMP) $@ $@.ref
.pdf.xml: .pdf.xml:
$(PDF2TXT) -t xml $< > $@ $(PDF2TXT) -t xml $< > $@
# $(CMP) $@ $@.ref
.pdf.txt: .pdf.txt:
$(PDF2TXT) -t text $< > $@ $(PDF2TXT) -t text $< > $@
# $(CMP) $@ $@.ref