From cd39642abe6a9f62264f2018a944fdabab849621 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Mon, 22 Mar 2010 04:00:18 +0000 Subject: [PATCH] code cleanup git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@188 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 2 +- pdfminer/converter.py | 82 +++++++++++++--------- pdfminer/layout.py | 153 ++++++++++++++++++++++-------------------- samples/Makefile | 5 +- 4 files changed, 135 insertions(+), 107 deletions(-) diff --git a/Makefile b/Makefile index c972828..d0f2f35 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,6 @@ $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py: $(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr test: cmap - cd samples && $(MAKE) all + cd samples && $(MAKE) test test_clean: -cd samples && $(MAKE) clean diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 8334ab0..6b609a5 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined from pdftypes import LITERALS_DCT_DECODE from layout import LayoutContainer from layout import LTPage, LTText, LTLine, LTRect, LTPolygon -from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine +from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup from utils import apply_matrix_pt, mult_matrix from utils import enc, bbox2str @@ -32,9 +32,7 @@ class PDFPageAggregator(PDFTextDevice): def end_page(self, _): assert not self.stack assert isinstance(self.cur_item, LTPage) - self.cur_item.fixate() - if self.laparams: - self.cur_item.analyze_layout(self.laparams) + self.cur_item.fixate(self.laparams) self.pageno += 1 return self.cur_item @@ -143,7 +141,7 @@ class TextConverter(PDFConverter): self.write('\n') page = PDFConverter.end_page(self, page) if self.showpageno: - self.write('Page %d\n' % page.id) + self.write('Page %s\n' % page.pageid) render(page) self.write('\f') return @@ -170,7 +168,16 @@ class HTMLConverter(PDFConverter): def write_rect(self, color, width, x, y, w, h): self.outfp.write('\n' % - (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) + (color, width, + x*self.scale, (self.yoffset-y)*self.scale, + w*self.scale, h*self.scale)) + return + + def write_text(self, text, x, y, size): + self.outfp.write('' % + (x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) + self.write(text) + self.outfp.write('\n') return def write_image(self, image): @@ -194,37 +201,30 @@ class HTMLConverter(PDFConverter): def render(item): if isinstance(item, LTPage): self.yoffset += item.y1 - self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height) if self.showpageno: self.outfp.write('
' % ((self.yoffset-item.y1)*self.scale)) - self.outfp.write('Page %s
\n' % (page.id, page.id)) + self.outfp.write('Page %s\n' % (page.pageid, page.pageid)) for child in item: render(child) elif isinstance(item, LTChar): - self.outfp.write('' % - (item.x0*self.scale, (self.yoffset-item.y1)*self.scale, - item.get_size()*self.scale)) - self.write(item.text) - self.outfp.write('\n') + self.write_text(item.text, item.x0, item.y1, item.get_size()) if self.debug: - self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTPolygon): - self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('black', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTTextLine): for child in item: render(child) elif isinstance(item, LTTextBox): - self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) - for child in item: - render(child) - elif isinstance(item, LTTextFlow): + self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) if self.debug: - self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_text(str(item.index+1), item.x0, item.y1, 20) elif isinstance(item, LTFigure): - self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height) + self.write_rect('green', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTImage): @@ -233,6 +233,14 @@ class HTMLConverter(PDFConverter): return page = PDFConverter.end_page(self, page) render(page) + if page.layout: + def show_layout(item): + if isinstance(item, LTTextGroup): + self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) + for child in item: + show_layout(child) + return + show_layout(page.layout) self.yoffset += self.pagepad return @@ -270,13 +278,13 @@ class XMLConverter(PDFConverter): def render(item): if isinstance(item, LTPage): self.outfp.write('\n' % - (item.id, bbox2str(item.bbox), item.rotate)) + (item.pageid, bbox2str(item.bbox), item.rotate)) for child in item: render(child) self.outfp.write('\n') - elif isinstance(item, LTLine) and item.direction: - self.outfp.write('\n' % - (item.linewidth, item.direction, bbox2str(item.bbox))) + elif isinstance(item, LTLine): + self.outfp.write('\n' % + (item.linewidth, bbox2str(item.bbox))) elif isinstance(item, LTRect): self.outfp.write('\n' % (item.linewidth, bbox2str(item.bbox))) @@ -284,8 +292,8 @@ class XMLConverter(PDFConverter): self.outfp.write('\n' % (item.linewidth, bbox2str(item.bbox), item.get_pts())) elif isinstance(item, LTFigure): - self.outfp.write('
\n' % - (item.id, bbox2str(item.bbox))) + self.outfp.write('
\n' % + (item.name, bbox2str(item.bbox))) for child in item: render(child) self.outfp.write('
\n') @@ -295,15 +303,10 @@ class XMLConverter(PDFConverter): render(child) self.outfp.write('\n') elif isinstance(item, LTTextBox): - self.outfp.write('\n' % bbox2str(item.bbox)) + self.outfp.write('\n' % (item.index, bbox2str(item.bbox))) for child in item: render(child) self.outfp.write('\n') - elif isinstance(item, LTTextFlow): - self.outfp.write('\n' % bbox2str(item.bbox)) - for child in item: - render(child) - self.outfp.write('\n') elif isinstance(item, LTChar): self.outfp.write('' % (enc(item.font.fontname), item.is_vertical(), @@ -325,6 +328,19 @@ class XMLConverter(PDFConverter): return page = PDFConverter.end_page(self, page) render(page) + if page.layout: + def show_layout(item): + if isinstance(item, LTTextBox): + self.outfp.write('\n' % (item.index, bbox2str(item.bbox))) + elif isinstance(item, LTTextGroup): + self.outfp.write('\n' % bbox2str(item.bbox)) + for child in item: + show_layout(child) + self.outfp.write('\n') + return + self.outfp.write('\n') + show_layout(page.layout) + self.outfp.write('\n') return def close(self): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 8e9c8b2..25f2d31 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -286,18 +286,18 @@ class LTChar(LayoutItem, LTText): ## class LTFigure(LayoutContainer): - def __init__(self, id, bbox, matrix): + def __init__(self, name, bbox, matrix): (x,y,w,h) = bbox bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) - self.id = id + self.name = name self.matrix = matrix LayoutContainer.__init__(self, bbox) return def __repr__(self): - return ('
' % - (self.id, bbox2str(self.bbox), matrix2str(self.matrix))) + return ('
' % + (self.name, bbox2str(self.bbox), matrix2str(self.matrix))) ## LTTextLine @@ -369,10 +369,11 @@ class LTTextBox(LayoutContainer): def __init__(self, objs): LayoutContainer.__init__(self, (0,0,0,0), objs) + self.index = None return def __repr__(self): - return ('' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20])) + return ('' % (self.index, bbox2str(self.bbox), self.get_text()[:20])) def get_text(self): return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) @@ -392,9 +393,9 @@ class LTTextBoxVertical(LTTextBox): return -## LTTextFlow +## LTTextGroup ## -class LTTextFlow(LayoutContainer): +class LTTextGroup(LayoutContainer): def __init__(self, objs): assert objs @@ -402,18 +403,18 @@ class LTTextFlow(LayoutContainer): LayoutContainer.fixate(self) return -class LTTextFlowHorizontal(LTTextFlow): +class LTTextGroupHorizontal(LTTextGroup): def __init__(self, objs): - LTTextFlow.__init__(self, objs) + LTTextGroup.__init__(self, objs) # reorder the objects from top-left to bottom-right. self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1) return -class LTTextFlowVertical(LTTextFlow): +class LTTextGroupVertical(LTTextGroup): def __init__(self, objs): - LTTextFlow.__init__(self, objs) + LTTextGroup.__init__(self, objs) # reorder the objects from top-right to bottom-left. self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) return @@ -458,44 +459,32 @@ class Plane(object): return list(xobjs) -## ClusterBuilder +## group_lines ## -class ClusterBuilder(object): - - def __init__(self, groupfunc): - self.clusters = {} - self.groupfunc = groupfunc - return - - # group(objs): groups given objects into one cluster. - def group(self, objs): - r = objs[:] - for obj1 in objs: - if obj1 in self.clusters: - r.extend(self.clusters.pop(obj1)) - cluster = self.groupfunc(list(uniq(r))) - for obj in r: - self.clusters[obj] = cluster - return - - # finish(): returns all the clusters. - def finish(self): - clusters = set(self.clusters.itervalues()) - for cluster in clusters: - cluster.fixate() - return list(clusters) - -def build_boxes(groupfunc, objs, *args): +def group_lines(groupfunc, objs, *args): plane = Plane(objs) - builder = ClusterBuilder(groupfunc) + groups = {} for obj in objs: neighbors = obj.find_neighbors(plane, *args) assert obj in neighbors, obj - builder.group(neighbors) - return builder.finish() + members = neighbors[:] + for obj1 in neighbors: + if obj1 in groups: + members.extend(groups.pop(obj1)) + group = groupfunc(list(uniq(members))) + for obj in members: + groups[obj] = group + groups = set(groups.values()) + for group in groups: + group.fixate() + return list(groups) -def group_hier(groupfunc, objs, distfunc): + +## group_boxes +## +def group_boxes(groupfunc, objs, distfunc): assert objs + objs = objs[:] while 2 <= len(objs): mindist = INF minpair = None @@ -519,16 +508,43 @@ def group_hier(groupfunc, objs, distfunc): ## class LTPage(LayoutContainer): - def __init__(self, id, bbox, rotate=0): + def __init__(self, pageid, bbox, rotate=0): LayoutContainer.__init__(self, bbox) - self.id = id + self.pageid = pageid self.rotate = rotate + self.layout = None return def __repr__(self): - return ('' % (self.id, bbox2str(self.bbox), self.rotate)) + return ('' % (self.pageid, bbox2str(self.bbox), self.rotate)) - def analyze_layout(self, laparams): + def fixate(self, laparams): + """Perform the layout analysis.""" + LayoutContainer.fixate(self) + (textobjs, otherobjs) = self.get_textobjs() + if not laparams or not textobjs: return + if laparams.direction == 'V': + textboxes = self.build_textbox_vertical(textobjs, laparams) + top = self.group_textbox_vertical(textboxes, laparams) + else: + textboxes = self.build_textbox_horizontal(textobjs, laparams) + top = self.group_textbox_horizontal(textboxes, laparams) + def assign_index(obj, i): + if isinstance(obj, LTTextBox): + obj.index = i + i += 1 + elif isinstance(obj, LTTextGroup): + for x in obj: + i = assign_index(x, i) + return i + assign_index(top, 0) + textboxes.sort(key=lambda box:box.index) + self.objs = textboxes + otherobjs + self.layout = top + return + + def get_textobjs(self): + """Split all the objects in the page into text-related objects and others.""" textobjs = [] otherobjs = [] for obj in self.objs: @@ -536,16 +552,11 @@ class LTPage(LayoutContainer): textobjs.append(obj) else: otherobjs.append(obj) - if laparams.direction == 'V': - textobjs = self.analyze_layout_vertical(textobjs, laparams) - else: - textobjs = self.analyze_layout_horizontal(textobjs, laparams) - self.objs = [textobjs] + otherobjs - return + return (textobjs, otherobjs) - def analyze_layout_horizontal(self, objs, laparams): - - def halign(obj1, obj2): + def build_textbox_horizontal(self, objs, laparams): + """Identify horizontal text regions in the page.""" + def aligned(obj1, obj2): # +------+ - - - # | obj1 | - - +------+ - # | | | obj2 | | (line_overlap) @@ -556,12 +567,11 @@ class LTPage(LayoutContainer): # (char_margin) return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and (obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin)) - lines = [] line = [] prev = None for cur in objs: - if prev is not None and not halign(prev, cur): + if prev is not None and not aligned(prev, cur): if line: lines.append(LTTextLineHorizontal(line, laparams.word_margin)) line = [] @@ -569,18 +579,11 @@ class LTPage(LayoutContainer): prev = cur if line: lines.append(LTTextLineHorizontal(line, laparams.word_margin)) - boxes = build_boxes(LTTextBoxHorizontal, lines, laparams.line_margin) + return group_lines(LTTextBoxHorizontal, lines, laparams.line_margin) - def dist(obj1, obj2): - return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * - (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - - obj1.width*obj1.height - obj2.width*obj2.height) - - return group_hier(LTTextFlowHorizontal, boxes, dist) - - def analyze_layout_vertical(self, objs, laparams): - - def valign(obj1, obj2): + def build_textbox_vertical(self, objs, laparams): + """Identify vertical text regions in the page.""" + def aligned(obj1, obj2): # +------+ # | obj1 | # | | @@ -595,12 +598,11 @@ class LTPage(LayoutContainer): # (line_overlap) return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin)) - lines = [] line = [] prev = None for cur in objs: - if prev is not None and not valign(prev, cur): + if prev is not None and not aligned(prev, cur): if line: lines.append(LTTextLineVertical(line, laparams.word_margin)) line = [] @@ -608,11 +610,18 @@ class LTPage(LayoutContainer): prev = cur if line: lines.append(LTTextLineVertical(line, laparams.word_margin)) - boxes = build_boxes(LTTextBoxVertical, lines, laparams.line_margin) + return group_lines(LTTextBoxVertical, lines, laparams.line_margin) + def group_textbox_horizontal(self, boxes, laparams): def dist(obj1, obj2): return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - obj1.width*obj1.height - obj2.width*obj2.height) + return group_boxes(LTTextGroupHorizontal, boxes, dist) - return group_hier(LTTextFlowVertical, boxes, dist) + def group_textbox_vertical(self, boxes, laparams): + def dist(obj1, obj2): + return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * + (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - + obj1.width*obj1.height - obj2.width*obj2.height) + return group_boxes(LTTextGroupVertical, boxes, dist) diff --git a/samples/Makefile b/samples/Makefile index b7490d6..6051c55 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -38,7 +38,7 @@ XMLS= \ naacl06-shinyama.xml \ nlp2004slides.xml -all: htmls texts xmls +test: htmls texts xmls clean: -$(RM) $(HTMLS) @@ -53,9 +53,12 @@ xmls: $(XMLS) .pdf.html: $(PDF2TXT) -t html $< > $@ +# $(CMP) $@ $@.ref .pdf.xml: $(PDF2TXT) -t xml $< > $@ +# $(CMP) $@ $@.ref .pdf.txt: $(PDF2TXT) -t text $< > $@ +# $(CMP) $@ $@.ref