code cleanup

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@188 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-03-22 04:00:18 +00:00 · 2010-03-22 04:00:18 +00:00 · cd39642abe
parent e01cb43e31
commit cd39642abe
4 changed files with 135 additions and 107 deletions
--- a/2
+++ b/2
@ -50,6 +50,6 @@ $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
 	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr

 test: cmap
-	cd samples && $(MAKE) all
+	cd samples && $(MAKE) test
 test_clean:
 	-cd samples && $(MAKE) clean
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -5,7 +5,7 @@ from pdffont import PDFUnicodeNotDefined
 from pdftypes import LITERALS_DCT_DECODE
 from layout import LayoutContainer
 from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
-from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextFlow, LTTextLine
+from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
 from utils import apply_matrix_pt, mult_matrix
 from utils import enc, bbox2str

@ -32,9 +32,7 @@ class PDFPageAggregator(PDFTextDevice):
    def end_page(self, _):
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
-        self.cur_item.fixate()
-        if self.laparams:
-            self.cur_item.analyze_layout(self.laparams)
+        self.cur_item.fixate(self.laparams)
        self.pageno += 1
        return self.cur_item

@ -143,7 +141,7 @@ class TextConverter(PDFConverter):
                self.write('\n')
        page = PDFConverter.end_page(self, page)
        if self.showpageno:
-            self.write('Page %d\n' % page.id)
+            self.write('Page %s\n' % page.pageid)
        render(page)
        self.write('\f')
        return
@ -170,7 +168,16 @@ class HTMLConverter(PDFConverter):
    def write_rect(self, color, width, x, y, w, h):
        self.outfp.write('<span style="position:absolute; border: %s %dpx solid; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
-                         (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
+                         (color, width,
+                          x*self.scale, (self.yoffset-y)*self.scale,
+                          w*self.scale, h*self.scale))
+        return
+
+    def write_text(self, text, x, y, size):
+        self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
+                         (x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
+        self.write(text)
+        self.outfp.write('</span>\n')
        return

    def write_image(self, image):
@ -194,37 +201,30 @@ class HTMLConverter(PDFConverter):
        def render(item):
            if isinstance(item, LTPage):
                self.yoffset += item.y1
-                self.write_rect('gray', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+                self.write_rect('gray', 1, item.x0, item.y1, item.width, item.height)
                if self.showpageno:
                    self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                                     ((self.yoffset-item.y1)*self.scale))
-                    self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
+                    self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.pageid, page.pageid))
                for child in item:
                    render(child)
            elif isinstance(item, LTChar):
-                self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
-                                 (item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
-                                  item.get_size()*self.scale))
-                self.write(item.text)
-                self.outfp.write('</span>\n')
+                self.write_text(item.text, item.x0, item.y1, item.get_size())
                if self.debug:
-                    self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+                    self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
            elif isinstance(item, LTPolygon):
-                self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+                self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
            elif isinstance(item, LTTextLine):
                for child in item:
                    render(child)
            elif isinstance(item, LTTextBox):
-                self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
-                for child in item:
-                    render(child)
-            elif isinstance(item, LTTextFlow):
+                self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                if self.debug:
-                    self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+                    self.write_text(str(item.index+1), item.x0, item.y1, 20)
            elif isinstance(item, LTFigure):
-                self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+                self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
            elif isinstance(item, LTImage):
@ -233,6 +233,14 @@ class HTMLConverter(PDFConverter):
            return
        page = PDFConverter.end_page(self, page)
        render(page)
+        if page.layout:
+            def show_layout(item):
+                if isinstance(item, LTTextGroup):
+                    self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
+                    for child in item:
+                        show_layout(child)
+                return
+            show_layout(page.layout)
        self.yoffset += self.pagepad
        return

@ -270,13 +278,13 @@ class XMLConverter(PDFConverter):
        def render(item):
            if isinstance(item, LTPage):
                self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
-                                 (item.id, bbox2str(item.bbox), item.rotate))
+                                 (item.pageid, bbox2str(item.bbox), item.rotate))
                for child in item:
                    render(child)
                self.outfp.write('</page>\n')
-            elif isinstance(item, LTLine) and item.direction:
-                self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' %
-                                 (item.linewidth, item.direction, bbox2str(item.bbox)))
+            elif isinstance(item, LTLine):
+                self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
+                                 (item.linewidth, bbox2str(item.bbox)))
            elif isinstance(item, LTRect):
                self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
                                 (item.linewidth, bbox2str(item.bbox)))
@ -284,8 +292,8 @@ class XMLConverter(PDFConverter):
                self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
                                 (item.linewidth, bbox2str(item.bbox), item.get_pts()))
            elif isinstance(item, LTFigure):
-                self.outfp.write('<figure id="%s" bbox="%s">\n' %
-                                 (item.id, bbox2str(item.bbox)))
+                self.outfp.write('<figure name="%s" bbox="%s">\n' %
+                                 (item.name, bbox2str(item.bbox)))
                for child in item:
                    render(child)
                self.outfp.write('</figure>\n')
@ -295,15 +303,10 @@ class XMLConverter(PDFConverter):
                    render(child)
                self.outfp.write('</textline>\n')
            elif isinstance(item, LTTextBox):
-                self.outfp.write('<textbox bbox="%s">\n' % bbox2str(item.bbox))
+                self.outfp.write('<textbox id="%d" bbox="%s">\n' % (item.index, bbox2str(item.bbox)))
                for child in item:
                    render(child)
                self.outfp.write('</textbox>\n')
-            elif isinstance(item, LTTextFlow):
-                self.outfp.write('<textflow bbox="%s">\n' % bbox2str(item.bbox))
-                for child in item:
-                    render(child)
-                self.outfp.write('</textflow>\n')
            elif isinstance(item, LTChar):
                self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
                                 (enc(item.font.fontname), item.is_vertical(),
@ -325,6 +328,19 @@ class XMLConverter(PDFConverter):
            return
        page = PDFConverter.end_page(self, page)
        render(page)
+        if page.layout:
+            def show_layout(item):
+                if isinstance(item, LTTextBox):
+                    self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
+                elif isinstance(item, LTTextGroup):
+                    self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
+                    for child in item:
+                        show_layout(child)
+                    self.outfp.write('</textgroup>\n')
+                return
+            self.outfp.write('<layout>\n')
+            show_layout(page.layout)
+            self.outfp.write('</layout>\n')
        return

    def close(self):
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -286,18 +286,18 @@ class LTChar(LayoutItem, LTText):
 ##
 class LTFigure(LayoutContainer):

-    def __init__(self, id, bbox, matrix):
+    def __init__(self, name, bbox, matrix):
        (x,y,w,h) = bbox
        bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
                           for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
-        self.id = id
+        self.name = name
        self.matrix = matrix
        LayoutContainer.__init__(self, bbox)
        return

    def __repr__(self):
-        return ('<figure id=%r bbox=%s matrix=%s>' %
-                (self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
+        return ('<figure %r bbox=%s matrix=%s>' %
+                (self.name, bbox2str(self.bbox), matrix2str(self.matrix)))


 ##  LTTextLine
@ -369,10 +369,11 @@ class LTTextBox(LayoutContainer):

    def __init__(self, objs):
        LayoutContainer.__init__(self, (0,0,0,0), objs)
+        self.index = None
        return

    def __repr__(self):
-        return ('<textbox(%d) %s %r...>' % (len(self.objs), bbox2str(self.bbox), self.get_text()[:20]))
+        return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20]))

    def get_text(self):
        return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -392,9 +393,9 @@ class LTTextBoxVertical(LTTextBox):
        return


-##  LTTextFlow
+##  LTTextGroup
 ##
-class LTTextFlow(LayoutContainer):
+class LTTextGroup(LayoutContainer):

    def __init__(self, objs):
        assert objs
@ -402,18 +403,18 @@ class LTTextFlow(LayoutContainer):
        LayoutContainer.fixate(self)
        return

-class LTTextFlowHorizontal(LTTextFlow):
+class LTTextGroupHorizontal(LTTextGroup):
    
    def __init__(self, objs):
-        LTTextFlow.__init__(self, objs)
+        LTTextGroup.__init__(self, objs)
        # reorder the objects from top-left to bottom-right.
        self.objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
        return

-class LTTextFlowVertical(LTTextFlow):
+class LTTextGroupVertical(LTTextGroup):
    
    def __init__(self, objs):
-        LTTextFlow.__init__(self, objs)
+        LTTextGroup.__init__(self, objs)
        # reorder the objects from top-right to bottom-left.
        self.objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
        return
@ -458,44 +459,32 @@ class Plane(object):
        return list(xobjs)


-##  ClusterBuilder
+##  group_lines
 ##
-class ClusterBuilder(object):
-
-    def __init__(self, groupfunc):
-        self.clusters = {}
-        self.groupfunc = groupfunc
-        return
-
-    # group(objs): groups given objects into one cluster.
-    def group(self, objs):
-        r = objs[:]
-        for obj1 in objs:
-            if obj1 in self.clusters:
-                r.extend(self.clusters.pop(obj1))
-        cluster = self.groupfunc(list(uniq(r)))
-        for obj in r:
-            self.clusters[obj] = cluster
-        return
-
-    # finish(): returns all the clusters.
-    def finish(self):
-        clusters = set(self.clusters.itervalues())
-        for cluster in clusters:
-            cluster.fixate()
-        return list(clusters)
-
-def build_boxes(groupfunc, objs, *args):
+def group_lines(groupfunc, objs, *args):
    plane = Plane(objs)
-    builder = ClusterBuilder(groupfunc)
+    groups = {}
    for obj in objs:
        neighbors = obj.find_neighbors(plane, *args)
        assert obj in neighbors, obj
-        builder.group(neighbors)
-    return builder.finish()
+        members = neighbors[:]
+        for obj1 in neighbors:
+            if obj1 in groups:
+                members.extend(groups.pop(obj1))
+        group = groupfunc(list(uniq(members)))
+        for obj in members:
+            groups[obj] = group
+    groups = set(groups.values())
+    for group in groups:
+        group.fixate()
+    return list(groups)

-def group_hier(groupfunc, objs, distfunc):
+
+##  group_boxes
+##
+def group_boxes(groupfunc, objs, distfunc):
    assert objs
+    objs = objs[:]
    while 2 <= len(objs):
        mindist = INF
        minpair = None
@ -519,16 +508,43 @@ def group_hier(groupfunc, objs, distfunc):
 ##
 class LTPage(LayoutContainer):

-    def __init__(self, id, bbox, rotate=0):
+    def __init__(self, pageid, bbox, rotate=0):
        LayoutContainer.__init__(self, bbox)
-        self.id = id
+        self.pageid = pageid
        self.rotate = rotate
+        self.layout = None
        return

    def __repr__(self):
-        return ('<page id=%r bbox=%s rotate=%r>' % (self.id, bbox2str(self.bbox), self.rotate))
+        return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))

-    def analyze_layout(self, laparams):
+    def fixate(self, laparams):
+        """Perform the layout analysis."""
+        LayoutContainer.fixate(self)
+        (textobjs, otherobjs) = self.get_textobjs()
+        if not laparams or not textobjs: return
+        if laparams.direction == 'V':
+            textboxes = self.build_textbox_vertical(textobjs, laparams)
+            top = self.group_textbox_vertical(textboxes, laparams)
+        else:
+            textboxes = self.build_textbox_horizontal(textobjs, laparams)
+            top = self.group_textbox_horizontal(textboxes, laparams)
+        def assign_index(obj, i):
+            if isinstance(obj, LTTextBox):
+                obj.index = i
+                i += 1
+            elif isinstance(obj, LTTextGroup):
+                for x in obj:
+                    i = assign_index(x, i)
+            return i
+        assign_index(top, 0)
+        textboxes.sort(key=lambda box:box.index)
+        self.objs = textboxes + otherobjs
+        self.layout = top
+        return
+
+    def get_textobjs(self):
+        """Split all the objects in the page into text-related objects and others."""
        textobjs = []
        otherobjs = []
        for obj in self.objs:
@ -536,16 +552,11 @@ class LTPage(LayoutContainer):
                textobjs.append(obj)
            else:
                otherobjs.append(obj)
-        if laparams.direction == 'V':
-            textobjs = self.analyze_layout_vertical(textobjs, laparams)
-        else:
-            textobjs = self.analyze_layout_horizontal(textobjs, laparams)
-        self.objs = [textobjs] + otherobjs
-        return
+        return (textobjs, otherobjs)

-    def analyze_layout_horizontal(self, objs, laparams):
-        
-        def halign(obj1, obj2):
+    def build_textbox_horizontal(self, objs, laparams):
+        """Identify horizontal text regions in the page."""
+        def aligned(obj1, obj2):
            # +------+ - - -
            # | obj1 | - - +------+   -
            # |      |     | obj2 |   | (line_overlap)
@ -556,12 +567,11 @@ class LTPage(LayoutContainer):
            #      (char_margin)
            return ((min(obj1.height, obj2.height) * laparams.line_overlap < obj1.voverlap(obj2)) and
                    (obj1.hdistance(obj2) < min(obj1.width, obj2.width) * laparams.char_margin))
-        
        lines = []
        line = []
        prev = None
        for cur in objs:
-            if prev is not None and not halign(prev, cur):
+            if prev is not None and not aligned(prev, cur):
                if line:
                    lines.append(LTTextLineHorizontal(line, laparams.word_margin))
                    line = []
@ -569,18 +579,11 @@ class LTPage(LayoutContainer):
            prev = cur
        if line:
            lines.append(LTTextLineHorizontal(line, laparams.word_margin))
-        boxes = build_boxes(LTTextBoxHorizontal, lines, laparams.line_margin)
+        return group_lines(LTTextBoxHorizontal, lines, laparams.line_margin)

-        def dist(obj1, obj2):
-            return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
-                    (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
-                    obj1.width*obj1.height - obj2.width*obj2.height)
-        
-        return group_hier(LTTextFlowHorizontal, boxes, dist)
-
-    def analyze_layout_vertical(self, objs, laparams):
-
-        def valign(obj1, obj2):
+    def build_textbox_vertical(self, objs, laparams):
+        """Identify vertical text regions in the page."""
+        def aligned(obj1, obj2):
            # +------+
            # | obj1 |
            # |      |
@ -595,12 +598,11 @@ class LTPage(LayoutContainer):
            # (line_overlap)
            return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
                    (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
-        
        lines = []
        line = []
        prev = None
        for cur in objs:
-            if prev is not None and not valign(prev, cur):
+            if prev is not None and not aligned(prev, cur):
                if line:
                    lines.append(LTTextLineVertical(line, laparams.word_margin))
                    line = []
@ -608,11 +610,18 @@ class LTPage(LayoutContainer):
            prev = cur
        if line:
            lines.append(LTTextLineVertical(line, laparams.word_margin))
-        boxes = build_boxes(LTTextBoxVertical, lines, laparams.line_margin)
+        return group_lines(LTTextBoxVertical, lines, laparams.line_margin)

+    def group_textbox_horizontal(self, boxes, laparams):
        def dist(obj1, obj2):
            return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
                    (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
                    obj1.width*obj1.height - obj2.width*obj2.height)
+        return group_boxes(LTTextGroupHorizontal, boxes, dist)

-        return group_hier(LTTextFlowVertical, boxes, dist)
+    def group_textbox_vertical(self, boxes, laparams):
+        def dist(obj1, obj2):
+            return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
+                    (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
+                    obj1.width*obj1.height - obj2.width*obj2.height)
+        return group_boxes(LTTextGroupVertical, boxes, dist)
--- a/samples/Makefile
+++ b/samples/Makefile
@ -38,7 +38,7 @@ XMLS= \
 	naacl06-shinyama.xml \
 	nlp2004slides.xml

-all: htmls texts xmls
+test: htmls texts xmls

 clean:
 	-$(RM) $(HTMLS)
@ -53,9 +53,12 @@ xmls: $(XMLS)

 .pdf.html:
 	$(PDF2TXT) -t html $< > $@
+#	$(CMP) $@ $@.ref

 .pdf.xml:
 	$(PDF2TXT) -t xml $< > $@
+#	$(CMP) $@ $@.ref

 .pdf.txt:
 	$(PDF2TXT) -t text $< > $@
+#	$(CMP) $@ $@.ref