From 6590ad42f5e17641b9ff204664be097015845026 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sun, 20 Dec 2009 02:38:01 +0000
Subject: [PATCH] experimental polygon extraction.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@166 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 docs/index.html       |  8 +++---
 pdfminer/converter.py | 22 ++++++++-------
 pdfminer/layout.py    | 63 ++++++++++++++++++++++++++++++-------------
 pdfminer/pdffont.py   |  2 +-
 4 files changed, 63 insertions(+), 32 deletions(-)
diff --git a/docs/index.html b/docs/index.html
index 63f92f8..b5e58f1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,7 +19,7 @@ Python PDF parser and analyzer
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Dec 20 00:09:12 JST 2009
+Last Modified: Sun Dec 20 01:25:02 JST 2009
 <!-- hhmts end -->
 </div>
 
@@ -41,7 +41,7 @@ Last Modified: Sun Dec 20 00:09:12 JST 2009
 <h2>What's It?</h2>
 <p>
 PDFMiner is a suite of programs that help
-extracting some meaningful informatin out of PDF documents.
+extracting some meaningful information out of PDF documents.
 Unlike other PDF-related tools, it focuses entirely on getting 
 and analyzing text data from PDFs. PDFMiner allows to obtain
 the exact location of texts in a page, as well as 
@@ -95,7 +95,7 @@ http://pdf2html.tabesugi.net:8080/
 <ol>
 <li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
 <li> Download the <a href="#source">PDFMiner source</a>.
-<li> Extract it.
+<li> Unpack it.
 <li> Run <code>setup.py</code> to install:<br>
 <blockquote><pre>
 # <strong>python setup.py install</strong>
@@ -344,7 +344,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
-<li> 2009/12/19: CMap resources are now the part of the package. Thanks to Adobe for opensourcing them.
+<li> 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
 <li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
 <li> 2009/10/31: SGML output format is changed and renamed as XML.
 <li> 2009/10/24: Charspace bug fixed. Adjusted for 4-space indentation.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 4d39103..69c25b8 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -3,7 +3,7 @@ import sys
 from pdfdevice import PDFDevice, PDFTextDevice
 from pdffont import PDFUnicodeNotDefined
 from layout import LayoutContainer
-from layout import LTPage, LTText, LTLine, LTRect
+from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
 from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
 from utils import enc
 from utils import apply_matrix_pt, mult_matrix
@@ -116,12 +116,7 @@ class PDFPageAggregator(PDFTextDevice):
             (_,x1,y1) = path[1]
             (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
             (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
-            if y0 == y1:
-                # horizontal ruler
-                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
-            elif x0 == x1:
-                # vertical ruler
-                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
+            self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
         elif shape == 'mlllh':
             # rectangle
             (_,x0,y0) = path[0]
@@ -135,6 +130,13 @@ class PDFPageAggregator(PDFTextDevice):
             if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                 (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                 self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
+        else:
+            # other polygon
+            pts = []
+            for p in path:
+                for i in xrange(1, len(p), 2):
+                    pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
+            self.cur_item.add(LTPolygon(gstate.linewidth, pts))
         return
 
     def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
@@ -177,10 +179,12 @@ class XMLConverter(PDFConverter):
                 for child in item:
                     render(child)
                 self.outfp.write('</page>\n')
-            elif isinstance(item, LTLine):
+            elif isinstance(item, LTLine) and item.direction:
                 self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
             elif isinstance(item, LTRect):
                 self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
+            elif isinstance(item, LTPolygon):
+                self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
             elif isinstance(item, LTFigure):
                 self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
                 for child in item:
@@ -263,7 +267,7 @@ class HTMLConverter(PDFConverter):
                 self.outfp.write('</span>\n')
                 if self.debug:
                     self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
-            elif isinstance(item, LTLine) or isinstance(item, LTRect):
+            elif isinstance(item, LTPolygon):
                 self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
             elif isinstance(item, LTTextLine):
                 for child in item:
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 7a15ce7..1b940e4 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -6,6 +6,20 @@ from utils import apply_matrix_pt
 from utils import bsearch
 
 
+
+##  get_bounds
+##
+def get_bounds(pts):
+    """Compute the minimal rectangle (x0,y0,x1,y1) that covers all the points."""
+    (x0, y0, x1, y1) = (INF, INF, -INF, -INF)  # inverted box; returned unchanged if pts is empty
+    for (x,y) in pts:
+        x0 = min(x0, x)
+        y0 = min(y0, y)
+        x1 = max(x1, x)
+        y1 = max(y1, y)
+    return (x0,y0,x1,y1)
+
+
 ##  LAParams
 ##
 class LAParams(object):
@@ -228,24 +242,44 @@ class LayoutContainer(LayoutItem):
         return None
 
 
+##  LTPolygon
+##
+class LTPolygon(LayoutItem):
+    """Layout item for a drawn path, kept as a sequence of (x,y) points."""
+    def __init__(self, linewidth, pts):
+        LayoutItem.__init__(self, get_bounds(pts))  # bbox is the bounds of all points
+        self.pts = pts
+        self.linewidth = linewidth
+        return
+
+    def get_pts(self):
+        return ','.join( '%.3f,%.3f' % p for p in self.pts )  # flat "x,y,x,y,..." string, 3 decimals each
+
+
 ##  LTLine
 ##
-class LTLine(LayoutItem):
+class LTLine(LTPolygon):
 
-    def __init__(self, linewidth, direction, bbox):
-        LayoutItem.__init__(self, bbox)
-        self.linewidth = linewidth
-        self.direction = direction
+    def __init__(self, linewidth, p0, p1):  # p0, p1: the two (x,y) endpoints
+        (x0,y0) = p0
+        (x1,y1) = p1  # must unpack the second endpoint, or y0 == y1 always holds
+        self.direction = None  # stays None for a line that is neither horizontal nor vertical
+        if y0 == y1:
+            # horizontal ruler
+            self.direction = 'H'
+        elif x0 == x1:
+            # vertical ruler
+            self.direction = 'V'
+        LTPolygon.__init__(self, linewidth, [p0, p1])  # sets pts, linewidth and the bbox
         return
 
 
 ##  LTRect
 ##
-class LTRect(LayoutItem):
+class LTRect(LTPolygon):
 
-    def __init__(self, linewidth, bbox):
-        LayoutItem.__init__(self, bbox)
-        self.linewidth = linewidth
+    def __init__(self, linewidth, (x0,y0,x1,y1)):
+        LTPolygon.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])  # the four corners, in perimeter order
         return
 
 
@@ -339,15 +373,8 @@ class LTFigure(LayoutContainer):
 
     def __init__(self, id, bbox, matrix):
         (x,y,w,h) = bbox
-        x0 = y0 = INF
-        x1 = y1 = -INF
-        for (p,q) in ((x,y),(x+w,y),(x,y+h),(x+w,y+h)):
-            (p,q) = apply_matrix_pt(matrix, (p,q))
-            x0 = min(x0, p)
-            x1 = max(x1, p)
-            y0 = min(y0, q)
-            y1 = max(y1, q)
-        bbox = (x0,y0,x1,y1)
+        bbox = get_bounds( apply_matrix_pt(matrix, (p,q))  # bounds of the box's four corners after applying matrix
+                           for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
         self.matrix = matrix
         LayoutContainer.__init__(self, id, bbox)
         return
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index 1ed201f..fe544e6 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -518,7 +518,7 @@ class PDFCIDFont(PDFFont):
             try:
                 self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
             except CMapDB.CMapNotFound, e:
-                raise PDFFontError(e)
+                pass
 
         def get_width(seq):
             dic = {}