text rise support added

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-05-18 14:57:04 +00:00 · 2010-05-18 14:57:04 +00:00 · fe3bdbfce0
parent 479c920ec7
commit fe3bdbfce0
6 changed files with 28 additions and 21 deletions
--- a/3
+++ b/3
@ -4,7 +4,6 @@
 PACKAGE=pdfminer
 PREFIX=/usr/local
 SVN=svn
 PYTHON=python
 RM=rm -f
 CP=cp -f
@ -22,8 +21,6 @@ clean:
 distclean: clean test_clean cmap_clean
 commit: distclean
 	$(SVN) commit
 pack: distclean
 	$(PYTHON) setup.py sdist
 register: distclean
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Apr 24 04:30:10 UTC 2010
+Last Modified: Mon May 10 23:02:20 UTC 2010
 <!-- hhmts end -->
 </div>
@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
 <li> Reconstruct the original layout by grouping text chunks.
 </ul>
 <p>
-On the performance, PDFMiner is about 20 times slower than 
+On the performance side, 
+PDFMiner is about 20 times slower than 
 other C/C++-based software such as XPdf.
 <a name="source"></a>
--- a/docs/usage.html
+++ b/docs/usage.html
@ -56,7 +56,7 @@ for page in doc.get_pages():
 </pre></blockquote>
 <p>
-In PDFMiner, there are several objects involved in parsing a PDF file,
+In PDFMiner, there are several Python classes involved in parsing a PDF file,
 as shown in Figure 1.
 <div>
@ -68,7 +68,12 @@ as shown in Figure 1.
 <hr noshade>
 <h2>Accessing Layout Objects</h2>
 <p>
-PDFMiner performs a basic layout analysis.
+PDF documents are more like graphics, rather than text documents.
+It presents no logical structure such as sentences or paragraphs (for most cases).
+PDFMiner tries to reconstruct the original structure by performing
+basic layout analysis.
 <p>
 <blockquote><pre>
 from pdfminer.layout import LAParams
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
            self.cur_item.add(LTPolygon(gstate.linewidth, pts))
        return
-    def render_char(self, matrix, font, fontsize, scaling, cid):
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
-        item = LTChar(matrix, font, fontsize, scaling, cid)
+        item = LTChar(matrix, font, fontsize, scaling, rise, cid)
        self.cur_item.add(item)
        return item.adv
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):
    debug = 0
-    def __init__(self, matrix, font, fontsize, scaling, cid):
+    def __init__(self, matrix, font, fontsize, scaling, rise, cid):
        self.matrix = matrix
        self.font = font
        self.fontsize = fontsize
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = '?'
+        (a,b,c,d,e,f) = self.matrix
+        self.upright = (0 < a*d*scaling and b*c <= 0)
        LTText.__init__(self, text)
        # compute the boundary rectangle.
        if self.vertical:
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
            (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
            (_,_,_,_,tx,ty) = self.matrix
            tx -= dx/2
-            ty += displacement
+            ty += displacement + rise
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
            (_,descent) = apply_matrix_norm(self.matrix, (0, descent))
            (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
            (_,_,_,_,tx,ty) = self.matrix
-            ty += descent
+            ty += descent + rise
            bbox = (tx, ty, tx+dx, ty+dy)
        LTItem.__init__(self, bbox)
        return
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
        return self.vertical
    def is_upright(self):
-        (a,b,c,d,e,f) = self.matrix
+        return self.upright
-        return 0 < a*d and b*c <= 0
 ##  LTContainer
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
+        rise = textstate.rise
        if font.is_multibyte():
            wordspace = 0
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
-                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
+                seq, matrix, textstate.linematrix, font, fontsize,
+                scaling, charspace, wordspace, rise, dxscale)
        else:
            textstate.linematrix = self.render_string_horizontal(
-                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
+                seq, matrix, textstate.linematrix, font, fontsize,
+                scaling, charspace, wordspace, rise, dxscale)
        return
    def render_string_horizontal(self, seq, matrix, (x,y), 
-                                 font, fontsize, scaling, charspace, wordspace, dxscale):
+                                 font, fontsize, scaling, charspace, wordspace, rise, dxscale):
        needcharspace = False
        for obj in seq:
            if isinstance(obj, int) or isinstance(obj, float):
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(translate_matrix(matrix, (x,y)),
-                                          font, fontsize, scaling, cid)
+                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)
    def render_string_vertical(self, seq, matrix, (x,y), 
-                               font, fontsize, scaling, charspace, wordspace, dxscale):
+                               font, fontsize, scaling, charspace, wordspace, rise, dxscale):
        needcharspace = False
        for obj in seq:
            if isinstance(obj, int) or isinstance(obj, float):
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(translate_matrix(matrix, (x,y)), 
-                                          font, fontsize, scaling, cid)
+                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
        return (x, y)
-    def render_char(self, matrix, font, fontsize, scaling, cid):
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
        return 0