text rise support added

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@217 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-05-18 14:57:04 +00:00 · 2010-05-18 14:57:04 +00:00 · fe3bdbfce0
parent 479c920ec7
commit fe3bdbfce0
6 changed files with 28 additions and 21 deletions
--- a/3
+++ b/3
@ -4,7 +4,6 @@
 PACKAGE=pdfminer
 PREFIX=/usr/local

-SVN=svn
 PYTHON=python
 RM=rm -f
 CP=cp -f
@ -22,8 +21,6 @@ clean:

 distclean: clean test_clean cmap_clean

-commit: distclean
-	$(SVN) commit
 pack: distclean
 	$(PYTHON) setup.py sdist
 register: distclean
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Apr 24 04:30:10 UTC 2010
+Last Modified: Mon May 10 23:02:20 UTC 2010
 <!-- hhmts end -->
 </div>

@ -64,7 +64,8 @@ PDF parser that can be used for other purposes instead of text analysis.
 <li> Reconstruct the original layout by grouping text chunks.
 </ul>
 <p>
-On the performance, PDFMiner is about 20 times slower than 
+On the performance side, 
+PDFMiner is about 20 times slower than 
 other C/C++-based software such as XPdf.

 <a name="source"></a>
--- a/docs/usage.html
+++ b/docs/usage.html
@ -56,7 +56,7 @@ for page in doc.get_pages():
 </pre></blockquote>

 <p>
-In PDFMiner, there are several objects involved in parsing a PDF file,
+In PDFMiner, there are several Python classes involved in parsing a PDF file,
 as shown in Figure 1.

 <div>
@ -68,7 +68,12 @@ as shown in Figure 1.
 <hr noshade>
 <h2>Accessing Layout Objects</h2>
 <p>
-PDFMiner performs a basic layout analysis.
+A PDF document is more like a graphic than a text document.
+It presents no logical structure such as sentences or paragraphs (in most cases).
+PDFMiner tries to reconstruct the original structure by performing
+basic layout analysis.
+<p>
+

 <blockquote><pre>
 from pdfminer.layout import LAParams
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -91,8 +91,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
            self.cur_item.add(LTPolygon(gstate.linewidth, pts))
        return

-    def render_char(self, matrix, font, fontsize, scaling, cid):
-        item = LTChar(matrix, font, fontsize, scaling, cid)
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
+        item = LTChar(matrix, font, fontsize, scaling, rise, cid)
        self.cur_item.add(item)
        return item.adv

--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -204,7 +204,7 @@ class LTChar(LTItem, LTText):

    debug = 0

-    def __init__(self, matrix, font, fontsize, scaling, cid):
+    def __init__(self, matrix, font, fontsize, scaling, rise, cid):
        self.matrix = matrix
        self.font = font
        self.fontsize = fontsize
@ -214,6 +214,8 @@ class LTChar(LTItem, LTText):
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = '?'
+        (a,b,c,d,e,f) = self.matrix
+        self.upright = (0 < a*d*scaling and b*c <= 0)
        LTText.__init__(self, text)
        # compute the boundary rectangle.
        if self.vertical:
@ -224,7 +226,7 @@ class LTChar(LTItem, LTText):
            (dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
            (_,_,_,_,tx,ty) = self.matrix
            tx -= dx/2
-            ty += displacement
+            ty += displacement + rise
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal
@ -233,7 +235,7 @@ class LTChar(LTItem, LTText):
            (_,descent) = apply_matrix_norm(self.matrix, (0, descent))
            (dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
            (_,_,_,_,tx,ty) = self.matrix
-            ty += descent
+            ty += descent + rise
            bbox = (tx, ty, tx+dx, ty+dy)
        LTItem.__init__(self, bbox)
        return
@ -253,8 +255,7 @@ class LTChar(LTItem, LTText):
        return self.vertical

    def is_upright(self):
-        (a,b,c,d,e,f) = self.matrix
-        return 0 < a*d and b*c <= 0
+        return self.upright

    
 ##  LTContainer
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -66,19 +66,22 @@ class PDFTextDevice(PDFDevice):
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
+        rise = textstate.rise
        if font.is_multibyte():
            wordspace = 0
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
-                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
+                seq, matrix, textstate.linematrix, font, fontsize,
+                scaling, charspace, wordspace, rise, dxscale)
        else:
            textstate.linematrix = self.render_string_horizontal(
-                seq, matrix, textstate.linematrix, font, fontsize, scaling, charspace, wordspace, dxscale)
+                seq, matrix, textstate.linematrix, font, fontsize,
+                scaling, charspace, wordspace, rise, dxscale)
        return
    
    def render_string_horizontal(self, seq, matrix, (x,y), 
-                                 font, fontsize, scaling, charspace, wordspace, dxscale):
+                                 font, fontsize, scaling, charspace, wordspace, rise, dxscale):
        needcharspace = False
        for obj in seq:
            if isinstance(obj, int) or isinstance(obj, float):
@ -89,14 +92,14 @@ class PDFTextDevice(PDFDevice):
                    if needcharspace:
                        x += charspace
                    x += self.render_char(translate_matrix(matrix, (x,y)),
-                                          font, fontsize, scaling, cid)
+                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)

    def render_string_vertical(self, seq, matrix, (x,y), 
-                               font, fontsize, scaling, charspace, wordspace, dxscale):
+                               font, fontsize, scaling, charspace, wordspace, rise, dxscale):
        needcharspace = False
        for obj in seq:
            if isinstance(obj, int) or isinstance(obj, float):
@ -107,13 +110,13 @@ class PDFTextDevice(PDFDevice):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(translate_matrix(matrix, (x,y)), 
-                                          font, fontsize, scaling, cid)
+                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
        return (x, y)

-    def render_char(self, matrix, font, fontsize, scaling, cid):
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
        return 0