charspace bug fixed.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@139 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-10-23 14:51:40 +00:00 · 2009-10-23 14:51:40 +00:00 · a1591f6a4d
parent ee97f18d4e
commit a1591f6a4d
3 changed files with 19 additions and 5 deletions
--- a/docs/index.html
+++ b/docs/index.html
@ -18,7 +18,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Oct  4 12:47:18 JST 2009
+Last Modified: Thu Oct 15 19:01:07 JST 2009
 <!-- hhmts end -->
 </div>

@ -141,6 +141,7 @@ PDFMiner comes with two handy tools:
 <p>
 <code>pdf2txt.py</code> extracts text contents from a PDF file.
 It extracts all the texts that are to be rendered programmatically,
+i.e., text represented as ASCII or Unicode strings.
 It cannot recognize texts drawn as images that would require optical character recognition.
 It also extracts the corresponding locations, font names, font sizes, writing
 direction (horizontal or vertical) for each text portion.
@ -183,9 +184,9 @@ By default, it extracts texts from all the pages.
 <dt> <code>-t <em>type</em></code> 
 <dd> Specifies the output format. The following formats are currently supported.
 <ul>
-<li> <code>html</code> : HTML format. (Default)
-<li> <code>text</code> : TEXT format.
-<li> <code>sgml</code> : SGML format.
+<li> <code>text</code> : TEXT format. (Default)
+<li> <code>html</code> : HTML format. Not recommended for extraction purposes because the markup is very messy.
+<li> <code>sgml</code> : SGML format. Provides the most information available.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
 Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -269,7 +269,7 @@ class LTTextItem(LayoutItem, LTText):
    self.vertical = font.is_vertical()
    self.text = ''.join( char for (char,_) in chars )
    adv = sum( font.char_width(cid) for (_,cid) in chars )
-    adv = (adv * fontsize + len(chars)*charspace) * scaling
+    adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
    #size = (font.get_ascent() - font.get_descent()) * fontsize
    size = font.get_size() * fontsize
    (_,_,_,_,tx,ty) = self.matrix
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -71,6 +71,7 @@ class PDFTextDevice(PDFDevice):
    wordspace = textstate.wordspace * scaling
    dxscale = .001 * fontsize * scaling
    chars = []
+    needspace = False
    (x,y) = textstate.linematrix
    for obj in seq:
      if isinstance(obj, int) or isinstance(obj, float):
@ -84,6 +85,7 @@ class PDFTextDevice(PDFDevice):
        else:
          x += d
        chars = []
+        needspace = False
      else:
        for cid in font.decode(obj):
          try:
@ -93,8 +95,14 @@ class PDFTextDevice(PDFDevice):
            char = self.handle_undefined_char(cidcoding, cid)
          chars.append((char, cid))
          if cid == 32 and textstate.wordspace and not font.is_multibyte():
+            if needspace:
+              if font.is_vertical():
+                y += charspace
+              else:
+                x += charspace
            (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
                                        fontsize, charspace, scaling, chars)
+            needspace = True
            x += dx
            y += dy
            if font.is_vertical():
@ -103,6 +111,11 @@ class PDFTextDevice(PDFDevice):
              x += wordspace
            chars = []
    if chars:
+      if needspace:
+        if font.is_vertical():
+          y += charspace
+        else:
+          x += charspace
      (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
                                  fontsize, charspace, scaling, chars)
      x += dx