From 173d0955229332b277c5d4de68047b8c6f6b8688 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 16 May 2009 10:42:35 +0000
Subject: [PATCH] text spacing bug fixed

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@106 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 pdfminer/converter.py | 14 +++++++-------
 pdfminer/pdffont.py   | 15 ++++-----------
 pdfminer/pdfinterp.py |  9 +++------
 tools/pdf2txt.py      |  2 ++
 4 files changed, 16 insertions(+), 24 deletions(-)
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 3776699..4b1b5f9 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -86,12 +86,14 @@ class PDFPageAggregator(PDFDevice):
   def render_string(self, textstate, textmatrix, seq):
     font = textstate.font
     textmatrix = mult_matrix(textmatrix, self.ctm)
+    scaling = textstate.scaling * .01
+    dxscale = scaling / (font.hscale*1000) * .01
+    wordspace = textstate.wordspace * scaling
     chars = []
     for x in seq:
       if isinstance(x, int) or isinstance(x, float):
         (dx,dy) = self.render_chars(textmatrix, textstate, chars)
-        dx -= x * textstate.scaling * .0001
-        textmatrix = translate_matrix(textmatrix, (dx, dy))
+        textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
         chars = []
       else:
         for cid in font.decode(x):
@@ -101,10 +103,9 @@ class PDFPageAggregator(PDFDevice):
             (cidcoding, cid) = e.args
             char = self.handle_undefined_char(cidcoding, cid)
           chars.append((char, cid))
-          if textstate.wordspace and not font.is_multibyte() and cid == 32:
+          if cid == 32 and textstate.wordspace and not font.is_multibyte():
             (dx,dy) = self.render_chars(textmatrix, textstate, chars)
-            dx += textstate.wordspace * textstate.scaling * .01
-            textmatrix = translate_matrix(textmatrix, (dx, dy))
+            textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
             chars = []
     self.render_chars(textmatrix, textstate, chars)
     return
@@ -238,7 +239,6 @@ class HTMLConverter(PDFConverter):
                      self.codec)
     self.outfp.write('</head><body>\n')
     self.yoffset = self.pagepad
-    self.show_text_border = False
     return
 
   def write_rect(self, color, width, x, y, w, h):
@@ -268,7 +268,7 @@ class HTMLConverter(PDFConverter):
                           item.fontsize*self.scale))
         self.write(item.text)
         self.outfp.write('</span>\n')
-        if self.show_text_border:
+        if self.debug:
           self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
       elif isinstance(item, LTLine) or isinstance(item, LTRect):
         self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index a3de399..cc3af4e 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -330,6 +330,7 @@ class PDFFont(object):
     self.default_width = default_width or descriptor.get('MissingWidth', 0)
     self.leading = num_value(descriptor.get('Leading', 0))
     self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
+    self.hscale = self.vscale = .001
     return
 
   def __repr__(self):
@@ -345,12 +346,12 @@ class PDFFont(object):
     return map(ord, bytes)
 
   def get_ascent(self):
-    return self.ascent * .001
+    return self.ascent * self.vscale
   def get_descent(self):
-    return self.descent * .001
+    return self.descent * self.vscale
 
   def char_width(self, cid):
-    return self.widths.get(cid, self.default_width) * .001
+    return self.widths.get(cid, self.default_width) * self.hscale
 
   def char_disp(self, cid):
     return 0
@@ -448,14 +449,6 @@ class PDFType3Font(PDFSimpleFont):
   def __repr__(self):
     return '<PDFType3Font>'
 
-  def get_ascent(self):
-    return self.ascent * self.vscale
-  def get_descent(self):
-    return self.descent * self.vscale
-
-  def char_width(self, cid):
-    return self.widths.get(cid, self.default_width) * self.hscale
-
 
 # PDFCIDFont
 class PDFCIDFont(PDFFont):
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index a39e354..6ad9883 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -754,18 +754,15 @@ class PDFPageInterpreter(object):
 
 ##  process_pdf
 ##
-class TextExtractionNotAllowed(RuntimeError): pass
+class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
 
 def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
   doc = PDFDocument()
   fp = file(fname, 'rb')
   parser = PDFParser(doc, fp)
-  try:
-    doc.initialize(password)
-  except PDFPasswordIncorrect:
-    raise TextExtractionNotAllowed('Incorrect password')
+  doc.initialize(password)
   if not doc.is_extractable:
-    raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
+    raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
   interpreter = PDFPageInterpreter(rsrc, device)
   for (pageno,page) in enumerate(doc.get_pages()):
     if pagenos and (pageno not in pagenos): continue
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 96a7a23..2e8ed18 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -2,6 +2,7 @@
 import sys
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
 from pdfminer.cmap import CMapDB
 
@@ -51,6 +52,7 @@ def main(argv):
   PDFDocument.debug = debug
   PDFParser.debug = debug
   PDFPageInterpreter.debug = debug
+  PDFDevice.debug = debug
   #
   CMapDB.initialize(cmapdir, cdbcmapdir)
   rsrc = PDFResourceManager()