From ba277fb5a00ad5e719d478c01d9df7ae54e93ee0 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 10 Jan 2009 10:45:49 +0000
Subject: [PATCH] handling type3 font size correctly.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 pdflib/pdfdevice.py | 24 ++++++++++++++----------
 pdflib/pdffont.py   | 44 +++++++++++++++++++++++++++++---------------
 pdflib/pdfinterp.py | 29 +++++++++++++++++------------
 samples/Makefile    |  4 ++--
 4 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py
index df5c304..18954cc 100644
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@@ -90,11 +90,11 @@ class TextItem(object):
     self.direction = 0
     self.text = ''
     scaling *= .01
+    size = (font.get_ascent() - font.get_descent()) * fontsize
     if not self.font.is_vertical():
-      spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
+      # horizontal text
+      spwidth = font.char_width(32) * self.SPACE_WIDTH # space width
       self.direction = 1
-      (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
-      ty += descent
       w = 0
       dx = 0
       prev = ' '
@@ -106,14 +106,18 @@ class TextItem(object):
           self.text += char
           prev = char
           dx = 0
-          w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
+          w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
         else:
+          t *= .001
           dx -= t
-          w += t * fontsize * .001 * scaling
-      (w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
+          w += t * fontsize * scaling
+      (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
+      ty += descent
+      (w,h) = apply_matrix_norm(self.matrix, (w,size))
       self.adv = (w, 0)
       self.bbox = (tx, ty, tx+w, ty+h)
     else:
+      # vertical text
       self.direction = 2
       disp = 0
       h = 0
@@ -122,19 +126,19 @@ class TextItem(object):
           (disp,char) = t
           (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
           self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
+          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
           break
       for t in text:
         if isinstance(t, tuple):
           (_,char) = t
           self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
-      (w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
+          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
+      (w,h) = apply_matrix_norm(self.matrix, (size,h))
       tx -= w/2
       ty += disp
       self.adv = (0, h)
       self.bbox = (tx, ty+h, tx+w, ty)
-    self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
+    self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
     return
   
   def __repr__(self):
diff --git a/pdflib/pdffont.py b/pdflib/pdffont.py
index 30f6c1d..d929bf7 100644
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \
      resolve1, int_value, float_value, num_value, \
      str_value, list_value, dict_value, stream_value
 from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+from utils import apply_matrix_norm
 
 
 ##  Fonts
@@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 # PDFFont
 class PDFFont(object):
   
-  def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
+  def __init__(self, descriptor, widths, default_width=None):
     self.descriptor = descriptor
     self.widths = widths
     self.fontname = descriptor.get('FontName', 'unknown')
@@ -37,7 +38,6 @@ class PDFFont(object):
     self.default_width = default_width or descriptor.get('MissingWidth', 0)
     self.leading = num_value(descriptor.get('Leading', 0))
     self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
-    self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
     return
 
   def __repr__(self):
@@ -52,8 +52,13 @@ class PDFFont(object):
   def decode(self, bytes):
     return map(ord, bytes)
 
+  def get_ascent(self):
+    return self.ascent * .001
+  def get_descent(self):
+    return self.descent * .001
+
   def char_width(self, cid):
-    return self.widths.get(cid, self.default_width)
+    return self.widths.get(cid, self.default_width) * .001
 
   def char_disp(self, cid):
     return 0
@@ -61,10 +66,11 @@ class PDFFont(object):
   def string_width(self, s):
     return sum( self.char_width(cid) for cid in self.decode(s) )
 
+
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
   
-  def __init__(self, descriptor, widths, spec, font_matrix=None):
+  def __init__(self, descriptor, widths, spec):
     # Font encoding is specified either by a name of
     # built-in encoding or a dictionary that describes
     # the differences.
@@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont):
       strm = stream_value(spec['ToUnicode'])
       self.ucs2_cmap = CMap()
       CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
-    PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
+    PDFFont.__init__(self, descriptor, widths)
     return
 
   def to_unicode(self, cid):
@@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont):
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
   
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
     try:
       self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
@@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font):
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
   
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
     firstchar = int_value(spec.get('FirstChar', 0))
     lastchar = int_value(spec.get('LastChar', 0))
     widths = list_value(spec.get('Widths', [0]*256))
@@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont):
       descriptor = {'FontName':spec.get('Name'),
                     'Ascent':0, 'Descent':0,
                     'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, descriptor, widths, spec,
-                           font_matrix=tuple(list_value(spec.get('FontMatrix'))))
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
+    self.matrix = tuple(list_value(spec.get('FontMatrix')))
+    (_,self.descent,_,self.ascent) = self.bbox
+    (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
     return
 
   def __repr__(self):
     return '<PDFType3Font>'
 
+  def get_ascent(self):
+    return self.ascent * self.vscale
+  def get_descent(self):
+    return self.descent * self.vscale
+
+  def char_width(self, cid):
+    return self.widths.get(cid, self.default_width) * self.hscale
+
 
 # PDFCIDFont
 
@@ -229,7 +245,7 @@ class TrueTypeFont(object):
 
 class PDFCIDFont(PDFFont):
   
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
     try:
       self.basefont = literal_name(spec['BaseFont'])
     except KeyError:
@@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont):
         raise PDFFontError('Encoding is unspecified')
       name = 'unknown'
     try:
-      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
+      self.cmap = rsrc.get_cmap(name, strict=STRICT)
     except CMapDB.CMapNotFound, e:
       raise PDFFontError(e)
     try:
@@ -273,8 +289,8 @@ class PDFCIDFont(PDFFont):
           pass
     else:
       try:
-        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
-                                         strict=STRICT)
+        self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
+                                       strict=STRICT)
       except CMapDB.CMapNotFound, e:
         raise PDFFontError(e)
     
@@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont):
       raise PDFUnicodeNotDefined(self.cidcoding, cid)
     chars = unpack('>%dH' % (len(code)/2), code)
     return ''.join( unichr(c) for c in chars )
-
-
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 280c317..90ca1f8 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
 from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
      resolve1, int_value, float_value, num_value, \
      str_value, list_value, dict_value, stream_value
-from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
 from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
 from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
      LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
+from pdflib.cmap import CMapDB
 
 
 ##  Exceptions
@@ -58,6 +59,9 @@ class PDFResourceManager(object):
         #raise PDFResourceError('ProcSet %r is not supported.' % proc)
         pass
     return
+
+  def get_cmap(self, cmapname, strict=False):
+    return CMapDB.get_cmap(cmapname, strict=strict)
   
   def get_font(self, objid, spec):
     if objid and objid in self.fonts:
@@ -75,16 +79,16 @@ class PDFResourceManager(object):
         subtype = 'Type1'
       if subtype in ('Type1', 'MMType1'):
         # Type1 Font
-        font = PDFType1Font(spec)
+        font = PDFType1Font(self, spec)
       elif subtype == 'TrueType':
         # TrueType Font
-        font = PDFTrueTypeFont(spec)
+        font = PDFTrueTypeFont(self, spec)
       elif subtype == 'Type3':
         # Type3 Font
-        font = PDFType3Font(spec)
+        font = PDFType3Font(self, spec)
       elif subtype in ('CIDFontType0', 'CIDFontType2'):
         # CID Font
-        font = PDFCIDFont(spec)
+        font = PDFCIDFont(self, spec)
       elif subtype == 'Type0':
         # Type0 Font
         dfonts = list_value(spec['DescendantFonts'])
@@ -535,16 +539,17 @@ class PDFPageInterpreter(object):
     self.device.render_string(textstate, textmatrix, seq)
     font = textstate.font
     s = ''.join( x for x in seq if isinstance(x, str) )
-    n = sum( x for x in seq if not isinstance(x, str) )
-    w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
-    if not font.is_multibyte():
-      w += s.count(' ')*textstate.wordspace
-    w *= (textstate.scaling * .01)
+    w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
+         len(s) * textstate.charspace)
     (lx,ly) = textstate.linematrix
     if font.is_vertical():
-      ly += w
+      # advance vertically
+      ly += w * (textstate.scaling * .01)
     else:
-      lx += w
+      # advance horizontally
+      if not font.is_multibyte():
+        w += s.count(' ')*textstate.wordspace
+      lx += w * (textstate.scaling * .01)
     textstate.linematrix = (lx,ly)
     return
   # show
diff --git a/samples/Makefile b/samples/Makefile
index 757137c..67e3534 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -2,7 +2,7 @@
 
 PYTHON=python
 CDBCMAPDIR=../CDBCMap
-PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
+PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt
 
 HTMLS= \
 	simple1.html \
@@ -22,4 +22,4 @@ clean:
 
 .SUFFIXES: .pdf .html
 .pdf.html:
-	$(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $<
+	$(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<