Handle Type3 font size correctly.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-01-10 10:45:49 +00:00 · 2009-01-10 10:45:49 +00:00 · ba277fb5a0
parent 91770edd46
commit ba277fb5a0
4 changed files with 62 additions and 39 deletions
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@ -90,11 +90,11 @@ class TextItem(object):
    self.direction = 0
    self.text = ''
    scaling *= .01
    size = (font.get_ascent() - font.get_descent()) * fontsize
    if not self.font.is_vertical():
-      spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
+      # horizontal text
      spwidth = font.char_width(32) * self.SPACE_WIDTH # space width
      self.direction = 1
      (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
      ty += descent
      w = 0
      dx = 0
      prev = ' '
@ -106,14 +106,18 @@ class TextItem(object):
          self.text += char
          prev = char
          dx = 0
-          w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
+          w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
        else:
          t *= .001
          dx -= t
-          w += t * fontsize * .001 * scaling
+          w += t * fontsize * scaling
-      (w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
+      (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
      ty += descent
      (w,h) = apply_matrix_norm(self.matrix, (w,size))
      self.adv = (w, 0)
      self.bbox = (tx, ty, tx+w, ty+h)
    else:
      # vertical text
      self.direction = 2
      disp = 0
      h = 0
@ -122,19 +126,19 @@ class TextItem(object):
          (disp,char) = t
          (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
          self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
+          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
          break
      for t in text:
        if isinstance(t, tuple):
          (_,char) = t
          self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
+          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
-      (w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
+      (w,h) = apply_matrix_norm(self.matrix, (size,h))
      tx -= w/2
      ty += disp
      self.adv = (0, h)
      self.bbox = (tx, ty+h, tx+w, ty)
-    self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
+    self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
    return
  def __repr__(self):
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \
     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
 from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
 from utils import apply_matrix_norm
 ##  Fonts
@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 # PDFFont
 class PDFFont(object):
-  def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
+  def __init__(self, descriptor, widths, default_width=None):
    self.descriptor = descriptor
    self.widths = widths
    self.fontname = descriptor.get('FontName', 'unknown')
@ -37,7 +38,6 @@ class PDFFont(object):
    self.default_width = default_width or descriptor.get('MissingWidth', 0)
    self.leading = num_value(descriptor.get('Leading', 0))
    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
    self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
    return
  def __repr__(self):
@ -52,8 +52,13 @@ class PDFFont(object):
  def decode(self, bytes):
    return map(ord, bytes)
  def get_ascent(self):
    return self.ascent * .001
  def get_descent(self):
    return self.descent * .001
  def char_width(self, cid):
-    return self.widths.get(cid, self.default_width)
+    return self.widths.get(cid, self.default_width) * .001
  def char_disp(self, cid):
    return 0
@ -61,10 +66,11 @@ class PDFFont(object):
  def string_width(self, s):
    return sum( self.char_width(cid) for cid in self.decode(s) )
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
-  def __init__(self, descriptor, widths, spec, font_matrix=None):
+  def __init__(self, descriptor, widths, spec):
    # Font encoding is specified either by the name of a
    # built-in encoding or by a dictionary that describes
    # the differences from it.
@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont):
      strm = stream_value(spec['ToUnicode'])
      self.ucs2_cmap = CMap()
      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
-    PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
+    PDFFont.__init__(self, descriptor, widths)
    return
  def to_unicode(self, cid):
@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont):
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
    try:
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font):
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
    firstchar = int_value(spec.get('FirstChar', 0))
    lastchar = int_value(spec.get('LastChar', 0))
    widths = list_value(spec.get('Widths', [0]*256))
@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont):
      descriptor = {'FontName':spec.get('Name'),
                    'Ascent':0, 'Descent':0,
                    'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, descriptor, widths, spec,
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
-                           font_matrix=tuple(list_value(spec.get('FontMatrix'))))
+    self.matrix = tuple(list_value(spec.get('FontMatrix')))
    (_,self.descent,_,self.ascent) = self.bbox
    (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
    return
  def __repr__(self):
    return '<PDFType3Font>'
  def get_ascent(self):
    return self.ascent * self.vscale
  def get_descent(self):
    return self.descent * self.vscale
  def char_width(self, cid):
    return self.widths.get(cid, self.default_width) * self.hscale
 # PDFCIDFont
@ -229,7 +245,7 @@ class TrueTypeFont(object):
 class PDFCIDFont(PDFFont):
-  def __init__(self, spec):
+  def __init__(self, rsrc, spec):
    try:
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont):
        raise PDFFontError('Encoding is unspecified')
      name = 'unknown'
    try:
-      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
+      self.cmap = rsrc.get_cmap(name, strict=STRICT)
    except CMapDB.CMapNotFound, e:
      raise PDFFontError(e)
    try:
@ -273,7 +289,7 @@ class PDFCIDFont(PDFFont):
          pass
    else:
      try:
-        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
+        self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
                                       strict=STRICT)
      except CMapDB.CMapNotFound, e:
        raise PDFFontError(e)
@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont):
      raise PDFUnicodeNotDefined(self.cidcoding, cid)
    chars = unpack('>%dH' % (len(code)/2), code)
    return ''.join( unichr(c) for c in chars )
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
 from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
 from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
 from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
     LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
 from pdflib.cmap import CMapDB
 ##  Exceptions
@ -59,6 +60,9 @@ class PDFResourceManager(object):
        pass
    return
  def get_cmap(self, cmapname, strict=False):
    return CMapDB.get_cmap(cmapname, strict=strict)
  def get_font(self, objid, spec):
    if objid and objid in self.fonts:
      font = self.fonts[objid]
@ -75,16 +79,16 @@ class PDFResourceManager(object):
        subtype = 'Type1'
      if subtype in ('Type1', 'MMType1'):
        # Type1 Font
-        font = PDFType1Font(spec)
+        font = PDFType1Font(self, spec)
      elif subtype == 'TrueType':
        # TrueType Font
-        font = PDFTrueTypeFont(spec)
+        font = PDFTrueTypeFont(self, spec)
      elif subtype == 'Type3':
        # Type3 Font
-        font = PDFType3Font(spec)
+        font = PDFType3Font(self, spec)
      elif subtype in ('CIDFontType0', 'CIDFontType2'):
        # CID Font
-        font = PDFCIDFont(spec)
+        font = PDFCIDFont(self, spec)
      elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
@ -535,16 +539,17 @@ class PDFPageInterpreter(object):
    self.device.render_string(textstate, textmatrix, seq)
    font = textstate.font
    s = ''.join( x for x in seq if isinstance(x, str) )
-    n = sum( x for x in seq if not isinstance(x, str) )
+    w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
-    w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
+         len(s) * textstate.charspace)
    if not font.is_multibyte():
      w += s.count(' ')*textstate.wordspace
    w *= (textstate.scaling * .01)
    (lx,ly) = textstate.linematrix
    if font.is_vertical():
-      ly += w
+      # advance vertically
      ly += w * (textstate.scaling * .01)
    else:
-      lx += w
+      # advance horizontally
      if not font.is_multibyte():
        w += s.count(' ')*textstate.wordspace
      lx += w * (textstate.scaling * .01)
    textstate.linematrix = (lx,ly)
    return
  # show
--- a/samples/Makefile
+++ b/samples/Makefile
@ -2,7 +2,7 @@
 PYTHON=python
 CDBCMAPDIR=../CDBCMap
-PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
+PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt
 HTMLS= \
 	simple1.html \
@ -22,4 +22,4 @@ clean:
 .SUFFIXES: .pdf .html
 .pdf.html:
-	$(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $<
+	$(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<