Got text positioning right.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@87 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-04-18 17:15:49 +00:00 · 2009-04-18 17:15:49 +00:00 · 6d91453187
parent f8510edffc
commit 6d91453187
4 changed files with 78 additions and 86 deletions
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -28,8 +28,8 @@ def get_textobjs(item, r=None):
 ##  PDFConverter
 class PDFConverter(PDFPageAggregator):
-  def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
+  def __init__(self, rsrc, outfp, codec='ascii'):
-    PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
+    PDFPageAggregator.__init__(self, rsrc)
    self.outfp = outfp
    self.codec = codec
    return
@@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter):
 ##
 class HTMLConverter(PDFConverter):
-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
-    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec)
    self.pagenum = pagenum
    self.pagepad = pagepad
    self.scale = scale
@@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter):
    self.outfp.write('</head><body>\n')
    self.yoffset = self.pagepad
    self.cluster_margin = cluster_margin
    self.show_text_border = False
    return
  def end_page(self, page):
    from cluster import cluster_pageobjs
    page = PDFConverter.end_page(self, page)
    def f(item):
      if isinstance(item, FigureItem):
        for child in item.objs:
          f(child)
      elif isinstance(item, TextItem):
        if item.direction == 2:
          wmode = 'tb-rl'
        else:
          wmode = 'lr-tb'
        (x,_,_,y) = item.bbox
        self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
                         (wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
        self.outfp.write(enc(item.text, self.codec))
        self.outfp.write('</span>\n')
    (x0,y0,x1,y1) = page.bbox
    self.yoffset += y1
    if self.pagenum:
@@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter):
    self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
                     'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
                     (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
    def draw(item):
      if isinstance(item, FigureItem):
        for child in item.objs:
          draw(child)
      elif isinstance(item, TextItem):
        if item.direction == 2:
          wmode = 'tb-rl'
        else:
          wmode = 'lr-tb'
        (x0,y0,x1,y1) = item.bbox
        self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
                         (wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
        self.outfp.write(enc(item.text, self.codec))
        self.outfp.write('</span>\n')
        if self.show_text_border:
          self.outfp.write('<span style="position:absolute; border: 1px solid red; '
                           'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
                           (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
    for child in page.objs:
-      f(child)
+      draw(child)
    if self.cluster_margin:
      clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
      for ((x0,y0,x1,y1),_,objs) in clusters:
@@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):
-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
-    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
+    PDFConverter.__init__(self, rsrc, outfp, codec=codec)
    self.pagenum = pagenum
    if cluster_margin == None:
      cluster_margin = 0.5
@@ -288,9 +293,9 @@ def main(argv):
  CMapDB.initialize(cmapdir, cdbcmapdir)
  rsrc = PDFResourceManager()
  if outtype == 'sgml':
-    device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
+    device = SGMLConverter(rsrc, outfp, codec=codec)
  elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
+    device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
  elif outtype == 'text':
    device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
  elif outtype == 'tag':
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@@ -3,7 +3,8 @@ import sys
 stdout = sys.stdout
 stderr = sys.stderr
 from pdffont import PDFUnicodeNotDefined
-from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
+from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
     matrix2str, rect2str, point2str
 ##  PDFDevice
@@ -84,71 +85,48 @@ class TextItem(object):
    self.matrix = matrix
    self.font = font
    (_,_,_,_,tx,ty) = self.matrix
    self.origin = (tx,ty)
    self.direction = 0
    self.text = ''
-    scaling *= .01
+    adv = 0
    for (char,cid) in chars:
      self.text += char
      adv += font.char_width(cid)
    adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
    size = (font.get_ascent() - font.get_descent()) * fontsize
    if not self.font.is_vertical():
      # horizontal text
      spwidth = font.space_width()
      self.direction = 1
-      w = 0
+      (dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
      dx = 0
      prev = ' '
      for (char,cid,t) in chars:
        if char:
          if prev != ' ' and spwidth < dx:
            self.text += ' '
          prev = char
          self.text += char
          dx = 0
          w += (font.char_width(cid) * fontsize + charspace) * scaling
        else:
          t *= .001
          dx -= t
          w -= t * fontsize * scaling
      (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
      ty += descent
-      (w,h) = apply_matrix_norm(self.matrix, (w,size))
+      self.adv = (dx, 0)
-      self.adv = (w, 0)
+      self.bbox = (tx, ty, tx+dx, ty+dy)
      self.bbox = (tx, ty, tx+w, ty+h)
    else:
      # vertical text
      self.direction = 2
-      disp = 0
+      (_,cid) = chars[0]
-      h = 0
+      (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
-      for (char,cid,disp) in chars:
+      (dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
-        if not char: continue
+      tx -= dx/2
        (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
        self.text += font.to_unicode(cid)
        h += (font.char_width(cid) * fontsize + charspace) * scaling
        break
      for (char,cid,_) in chars[1:]:
        if not char: continue
        self.text += font.to_unicode(cid)
        h += (font.char_width(cid) * fontsize + charspace) * scaling
      (w,h) = apply_matrix_norm(self.matrix, (size,h))
      tx -= w/2
      ty += disp
-      self.adv = (0, h)
+      self.adv = (0, dy)
-      self.bbox = (tx, ty+h, tx+w, ty)
+      self.bbox = (tx, ty+dy, tx+dx, ty)
    self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
    return
  def __repr__(self):
-    return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
+    return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
-            (self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
+            (matrix2str(self.matrix), self.font, self.fontsize,
             rect2str(self.bbox), self.text, point2str(self.adv)))
 ##  PDFPageAggregator
 ##
 class PDFPageAggregator(PDFDevice):
-  def __init__(self, rsrc, pageno=1, splitwords=False):
+  def __init__(self, rsrc, pageno=1):
    PDFDevice.__init__(self, rsrc)
    self.pageno = pageno
    self.splitwords = splitwords
    self.stack = []
    return
@@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice):
      print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
    return None
  def render_chars(self, textmatrix, textstate, chars):
    if not chars: return (0, 0)
    item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
    self.cur_item.add(item)
    return item.adv
  def render_string(self, textstate, textmatrix, seq):
    font = textstate.font
    textmatrix = mult_matrix(textmatrix, self.ctm)
    chars = []
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
-        chars.append((None, None, x))
+        (dx,dy) = self.render_chars(textmatrix, textstate, chars)
        dx -= x * textstate.scaling * .0001
        textmatrix = translate_matrix(textmatrix, (dx, dy))
        chars = []
      else:
        for cid in font.decode(x):
          try:
@@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice):
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
            char = self.handle_undefined_char(cidcoding, cid)
-          chars.append((char, cid, font.char_disp(cid)))
+          chars.append((char, cid))
-    textmatrix = mult_matrix(textmatrix, self.ctm)
+          if cid == 32 and not font.is_multibyte():
-    word = []
+            (dx,dy) = self.render_chars(textmatrix, textstate, chars)
-    for (char, cid, disp) in chars:
+            dx += textstate.wordspace * textstate.scaling * .01
-      word.append((char,cid,disp))
+            textmatrix = translate_matrix(textmatrix, (dx, dy))
-      if self.splitwords and cid == 32 and not font.is_multibyte():
+            chars = []
-        if word:
+    self.render_chars(textmatrix, textstate, chars)
          item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
          self.cur_item.add(item)
          (dx,dy) = item.adv
          dx += textstate.wordspace * textstate.scaling * .01
          textmatrix = translate_matrix(textmatrix, (dx, dy))
          word = []
    if word:
      item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
      self.cur_item.add(item)
    return
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
  '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
  return (a*p+c*q, b*p+d*q)
 # display functions
 def matrix2str((a,b,c,d,e,f)):
  return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f)
 def rect2str((x0,y0,x1,y1)):
  return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1)
 def point2str((x,y)):
  return '(%.1f, %.1f)' % (x,y)
 ##  Utilities
 ##
--- a/tools/prof.py
+++ b/tools/prof.py
@@ -5,15 +5,16 @@ def prof_main(argv):
  import getopt
  import hotshot, hotshot.stats
  def usage():
-    print 'usage: %s output.prof mod.func [args ...]' % argv[0]
+    print 'usage: %s module.function [args ...]' % argv[0]
    return 100
  args = argv[1:]
-  if len(args) < 2: return usage()
+  if len(args) < 1: return usage()
  prof = args.pop(0)
  name = args.pop(0)
  prof = name+'.prof'
  i = name.rindex('.')
  (modname, funcname) = (name[:i], name[i+1:])
-  func = getattr(__import__(modname, fromlist=[modname]), funcname)
+  module = __import__(modname, fromlist=1)
  func = getattr(module, funcname)
  if args:
    args.insert(0, argv[0])
    prof = hotshot.Profile(prof)