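Improve wordspace (Tw) handling.

For single-byte fonts, render_string now splits a string into a separate
TextItem at each space (code 32) and advances the text matrix by the
item's width plus the scaled word spacing. TextItem computes its own
width from charspace and scaling instead of receiving a precomputed
width, and do_TJ adds word spacing only when the font is not multibyte.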

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-12-25 15:09:54 +00:00 · 2008-12-25 15:09:54 +00:00 · 71be16febe
parent 33f709a0d8
commit 71be16febe
6 changed files with 108 additions and 48 deletions
--- a/pdflib/page.py
+++ b/pdflib/page.py
@ -3,7 +3,7 @@ import sys
 stdout = sys.stdout
 stderr = sys.stderr
 from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
-     mult_matrix, apply_matrix
+     mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix


 ##  PageItem
@ -37,47 +37,73 @@ class FigureItem(PageItem):
 ##
 class TextItem(object):
  
-  def __init__(self, matrix, font, fontsize, width, text):
+  SPACE_WIDTH = 0.6
+  
+  def __init__(self, matrix, font, fontsize, charspace, scaling, text):
    self.matrix = matrix
    self.font = font
-    (a,b,c,d,tx,ty) = self.matrix
+    (_,_,_,_,tx,ty) = self.matrix
    self.origin = (tx,ty)
    self.direction = 0
+    self.text = ''
    if not self.font.is_vertical():
+      spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
      self.direction = 1
-      (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
-      self.width = abs(self.width)
-      (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
-      (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
+      (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
      ty += descent
-      self.bbox = (tx, ty, tx+self.width, ty+self.height)
+      w = 0
+      dx = 0
+      prev = ' '
+      for t in text:
+        if isinstance(t, tuple):
+          if prev != ' ' and spwidth < dx:
+            self.text += ' '
+          (_,char) = t
+          self.text += char
+          prev = char
+          dx = 0
+          w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+        else:
+          dx -= t
+          w += t * fontsize * .001 * scaling * .01
+      self.adv = (w, 0)
+      (w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
+      self.bbox = (tx, ty, tx+w, ty+h)
    else:
      self.direction = 2
-      (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width))
-      self.width = abs(self.width)
-      (disp,_) = text[0]
-      (_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001))
-      tx -= self.width/2
+      disp = 0
+      h = 0
+      for t in text:
+        if isinstance(t, tuple):
+          (disp,char) = t
+          (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
+          self.text += char
+          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+          break
+      for t in text:
+        if isinstance(t, tuple):
+          (_,char) = t
+          self.text += char
+          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+      self.adv = (0, h)
+      (w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
+      tx -= w/2
      ty += disp
-      self.bbox = (tx, ty+self.height, tx+self.width, ty)
-    self.text = ''.join( c for (_,c) in text )
-    (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
-    self.fontsize = max(w,h)
+      self.bbox = (tx, ty+h, tx+w, ty)
+    self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
    return
  
  def __repr__(self):
-    return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
-            (self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
+    return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
+            (self.matrix, self.font, self.fontsize, self.bbox, self.text))


-##  TextConverter
+##  PageAggregator
 ##
-class TextConverter(PDFDevice):
+class PageAggregator(PDFDevice):

-  def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
+  def __init__(self, rsrc, debug=0):
    PDFDevice.__init__(self, rsrc, debug=debug)
-    self.outfp = outfp
-    self.codec = codec
    self.pageno = 0
    self.stack = []
    return
@ -109,14 +135,12 @@ class TextConverter(PDFDevice):
      print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
    return None

-  def render_string(self, textstate, textmatrix, size, seq, ratio=0.6):
+  def render_string(self, textstate, textmatrix, seq):
    font = textstate.font
-    spwidth = int(-font.char_width(32) * ratio) # space width
    text = []
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
-        if not font.is_vertical() and x <= spwidth:
-          text.append((0, ' '))
+        text.append(x)
      else:
        chars = font.decode(x)
        for cid in chars:
@ -125,11 +149,20 @@ class TextConverter(PDFDevice):
            text.append((font.char_disp(cid), char))
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
-            s = self.handle_undefined_char(cidcoding, cid)
-            if s:
-              text.append(s)
+            unc = self.handle_undefined_char(cidcoding, cid)
+            if unc:
+              text.append(unc)
+          if cid == 32 and not font.is_multibyte():
+            if text:
+              item = TextItem(mult_matrix(textmatrix, self.ctm),
+                              font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+              self.cur_item.add(item)
+              (dx,dy) = item.adv
+              dx += textstate.wordspace * textstate.scaling * .01
+              textmatrix = translate_matrix(textmatrix, (dx, dy))
+              text = []
    if text:
      item = TextItem(mult_matrix(textmatrix, self.ctm),
-                      font, textstate.fontsize, size, text)
+                      font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
      self.cur_item.add(item)
    return
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -2,11 +2,11 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
+from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
+from pdfinterp import PDFDevice, PDFResourceManager, \
     PDFPageInterpreter, PDFUnicodeNotDefined
-from pdflib.cmap import CMapDB
-from pdflib.page import PageItem, FigureItem, TextItem, TextConverter
+from cmap import CMapDB
+from page import PageItem, FigureItem, TextItem, PageAggregator


 def enc(x, codec):
@ -18,6 +18,16 @@ def encprops(props, codec):
  return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )


+##  TextConverter
+class TextConverter(PageAggregator):
+  
+  def __init__(self, rsrc, outfp, codec='ascii', debug=0):
+    PageAggregator.__init__(self, rsrc, debug=debug)
+    self.outfp = outfp
+    self.codec = codec
+    return
+  
+  
 ##  SGMLConverter
 ##
 class SGMLConverter(TextConverter):
@ -156,7 +166,7 @@ class TagExtractor(PDFDevice):
 # pdf2txt
 class TextExtractionNotAllowed(RuntimeError): pass

-def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
+def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
  doc = PDFDocument(debug=debug)
  fp = file(fname, 'rb')
  parser = PDFParser(doc, fp, debug=debug)
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
          a0*c1+c0*d1,    b0*c1+d0*d1,
          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)

+def translate_matrix((a,b,c,d,e,f), (x,y)):
+  return (a,b,c,d,e+x,f+y)
+  
 def apply_matrix((a,b,c,d,e,f), (x,y)):
  '''Applies a matrix to coordinates.'''
  return (a*x+c*y+e, b*x+d*y+f)

+def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
+  return (a*x+c*y, b*x+d*y)
+

 ##  Fonts
 ##
@ -103,6 +109,9 @@ class PDFFont(object):
  def is_vertical(self):
    return False
  
+  def is_multibyte(self):
+    return False
+  
  def decode(self, bytes):
    return map(ord, bytes)

@ -373,6 +382,9 @@ class PDFCIDFont(PDFFont):
  def is_vertical(self):
    return self.vertical

+  def is_multibyte(self):
+    return True
+  
  def decode(self, bytes):
    return self.cmap.decode(bytes)

@ -498,7 +510,7 @@ class PDFDevice(object):
  def end_figure(self, name):
    return
  
-  def render_string(self, textstate, textmatrix, size, seq):
+  def render_string(self, textstate, textmatrix, seq):
    raise NotImplementedError
  def render_image(self, stream, size, matrix):
    raise NotImplementedError
@ -928,15 +940,16 @@ class PDFPageInterpreter(object):
  def do_TJ(self, seq):
    #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
    textstate = self.textstate
+    matrix = translate_matrix(textstate.matrix, textstate.linematrix)
+    self.device.render_string(textstate, matrix, seq)
    font = textstate.font
-    (a,b,c,d,e,f) = textstate.matrix
-    (lx,ly) = textstate.linematrix
    s = ''.join( x for x in seq if isinstance(x, str) )
    n = sum( x for x in seq if not isinstance(x, str) )
-    w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
-         len(s) * textstate.charspace +
-         s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
-    self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
+    w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
+    if not font.is_multibyte():
+      w += s.count(' ')*textstate.wordspace
+    w *= (textstate.scaling * .01)
+    (lx,ly) = textstate.linematrix
    if font.is_vertical():
      ly += w
    else:
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -586,7 +586,7 @@ class PDFDocument(object):
        self.parser.seek(index)
        (_,objid1) = self.parser.nexttoken() # objid
        (_,genno) = self.parser.nexttoken() # genno
-        assert objid1 == objid, (objid, objid1)
+        #assert objid1 == objid, (objid, objid1)
        (_,kwd) = self.parser.nexttoken()
        if kwd != KEYWORD_OBJ:
          raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
--- a/samples/simple1.pdf
+++ b/samples/simple1.pdf
@ -32,11 +32,15 @@ endobj
 >>
 endobj
 5 0 obj
-<< /Length 46 >>
+<< /Length 86 >>
 stream
 BT
 /F1 24 Tf
-1 0 0 1 100 700 TD
+100 600 Td
+0 Tw
+( Hello World ) Tj
+0 100 Td
+100 Tw
 ( Hello World ) Tj
 ET
 endstream
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None):
        dumpxml(out, obj, codec=codec)
        out.write('\n</object>\n\n')
      except:
-        pass
+        raise
  dumptrailers(out, doc)
  out.write('</pdf>')
  return