layout analysis improved.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@93 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-05-04 08:29:36 +00:00 · 2009-05-04 08:29:36 +00:00 · fd27d16acc
parent 43e5c05307
commit fd27d16acc
5 changed files with 285 additions and 120 deletions
--- a/pdflib/cluster.py
+++ b/pdflib/cluster.py
@ -1,11 +1,18 @@
 #!/usr/bin/env python
 import sys
+from pdfdevice import PageItem
+from utils import pick
+INF = sys.maxint


-##  binary search
+##  bsearch
+##
+##  Finds objects whose coordinates overlap with [v0,v1].
+##  It performs binary search so that the processing time
+##  should be around O(log n).
 ##
 def bsearch(objs, v0, v1):
-  assert v0 <= v1
+  if v1 <= v0: return []
  i0 = 0
  i1 = len(objs)-1
  while i0 <= i1:
@ -31,27 +38,79 @@ def bsearch(objs, v0, v1):
  return []


+##  reorder_hv, reorder_vh
+##
+##  Reorders objects according to their writing direction.
+##
+def reorder_hv(objs, hdir):
+  if 0 < hdir:
+    hkey = (lambda obj: obj.x0)
+  else:
+    hkey = (lambda obj: -obj.x1)
+  vkey = (lambda obj: -obj.y1)
+  r = []
+  line = []
+  for obj1 in sorted(objs, key=vkey):
+    if line and not line[-1].voverlap(obj1):
+      line.sort(key=hkey)
+      r.append(line)
+      line = []
+    line.append(obj1)
+  line.sort(key=hkey)
+  r.append(line)
+  return r
+
+def reorder_vh(objs, hdir):
+  if 0 < hdir:
+    hkey = (lambda obj: obj.x0)
+  else:
+    hkey = (lambda obj: -obj.x1)
+  vkey = (lambda obj: -obj.y1)
+  r = []
+  line = []
+  for obj1 in sorted(objs, key=hkey):
+    if line and not line[-1].hoverlap(obj1):
+      line.sort(key=vkey)
+      r.append(line)
+      line = []
+    line.append(obj1)
+  line.sort(key=vkey)
+  r.append(line)
+  return r
+
+
 ##  Plane
 ##
+##  A data structure for objects placed on a plane.
+##  Can efficiently find objects in a certain rectangular area.
+##  It maintains two parallel lists of objects, each of
+##  which is sorted by its x or y coordinate.
+##
 class Plane(object):

-  def __init__(self):
+  def __init__(self, objs):
    self.xobjs = []
    self.yobjs = []
+    for obj in objs:
+      self.place(obj)
+    self.fixate()
    return

-  def add(self, (x0,y0,x1,y1), obj):
-    self.xobjs.append((x0, obj))
-    self.xobjs.append((x1, obj))
-    self.yobjs.append((y0, obj))
-    self.yobjs.append((y1, obj))
+  # place(obj): place an object in a certain area.
+  def place(self, obj):
+    self.xobjs.append((obj.x0, obj))
+    self.xobjs.append((obj.x1, obj))
+    self.yobjs.append((obj.y0, obj))
+    self.yobjs.append((obj.y1, obj))
    return

-  def finish(self):
+  # fixate(): you must call this after adding all objects.
+  def fixate(self):
    self.xobjs.sort()
    self.yobjs.sort()
    return

+  # find(): finds objects that are in a certain area.
  def find(self, (x0,y0,x1,y1)):
    xobjs = set(bsearch(self.xobjs, x0, x1))
    yobjs = set(bsearch(self.yobjs, y0, y1))
@ -59,68 +118,127 @@ class Plane(object):
    return objs


+##  TextBox
+##
+##  A set of text objects that are clustered in
+##  a certain rectangular area.
+##
+class TextBox(PageItem):
+
+  def __init__(self, objs):
+    self.objs = set(objs)
+    self.vertical = False
+    self.length = None
+    return
+
+  def __repr__(self):
+    return ('<textbox %s %s items=%d>' % (self.bbox(), self.vertical, len(self.objs)))
+
+  def __len__(self):
+    return self.length
+  
+  # merge(box): merges with another textbox.
+  def merge(self, box):
+    self.objs.update(box.objs)
+    return
+
+  # finish(): determines its boundary and writing direction.
+  def finish(self):
+    assert self.objs
+    (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
+    for obj in self.objs:
+      bx0 = min(bx0, obj.x0)
+      by0 = min(by0, obj.y0)
+      bx1 = max(bx1, obj.x1)
+      by1 = max(by1, obj.y1)
+    PageItem.__init__(self, (bx0, by0, bx1, by1))
+    self.length = sum( len(obj) for obj in self.objs )
+    for obj in self.objs:
+      self.vertical = obj.vertical
+      break
+    if 2 <= len(self.objs):
+      objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
+      if len(objs[0]) == 1 and len(objs[1]) == 1:
+        h = objs[0].voverlap(objs[1])
+        v = objs[0].hoverlap(objs[1])
+        self.vertical = (h < v)
+    return
+
+  def lines(self, ratio):
+    if self.vertical:
+      objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
+      for line in reorder_vh(objs, -1):
+        s = ''
+        y0 = -INF
+        for obj in line:
+          margin = abs(obj.fontsize * ratio)
+          if obj.y1 < y0-margin:
+            s += ' '
+          s += obj.text
+          y0 = obj.y0
+        yield s
+    else:
+      objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
+      for line in reorder_hv(objs, +1):
+        s = ''
+        x1 = INF
+        for obj in line:
+          margin = abs(obj.fontsize * ratio)
+          if x1+margin < obj.x0:
+            s += ' '
+          s += obj.text
+          x1 = obj.x1
+        yield s
+    return
+
+
 ##  ClusterSet
 ##
+##  Maintains a set of TextBox objects.
+##  It incrementally constructs TextBox objects
+##  and groups them when necessary. It gives
+##  a sequence of TextBox objects that represent
+##  the text stream of that page.
+##
 class ClusterSet(object):

  def __init__(self):
    self.clusters = {}
    return

-  def add(self, obj):
-    self.clusters[obj] = (obj,)
-    return
-
-  def merge(self, objs):
-    allobjs = set(objs)
+  # add(objs): groups text objects if necessary.
+  def add(self, objs):
+    c = TextBox(objs)
    for obj in objs:
      if obj in self.clusters:
-        allobjs.update(self.clusters[obj])
-    c = tuple(allobjs)
-    for obj in allobjs:
+        c.merge(self.clusters[obj])
+    for obj in c.objs:
      self.clusters[obj] = c
    return

+  # finish(): returns all the TextBoxes in a page.
  def finish(self):
-    return set(self.clusters.itervalues())
+    r = set(self.clusters.itervalues())
+    for textbox in r:
+      textbox.finish()
+    return r

-
-def cluster_pageobjs(objs, ratio):
-  idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
-  plane = Plane()
-  for obj in objs:
-    plane.add(obj.bbox, obj)
-  plane.finish()
+# cluster_textobjs
+def cluster_textobjs(objs, ratio):
+  plane = Plane(objs)
  cset = ClusterSet()
  for obj in objs:
-    (bx0,by0,bx1,by1) = obj.bbox
    margin = abs(obj.fontsize * ratio)
-    x0 = min(bx0,bx1)
-    y0 = min(by0,by1)
-    x1 = max(bx0,bx1)
-    y1 = max(by0,by1)
-    found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
-    if len(found) == 1:
-      cset.add(found.pop())
-    else:
-      cset.merge(found)
-  clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
+    neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
+    cset.add(neighbors)
+  clusters = cset.finish()
+  vertical = ((sum( len(textbox) for textbox in clusters )/2) <
+              sum( len(textbox) for textbox in clusters if textbox.vertical ))
+  if vertical:
+    lines = reorder_hv(clusters, -1)
+  else:
+    lines = reorder_vh(clusters, +1)
  r = []
-  for objs in clusters:
-    objs = sorted(objs, key=lambda obj: idx[obj])
-    h = v = 0
-    (bx0,by0,bx1,by1) = objs[0].bbox
-    (lx0,ly0,_,_) = objs[0].bbox
-    for obj in objs[1:]:
-      (x0,y0,x1,y1) = obj.bbox
-      if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
-        v += 1
-      else:
-        h += 1
-      (lx0,ly0) = (x0,y0)
-      bx0 = min(bx0, x0)
-      bx1 = max(bx1, x1)
-      by0 = min(by0, y0)
-      by1 = max(by1, y1)
-    r.append(((bx0,by0,bx1,by1), h < v, objs))
+  for line in lines:
+    r.extend(line)
  return r
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -2,7 +2,7 @@
 import sys
 from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
 from pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
+from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator
 from pdffont import PDFUnicodeNotDefined
 from cmap import CMapDB

@ -19,7 +19,7 @@ def get_textobjs(item, r=None):
  if r == None: r = []
  if isinstance(item, TextItem):
    r.append(item)
-  elif isinstance(item, PageItem):
+  elif isinstance(item, Page):
    for child in item.objs:
      get_textobjs(child, r)
  return r
@ -49,8 +49,8 @@ class SGMLConverter(PDFConverter):
          f(child)
        self.outfp.write('</figure>\n')
      elif isinstance(item, TextItem):
-        self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
-                         (enc(item.font.fontname, self.codec), item.direction, bbox, item.fontsize))
+        self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
+                         (enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize))
        self.outfp.write(enc(item.text, self.codec))
        self.outfp.write('</text>\n')
    bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox
@ -79,42 +79,45 @@ class HTMLConverter(PDFConverter):
    return
  
  def end_page(self, page):
-    from cluster import cluster_pageobjs
+    from cluster import cluster_textobjs
    page = PDFConverter.end_page(self, page)
-    (x0,y0,x1,y1) = page.bbox
-    self.yoffset += y1
+    self.yoffset += page.y1
    if self.pagenum:
      self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' % 
-                       ((self.yoffset-y1)*self.scale, page.id, page.id))
+                       ((self.yoffset-page.y1)*self.scale, page.id, page.id))
    self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
                     'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
-                     (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
+                     (page.x0*self.scale, (self.yoffset-page.y1)*self.scale,
+                      page.width*self.scale, page.height*self.scale))
    def draw(item):
      if isinstance(item, FigureItem):
        for child in item.objs:
          draw(child)
      elif isinstance(item, TextItem):
-        if item.direction == 2:
+        if item.vertical:
          wmode = 'tb-rl'
        else:
          wmode = 'lr-tb'
-        (x0,y0,x1,y1) = item.bbox
-        self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
-                         (wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
+        self.outfp.write('<span style="position:absolute; writing-mode:%s;'
+                         ' left:%dpx; top:%dpx; font-size:%dpx;">' %
+                         (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
+                          item.fontsize*self.scale))
        self.outfp.write(enc(item.text, self.codec))
        self.outfp.write('</span>\n')
        if self.show_text_border:
          self.outfp.write('<span style="position:absolute; border: 1px solid red; '
                           'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
-                           (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
+                           (item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
+                            item.width*self.scale, self.height*self.scale))
    for child in page.objs:
      draw(child)
    if self.cluster_margin:
-      clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
-      for ((x0,y0,x1,y1),_,objs) in clusters:
+      clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin)
+      for textbox in clusters:
        self.outfp.write('<span style="position:absolute; border: 1px solid red; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
-                       (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
+                       (textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale,
+                        textbox.width*self.scale, textbox.height*self.scale))
    self.yoffset += self.pagepad
    return

@ -135,30 +138,25 @@ class TextConverter(PDFConverter):
    if cluster_margin == None:
      cluster_margin = 0.5
    self.cluster_margin = cluster_margin
+    self.word_margin = 0.2
    return
  
  def end_page(self, page):
-    from cluster import cluster_pageobjs
+    from cluster import cluster_textobjs
    page = PDFConverter.end_page(self, page)
    if self.pagenum:
      self.outfp.write('Page %d\n' % page.id)
    if self.cluster_margin:
      textobjs = get_textobjs(page)
-      clusters = cluster_pageobjs(textobjs, self.cluster_margin)
-      for (_,vertical,objs) in clusters:
-        for (i,item) in enumerate(objs):
-          (x0,y0,x1,y1) = item.bbox
-          if (i and
-              ((not vertical and (y1 < ly0 or ly1 < y0)) or
-               (vertical and (x1 < lx0 or lx1 < x0)))):
-            self.outfp.write('\n')
-          (lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
-          self.outfp.write(item.text.encode(self.codec, 'replace'))
-        self.outfp.write('\n\n')
+      clusters = cluster_textobjs(textobjs, self.cluster_margin)
+      for textbox in clusters:
+        for line in textbox.lines(self.word_margin):
+          self.outfp.write(line.encode(self.codec, 'replace')+'\n')
+        self.outfp.write('\n')
    else:
-      for item in page.objs:
-        if isinstance(item, TextItem):
-          self.outfp.write(item.text.encode(self.codec, 'replace'))
+      for obj in page.objs:
+        if isinstance(obj, TextItem):
+          self.outfp.write(obj.text.encode(self.codec, 'replace'))
          self.outfp.write('\n')
    self.outfp.write('\f')
    return
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@ -52,74 +52,119 @@ class PDFDevice(object):
    return


-##  PageItem
+##  Page
 ##
 class PageItem(object):
+
+  def __init__(self, (x0,y0,x1,y1)):
+    #assert x0 <= x1 and y0 <= y1
+    self.x0 = x0
+    self.y0 = y0
+    self.x1 = x1
+    self.y1 = y1
+    self.width = x1-x0
+    self.height = y1-y0
+    return
+
+  def __repr__(self):
+    return ('<pageitem bbox=%s>' % (self.bbox()))
  
-  def __init__(self, id, (x0,y0,x1,y1), rotate=0):
-    self.id = id
-    self.bbox = (x0, y0, x1, y1)
-    self.rotate = rotate
+  def bbox(self):
+    return rect2str((self.x0, self.y0, self.x1, self.y1))
+  
+  def hoverlap(self, obj):
+    assert isinstance(obj, PageItem)
+    if self.x1 <= obj.x0 or obj.x1 <= self.x0:
+      return 0
+    else:
+      return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
+
+  def voverlap(self, obj):
+    assert isinstance(obj, PageItem)
+    if self.y1 <= obj.y0 or obj.y1 <= self.y0:
+      return 0
+    else:
+      return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
+  
+  
+class PageContainer(PageItem):
+  
+  def __init__(self, bbox):
+    PageItem.__init__(self, bbox)
    self.objs = []
    return
  
-  def __repr__(self):
-    return ('<page id=%r bbox=%r rotate=%r>' % (self.id, self.bbox, self.rotate))
-  
  def add(self, obj):
    self.objs.append(obj)
    return
+  
+class Page(PageContainer):
+  
+  def __init__(self, id, bbox, rotate=0):
+    PageContainer.__init__(self, bbox)
+    self.id = id
+    self.rotate = rotate
+    return
+  
+  def __repr__(self):
+    return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.bbox(), self.rotate))


 ##  FigureItem
 ##
-class FigureItem(PageItem):
+class FigureItem(PageContainer):
+  
+  def __init__(self, id, bbox):
+    PageContainer.__init__(self, bbox)
+    self.id = id
+    return
  
  def __repr__(self):
-    return ('<figure id=%r bbox=%r>' % (self.id, self.bbox))
+    return ('<figure id=%r bbox=%s>' % (self.id, self.bbox()))
  

 ##  TextItem
 ##
-class TextItem(object):
+class TextItem(PageItem):
  
  def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
+    assert chars
    self.matrix = matrix
    self.font = font
    (_,_,_,_,tx,ty) = self.matrix
-    self.direction = 0
-    self.text = ''
-    adv = 0
-    for (char,cid) in chars:
-      self.text += char
-      adv += font.char_width(cid)
+    self.vertical = self.font.is_vertical()
+    self.text = ''.join( char for (char,_) in chars )
+    adv = sum( font.char_width(cid) for (_,cid) in chars )
    adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
    size = (font.get_ascent() - font.get_descent()) * fontsize
-    if not self.font.is_vertical():
+    if not self.vertical:
      # horizontal text
-      self.direction = 1
+      self.vertical = False
      (dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
      (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
      ty += descent
      self.adv = (dx, 0)
-      self.bbox = (tx, ty, tx+dx, ty+dy)
+      bbox = (tx, ty, tx+dx, ty+dy)
    else:
      # vertical text
-      self.direction = 2
      (_,cid) = chars[0]
      (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
      (dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
      tx -= dx/2
      ty += disp
      self.adv = (0, dy)
-      self.bbox = (tx, ty+dy, tx+dx, ty)
+      bbox = (tx, ty+dy, tx+dx, ty)
    self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
+    PageItem.__init__(self, bbox)
    return
+
+  def __len__(self):
+    return len(self.text)
  
  def __repr__(self):
-    return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
-            (matrix2str(self.matrix), self.font, self.fontsize,
-             rect2str(self.bbox), self.text, point2str(self.adv)))
+    return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
+            (matrix2str(self.matrix), self.font, self.fontsize, self.bbox(),
+             point2str(self.adv), self.text))


 ##  PDFPageAggregator
@ -133,7 +178,7 @@ class PDFPageAggregator(PDFDevice):
    return

  def begin_page(self, page):
-    self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)
+    self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
    return
  
  def end_page(self, _):
@ -177,7 +222,8 @@ class PDFPageAggregator(PDFDevice):
  
  def render_chars(self, textmatrix, textstate, chars):
    if not chars: return (0, 0)
-    item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
+    item = TextItem(textmatrix, textstate.font, textstate.fontsize,
+                    textstate.charspace, textstate.scaling, chars)
    self.cur_item.add(item)
    return item.adv

@ -199,7 +245,7 @@ class PDFPageAggregator(PDFDevice):
            (cidcoding, cid) = e.args
            char = self.handle_undefined_char(cidcoding, cid)
          chars.append((char, cid))
-          if cid == 32 and not font.is_multibyte():
+          if textstate.wordspace and not font.is_multibyte() and cid == 32:
            (dx,dy) = self.render_chars(textmatrix, textstate, chars)
            dx += textstate.wordspace * textstate.scaling * .01
            textmatrix = translate_matrix(textmatrix, (dx, dy))
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@ -359,9 +359,6 @@ class PDFFont(object):
  def string_width(self, s):
    return sum( self.char_width(cid) for cid in self.decode(s) )

-  def space_width(self):
-    return max(self.char_width(32), self.char_width(44), self.char_width(46)) * 0.5
-

 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
@ -572,9 +569,6 @@ class PDFCIDFont(PDFFont):
    chars = unpack('>%dH' % (len(code)/2), code)
    return ''.join( unichr(c) for c in chars )

-  def space_width(self):
-    return 0
-

 # main
 def main(argv):
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@ -98,3 +98,12 @@ def decode_text(s):
    return unicode(s[2:], 'utf-16be', 'ignore')
  else:
    return ''.join( PDFDocEncoding[ord(c)] for c in s )
+
+##
+def pick(seq, func, maxobj=None):
+  maxscore = None
+  for obj in seq:
+    score = func(obj)
+    if maxscore == None or maxscore < score:
+      (maxscore,maxobj) = (score,obj)
+  return maxobj