git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@79 1aa58f4a-7d42-0410-adbc-911cccaed67c

2009-03-29 13:21:21 +00:00 · 2009-03-29 13:21:21 +00:00 · 68cc99379d
parent c5991f74ff
commit 68cc99379d
3 changed files with 46 additions and 22 deletions
--- a/README.html
+++ b/README.html
@@ -17,7 +17,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Wed Mar 25 20:59:18 JST 2009
+Last Modified: Sun Mar 29 19:09:46 JST 2009
 <!-- hhmts end -->
 </div>

@@ -202,6 +202,7 @@ This makes the word spacing correctly handled.
 <dd> Specifies the output format. The following formats are currently supported.
 <ul>
 <li> <code>html</code> : HTML format. (Default)
+<li> <code>text</code> : TEXT format.
 <li> <code>sgml</code> : SGML format.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
--- a/pdflib/cluster.py
+++ b/pdflib/cluster.py
@@ -59,9 +59,9 @@ class Plane(object):
    return objs


-##  Clusters
+##  ClusterSet
 ##
-class Clusters(object):
+class ClusterSet(object):

  def __init__(self):
    self.clusters = {}
@@ -86,11 +86,12 @@ class Clusters(object):


 def cluster_pageobjs(objs, ratio):
+  idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
  plane = Plane()
  for obj in objs:
    plane.add(obj.bbox, obj)
  plane.finish()
-  clusters = Clusters()
+  cset = ClusterSet()
  for obj in objs:
    (bx0,by0,bx1,by1) = obj.bbox
    margin = abs(obj.fontsize * ratio)
@@ -100,17 +101,26 @@ def cluster_pageobjs(objs, ratio):
    y1 = max(by0,by1)
    found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
    if len(found) == 1:
-      clusters.add(found.pop())
+      cset.add(found.pop())
    else:
-      clusters.merge(found)
+      cset.merge(found)
+  clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
  r = []
-  for objs in clusters.finish():
+  for objs in clusters:
+    objs = sorted(objs, key=lambda obj: idx[obj])
+    h = v = 0
    (bx0,by0,bx1,by1) = objs[0].bbox
+    (lx0,ly0,_,_) = objs[0].bbox
    for obj in objs[1:]:
      (x0,y0,x1,y1) = obj.bbox
+      if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
+        v += 1
+      else:
+        h += 1
+      (lx0,ly0) = (x0,y0)
      bx0 = min(bx0, x0)
      bx1 = max(bx1, x1)
      by0 = min(by0, y0)
      by1 = max(by1, y1)
-    r.append(((bx0,by0,bx1,by1), objs))
+    r.append(((bx0,by0,bx1,by1), h < v, objs))
  return r
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -2,7 +2,7 @@
 import sys
 from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
 from pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
+from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
 from pdffont import PDFUnicodeNotDefined
 from cmap import CMapDB

@@ -15,6 +15,15 @@ def encprops(props, codec):
  if not props: return ''
  return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )

+def get_textobjs(item, r=None):
+  if r == None: r = []
+  if isinstance(item, TextItem):
+    r.append(item)
+  elif isinstance(item, PageItem):
+    for child in item.objs:
+      get_textobjs(child, r)
+  return r
+

 ##  PDFConverter
 class PDFConverter(PDFPageAggregator):
@@ -73,7 +82,8 @@ class HTMLConverter(PDFConverter):
    page = PDFConverter.end_page(self, page)
    def f(item):
      if isinstance(item, FigureItem):
-        pass
+        for child in item.objs:
+          f(child)
      elif isinstance(item, TextItem):
        if item.direction == 2:
          wmode = 'tb-rl'
@@ -95,8 +105,8 @@ class HTMLConverter(PDFConverter):
    for child in page.objs:
      f(child)
    if self.cluster_margin:
-      textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
-      for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
+      clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
+      for ((x0,y0,x1,y1),_,objs) in clusters:
        self.outfp.write('<span style="position:absolute; border: 1px solid red; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 
                       (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
@@ -114,7 +124,7 @@ class HTMLConverter(PDFConverter):
 ##
 class TextConverter(PDFConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
    PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
    self.pagenum = pagenum
    self.cluster_margin = cluster_margin
@@ -125,15 +135,18 @@ class TextConverter(PDFConverter):
    from cluster import cluster_pageobjs
    page = PDFConverter.end_page(self, page)
    if self.cluster_margin:
-      textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
-      idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
+      textobjs = get_textobjs(page)
      clusters = cluster_pageobjs(textobjs, self.cluster_margin)
-      clusters.sort(key=lambda (_,objs): idx[objs[0]])
-      for (_,objs) in clusters:
-        for item in sorted(objs, key=lambda obj:idx[obj]):
-          text = item.text
-          self.outfp.write(text.encode(self.codec, 'replace'))
-        self.outfp.write('\n')
+      for (_,vertical,objs) in clusters:
+        for (i,item) in enumerate(objs):
+          (x0,y0,x1,y1) = item.bbox
+          if (i and
+              ((not vertical and (y1 < ly0 or ly1 < y0)) or
+               (vertical and (x1 < lx0 or lx1 < x0)))):
+            self.outfp.write('\n')
+          (lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
+          self.outfp.write(item.text.encode(self.codec, 'replace'))
+        self.outfp.write('\n\n')
    else:
      for item in page.objs:
        if isinstance(item, TextItem):
@@ -243,7 +256,7 @@ def main(argv):
  codec = 'ascii'
  pagenos = set()
  maxpages = 0
-  outtype = 'text'
+  outtype = 'html'
  password = ''
  splitwords = False
  outfp = sys.stdout