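Add splitwords option (-w in pdf2txt.py): render_string now splits text into separate TextItems at cid 32 (space, single-byte fonts only) only when splitwords is set; previously it always split.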

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@72 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-03-20 11:00:14 +00:00 · 2009-03-20 11:00:14 +00:00 · 435d0553fa
parent b432a3f4ae
commit 435d0553fa
2 changed files with 31 additions and 26 deletions
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -21,8 +21,8 @@ def encprops(props, codec):
 ##  TextConverter
 class TextConverter(PDFPageAggregator):
  
-  def __init__(self, rsrc, outfp, codec='ascii'):
-    PDFPageAggregator.__init__(self, rsrc)
+  def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
+    PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
    self.outfp = outfp
    self.codec = codec
    return
@@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
 ##
 class HTMLConverter(TextConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
-    TextConverter.__init__(self, rsrc, outfp, codec=codec)
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False):
+    TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
    self.pagenum = pagenum
    self.pagepad = pagepad
    self.scale = scale
@@ -190,10 +190,10 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
 def main(argv):
  import getopt
  def usage():
-    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0]
+    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0]
    return 100
  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
+    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
@@ -205,6 +205,7 @@ def main(argv):
  maxpages = 0
  outtype = 'html'
  password = ''
+  splitwords = False
  outfp = stdout
  for (k, v) in opts:
    if k == '-d': debug += 1
@@ -216,6 +217,7 @@ def main(argv):
    elif k == '-D': cdbcmapdir = v
    elif k == '-t': outtype = v
    elif k == '-o': outfp = file(v, 'wb')
+    elif k == '-w': splitwords = True
  #
  CMapDB.debug = debug
  PDFResourceManager.debug = debug
@@ -226,11 +228,11 @@ def main(argv):
  CMapDB.initialize(cmapdir, cdbcmapdir)
  rsrc = PDFResourceManager()
  if outtype == 'sgml':
-    device = SGMLConverter(rsrc, outfp, codec)
+    device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
  elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec)
+    device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
  elif outtype == 'tag':
-    device = TagExtractor(rsrc, outfp, codec)
+    device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords)
  else:
    return usage()
  for fname in args:
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@@ -145,9 +145,10 @@ class TextItem(object):
 ##
 class PDFPageAggregator(PDFDevice):

-  def __init__(self, rsrc, pageno=1):
+  def __init__(self, rsrc, pageno=1, splitwords=False):
    PDFDevice.__init__(self, rsrc)
    self.pageno = pageno
+    self.splitwords = splitwords
    self.stack = []
    return

@@ -180,29 +181,31 @@ class PDFPageAggregator(PDFDevice):

  def render_string(self, textstate, textmatrix, seq):
    font = textstate.font
-    text = []
-    textmatrix = mult_matrix(textmatrix, self.ctm)
+    chars = []
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
-        text.append((None, None, x))
+        chars.append((None, None, x))
      else:
-        chars = font.decode(x)
-        for cid in chars:
+        for cid in font.decode(x):
          try:
            char = font.to_unicode(cid)
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
            char = self.handle_undefined_char(cidcoding, cid)
-          text.append((char, cid, font.char_disp(cid)))
-          if cid == 32 and not font.is_multibyte():
-            if text:
-              item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
-              self.cur_item.add(item)
-              (dx,dy) = item.adv
-              dx += textstate.wordspace * textstate.scaling * .01
-              textmatrix = translate_matrix(textmatrix, (dx, dy))
-              text = []
-    if text:
-      item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+          chars.append((char, cid, font.char_disp(cid)))
+    textmatrix = mult_matrix(textmatrix, self.ctm)
+    word = []
+    for (char, cid, disp) in chars:
+      word.append((char,cid,disp))
+      if self.splitwords and cid == 32 and not font.is_multibyte():
+        if word:
+          item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
+          self.cur_item.add(item)
+          (dx,dy) = item.adv
+          dx += textstate.wordspace * textstate.scaling * .01
+          textmatrix = translate_matrix(textmatrix, (dx, dy))
+          word = []
+    if word:
+      item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
      self.cur_item.add(item)
    return