various bugfixes

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@56 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-01-05 04:40:50 +00:00 · 2009-01-05 04:40:50 +00:00 · 24bdd33557
parent 71be16febe
commit 24bdd33557
12 changed files with 162 additions and 114 deletions
--- a/Makefile
+++ b/Makefile
@ -1,7 +1,7 @@
 # Makefile for pdfminer

 PACKAGE=pdfminer
-VERSION=20080906
+VERSION=20081228
 GNUTAR=tar
 SVN=svn
 PYTHON=python
--- a/README.html
+++ b/README.html
@ -14,7 +14,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Sep  6 13:52:10 JST 2008
+Last Modified: Sun Dec 28 20:11:59 JST 2008
 <!-- hhmts end -->
 </div>

@ -245,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2008/12/28: Better handling of word spacing.
 <li> 2008/09/06: A sample pdf2html webapp added.
 <li> 2008/08/30: ASCII85 encoding filter support.
 <li> 2008/07/27: Tagged contents extraction support.
--- a/pdflib/cmap.py
+++ b/pdflib/cmap.py
@ -2,8 +2,8 @@
 import sys
 stderr = sys.stderr
 from struct import pack, unpack
-from utils import choplist, nunpack
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from pdflib.utils import choplist, nunpack
+from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
     PSLiteral, PSKeyword, literal_name, keyword_name, \
     PSStackParser
 try:
@ -19,8 +19,9 @@ class CMapError(Exception): pass
 ##
 class CMap(object):

-  def __init__(self, debug=0):
-    self.debug = debug
+  debug = 0
+  
+  def __init__(self):
    self.code2cid = {}
    self.cid2code = {}
    self.attrs = {}
@ -90,8 +91,8 @@ class CMap(object):
 ##
 class CDBCMap(CMap):
  
-  def __init__(self, cdbname, debug=0):
-    CMap.__init__(self, debug=debug)
+  def __init__(self, cdbname):
+    CMap.__init__(self)
    self.cdbname = cdbname
    self.db = cdb.init(cdbname)
    return
@ -176,10 +177,9 @@ class CMapDB(object):
  cmapdb = {}

  @classmethod
-  def initialize(klass, dirname, cdbdirname=None, debug=0):
+  def initialize(klass, dirname, cdbdirname=None):
    klass.dirname = dirname
    klass.cdbdirname = cdbdirname or dirname
-    klass.debug = debug
    return

  @classmethod
@ -200,7 +200,7 @@ class CMapDB(object):
          print >>stderr, 'Reading: CMap %r...' % fname
        cmap = CMap()
        fp = file(fname, 'rb')
-        CMapParser(cmap, fp, debug=klass.debug).run()
+        CMapParser(cmap, fp).run()
        fp.close()
      elif not strict:
        cmap = CMap() # just create empty cmap
@ -214,8 +214,8 @@ class CMapDB(object):
 ##
 class CMapParser(PSStackParser):

-  def __init__(self, cmap, fp, debug=0):
-    PSStackParser.__init__(self, fp, debug=debug)
+  def __init__(self, cmap, fp):
+    PSStackParser.__init__(self, fp)
    self.cmap = cmap
    self.in_cmap = False
    return
--- a/pdflib/lzw.py
+++ b/pdflib/lzw.py
@ -6,9 +6,10 @@ stderr = sys.stderr
 ##
 class LZWDecoder(object):

-  def __init__(self, fp, debug=0):
+  debug = 0
+  
+  def __init__(self, fp):
    self.fp = fp
-    self.debug = debug
    self.buff = 0
    self.bpos = 8
    self.nbits = 9
@ -88,7 +89,8 @@ def main(argv):
  input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
  fp = StringIO.StringIO(input)
  expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
-  output = ''.join(LZWDecoder(fp, debug=1).run())
+  LZWDecoder.debug = 1
+  output = ''.join(LZWDecoder(fp).run())
  print (input, expected, output)
  print output == expected
  return 0
--- a/pdflib/page.py
+++ b/pdflib/page.py
@ -2,8 +2,8 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
-     mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
+from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
+from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix


 ##  PageItem
@ -46,6 +46,7 @@ class TextItem(object):
    self.origin = (tx,ty)
    self.direction = 0
    self.text = ''
+    scaling *= .01
    if not self.font.is_vertical():
      spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
      self.direction = 1
@ -62,12 +63,12 @@ class TextItem(object):
          self.text += char
          prev = char
          dx = 0
-          w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+          w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
        else:
          dx -= t
-          w += t * fontsize * .001 * scaling * .01
-      self.adv = (w, 0)
+          w += t * fontsize * .001 * scaling
      (w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
+      self.adv = (w, 0)
      self.bbox = (tx, ty, tx+w, ty+h)
    else:
      self.direction = 2
@ -78,33 +79,33 @@ class TextItem(object):
          (disp,char) = t
          (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
          self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
          break
      for t in text:
        if isinstance(t, tuple):
          (_,char) = t
          self.text += char
-          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
-      self.adv = (0, h)
+          h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
      (w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
      tx -= w/2
      ty += disp
+      self.adv = (0, h)
      self.bbox = (tx, ty+h, tx+w, ty)
    self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
    return
  
  def __repr__(self):
-    return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
-            (self.matrix, self.font, self.fontsize, self.bbox, self.text))
+    return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
+            (self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))


 ##  PageAggregator
 ##
 class PageAggregator(PDFDevice):

-  def __init__(self, rsrc, debug=0):
-    PDFDevice.__init__(self, rsrc, debug=debug)
-    self.pageno = 0
+  def __init__(self, rsrc, pageno=1):
+    PDFDevice.__init__(self, rsrc)
+    self.pageno = pageno
    self.stack = []
    return

@ -138,6 +139,7 @@ class PageAggregator(PDFDevice):
  def render_string(self, textstate, textmatrix, seq):
    font = textstate.font
    text = []
+    textmatrix = mult_matrix(textmatrix, self.ctm)
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
        text.append(x)
@ -154,15 +156,13 @@ class PageAggregator(PDFDevice):
              text.append(unc)
          if cid == 32 and not font.is_multibyte():
            if text:
-              item = TextItem(mult_matrix(textmatrix, self.ctm),
-                              font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+              item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
              self.cur_item.add(item)
              (dx,dy) = item.adv
              dx += textstate.wordspace * textstate.scaling * .01
              textmatrix = translate_matrix(textmatrix, (dx, dy))
              text = []
    if text:
-      item = TextItem(mult_matrix(textmatrix, self.ctm),
-                      font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+      item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
      self.cur_item.add(item)
    return
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -2,11 +2,11 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdfinterp import PDFDevice, PDFResourceManager, \
+from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
+from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
     PDFPageInterpreter, PDFUnicodeNotDefined
-from cmap import CMapDB
-from page import PageItem, FigureItem, TextItem, PageAggregator
+from pdflib.cmap import CMapDB
+from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator


 def enc(x, codec):
@ -21,8 +21,8 @@ def encprops(props, codec):
 ##  TextConverter
 class TextConverter(PageAggregator):
  
-  def __init__(self, rsrc, outfp, codec='ascii', debug=0):
-    PageAggregator.__init__(self, rsrc, debug=debug)
+  def __init__(self, rsrc, outfp, codec='ascii'):
+    PageAggregator.__init__(self, rsrc)
    self.outfp = outfp
    self.codec = codec
    return
@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
 ##
 class HTMLConverter(TextConverter):

-  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0):
-    TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug)
+  def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
+    TextConverter.__init__(self, rsrc, outfp, codec=codec)
    self.pagenum = pagenum
    self.pagepad = pagepad
    self.scale = scale
@ -110,8 +110,8 @@ class HTMLConverter(TextConverter):
 ##
 class TagExtractor(PDFDevice):

-  def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
-    PDFDevice.__init__(self, rsrc, debug=debug)
+  def __init__(self, rsrc, outfp, codec='utf-8'):
+    PDFDevice.__init__(self, rsrc)
    self.outfp = outfp
    self.codec = codec
    self.pageno = 0
@ -166,18 +166,18 @@ class TagExtractor(PDFDevice):
 # pdf2txt
 class TextExtractionNotAllowed(RuntimeError): pass

-def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
-  doc = PDFDocument(debug=debug)
+def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
+  doc = PDFDocument()
  fp = file(fname, 'rb')
-  parser = PDFParser(doc, fp, debug=debug)
+  parser = PDFParser(doc, fp)
  try:
    doc.initialize(password)
  except PDFPasswordIncorrect:
    raise TextExtractionNotAllowed('Incorrect password')
  if not doc.is_extractable:
    raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
-  interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
-  for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
+  interpreter = PDFPageInterpreter(rsrc, device)
+  for (pageno,page) in enumerate(doc.get_pages()):
    if pagenos and (pageno not in pagenos): continue
    interpreter.process_page(page)
    if maxpages and maxpages <= pageno+1: break
@ -217,19 +217,25 @@ def main(argv):
    elif k == '-t': outtype = v
    elif k == '-o': outfp = file(v, 'wb')
  #
-  CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
-  rsrc = PDFResourceManager(debug=debug)
+  CMapDB.debug = debug
+  PDFResourceManager.debug = debug
+  PDFDocument.debug = debug
+  PDFParser.debug = debug
+  PDFPageInterpreter.debug = debug
+  #
+  CMapDB.initialize(cmapdir, cdbcmapdir)
+  rsrc = PDFResourceManager()
  if outtype == 'sgml':
-    device = SGMLConverter(rsrc, outfp, codec, debug=debug)
+    device = SGMLConverter(rsrc, outfp, codec)
  elif outtype == 'html':
-    device = HTMLConverter(rsrc, outfp, codec, debug=debug)
+    device = HTMLConverter(rsrc, outfp, codec)
  elif outtype == 'tag':
-    device = TagExtractor(rsrc, outfp, codec, debug=debug)
+    device = TagExtractor(rsrc, outfp, codec)
  else:
    return usage()
  for fname in args:
    convert(rsrc, device, fname, pagenos, 
-            maxpages=maxpages, password=password, debug=debug)
+            maxpages=maxpages, password=password)
  return

 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -6,14 +6,14 @@ try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
     PSStackParser, PSLiteral, PSKeyword, STRICT, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
+from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
     int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
-from utils import choplist
+from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix


 ##  Exceptions
@ -65,25 +65,6 @@ PREDEFINED_COLORSPACE = dict(
  }.iteritems())


-##  Matrix operations
-##
-def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
-  '''Multiplies two matrices.'''
-  return (a0*a1+c0*b1,    b0*a1+d0*b1,
-          a0*c1+c0*d1,    b0*c1+d0*d1,
-          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
-
-def translate_matrix((a,b,c,d,e,f), (x,y)):
-  return (a,b,c,d,e+x,f+y)
-  
-def apply_matrix((a,b,c,d,e,f), (x,y)):
-  '''Applies a matrix to coordinates.'''
-  return (a*x+c*y+e, b*x+d*y+f)
-
-def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
-  return (a*x+c*y, b*x+d*y)
-
-
 ##  Fonts
 ##

@ -410,9 +391,9 @@ class PDFResourceManager(object):
  such as fonts, images and cmaps so that large objects are not
  allocated multiple times.
  '''
+  debug = 0
  
-  def __init__(self, debug=0):
-    self.debug = debug
+  def __init__(self):
    self.fonts = {}
    return

@ -478,9 +459,10 @@ class PDFResourceManager(object):
 ##
 class PDFDevice(object):

-  def __init__(self, rsrc, debug=0):
+  debug = 0
+  
+  def __init__(self, rsrc):
    self.rsrc = rsrc
-    self.debug = debug
    self.ctm = None
    return
  
@ -520,10 +502,10 @@ class PDFDevice(object):
 ##
 class PDFContentParser(PSStackParser):

-  def __init__(self, streams, debug=0):
+  def __init__(self, streams):
    self.streams = streams
    self.istream = 0
-    PSStackParser.__init__(self, None, debug=debug)
+    PSStackParser.__init__(self, None)
    return

  def fillfp(self):
@ -608,6 +590,8 @@ class PDFContentParser(PSStackParser):
 ##
 class PDFPageInterpreter(object):

+  debug = 0
+  
  class TextState(object):
    def __init__(self):
      self.font = None
@ -632,14 +616,13 @@ class PDFPageInterpreter(object):
      self.linematrix = (0, 0)
      return

-  def __init__(self, rsrc, device, debug=0):
+  def __init__(self, rsrc, device):
    self.rsrc = rsrc
    self.device = device
-    self.debug = debug
    return

  def dup(self):
-    return PDFPageInterpreter(self.rsrc, self.device, debug=self.debug)
+    return PDFPageInterpreter(self.rsrc, self.device)

  def init_resources(self, resources):
    self.fontmap = {}
@ -940,8 +923,8 @@ class PDFPageInterpreter(object):
  def do_TJ(self, seq):
    #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
    textstate = self.textstate
-    matrix = translate_matrix(textstate.matrix, textstate.linematrix)
-    self.device.render_string(textstate, matrix, seq)
+    textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
+    self.device.render_string(textstate, textmatrix, seq)
    font = textstate.font
    s = ''.join( x for x in seq if isinstance(x, str) )
    n = sum( x for x in seq if not isinstance(x, str) )
@ -1030,7 +1013,7 @@ class PDFPageInterpreter(object):
  
  def execute(self, streams):
    try:
-      parser = PDFContentParser(streams, debug=self.debug)
+      parser = PDFContentParser(streams)
    except PSEOF:
      # empty page
      return
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -411,8 +411,9 @@ class PDFXRefStream(object):
 ##
 class PDFDocument(object):

-  def __init__(self, debug=0):
-    self.debug = debug
+  debug = 0
+  
+  def __init__(self):
    self.xrefs = []
    self.objs = {}
    self.parsed_objs = {}
@ -569,7 +570,7 @@ class PDFDocument(object):
        if strmid in self.parsed_objs:
          objs = self.parsed_objs[stream]
        else:
-          parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
+          parser = PDFObjStrmParser(self, stream.get_data())
          objs = []
          try:
            while 1:
@ -601,7 +602,7 @@ class PDFDocument(object):
    return obj
  
  INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
-  def get_pages(self, debug=0):
+  def get_pages(self):
    if not self.ready:
      raise PDFException('PDFDocument is not initialized')
    #assert self.xrefs
@ -611,13 +612,13 @@ class PDFDocument(object):
        if k in self.INHERITABLE_ATTRS and k not in tree:
          tree[k] = v
      if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
-        if 1 <= debug:
+        if 1 <= self.debug:
          print >>stderr, 'Pages: Kids=%r' % tree['Kids']
        for c in tree['Kids']:
          for x in search(c, tree):
            yield x
      elif tree.get('Type') == LITERAL_PAGE:
-        if 1 <= debug:
+        if 1 <= self.debug:
          print >>stderr, 'Page: %r' % tree
        yield (obj.objid, tree)
    if 'Pages' not in self.catalog: return
@ -673,8 +674,8 @@ class PDFDocument(object):
 ##
 class PDFParser(PSStackParser):

-  def __init__(self, doc, fp, debug=0):
-    PSStackParser.__init__(self, fp, debug=debug)
+  def __init__(self, doc, fp):
+    PSStackParser.__init__(self, fp)
    self.doc = doc
    self.doc.set_parser(self)
    return
@ -837,12 +838,13 @@ class PDFParser(PSStackParser):
 ##  PDFObjStrmParser
 ##
 class PDFObjStrmParser(PDFParser):
-  def __init__(self, doc, data, debug=0):
+  
+  def __init__(self, doc, data):
    try:
      from cStringIO import StringIO
    except ImportError:
      from StringIO import StringIO
-    PDFParser.__init__(self, doc, StringIO(data), debug=debug)
+    PDFParser.__init__(self, doc, StringIO(data))
    return
  
  def flush(self):
--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@ -122,9 +122,10 @@ class PSBaseParser(object):
  '''
  BUFSIZ = 4096

-  def __init__(self, fp, debug=0):
+  debug = 0
+  
+  def __init__(self, fp):
    self.fp = fp
-    self.debug = debug
    self.seek(0)
    return

@ -421,8 +422,8 @@ class PSBaseParser(object):
 ##
 class PSStackParser(PSBaseParser):

-  def __init__(self, fp, debug=0):
-    PSBaseParser.__init__(self,fp, debug=debug)
+  def __init__(self, fp):
+    PSBaseParser.__init__(self, fp)
    self.reset()
    return
  
@ -582,7 +583,7 @@ func/a/b{(c)do*}def
    class MyParser(PSBaseParser):
      def flush(self):
        self.add_results(*self.popall())
-    parser = MyParser(StringIO.StringIO(s), debug=1)
+    parser = MyParser(StringIO.StringIO(s))
    r = []
    try:
      while 1:
@ -596,7 +597,7 @@ func/a/b{(c)do*}def
    class MyParser(PSStackParser):
      def flush(self):
        self.add_results(*self.popall())
-    parser = MyParser(StringIO.StringIO(s), debug=1)
+    parser = MyParser(StringIO.StringIO(s))
    r = []
    try:
      while 1:
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@ -1,5 +1,26 @@
 #!/usr/bin/env python
-from struct import pack, unpack
+from struct import unpack
+
+
+##  Matrix operations
+##
+def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
+  '''Multiplies two matrices.'''
+  return (a0*a1+c0*b1,    b0*a1+d0*b1,
+          a0*c1+c0*d1,    b0*c1+d0*d1,
+          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
+
+def translate_matrix((a,b,c,d,e,f), (x,y)):
+  return (a,b,c,d,e+x,f+y)
+  
+def apply_matrix((a,b,c,d,e,f), (x,y)):
+  '''Applies a matrix to coordinates.'''
+  return (a*x+c*y+e, b*x+d*y+f)
+
+def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
+  '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
+  return (a*p+c*q, b*p+d*q)
+

 ##  Utilities
 ##
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -15,7 +15,7 @@ stderr = sys.stderr

 ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
 def esc(s):
-  return ESC_PAT.sub(lambda m:'\\x%02x' % ord(m.group(0)), s)
+  return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)


 # dumpxml
@ -96,10 +96,10 @@ def dumpallobjs(out, doc, codec=None):

 # dumpoutline
 def dumpoutline(outfp, fname, objids, pagenos, password='',
-                dumpall=False, codec=None, debug=0):
-  doc = PDFDocument(debug=debug)
+                dumpall=False, codec=None):
+  doc = PDFDocument()
  fp = file(fname, 'rb')
-  parser = PDFParser(doc, fp, debug=debug)
+  parser = PDFParser(doc, fp)
  doc.initialize(password)
  pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
  for (level,title,dest,a,se) in doc.get_outlines():
@ -116,10 +116,10 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',

 # dumppdf
 def dumppdf(outfp, fname, objids, pagenos, password='',
-            dumpall=False, codec=None, debug=0):
-  doc = PDFDocument(debug=debug)
+            dumpall=False, codec=None):
+  doc = PDFDocument()
  fp = file(fname, 'rb')
-  parser = PDFParser(doc, fp, debug=debug)
+  parser = PDFParser(doc, fp)
  doc.initialize(password)
  if objids:
    for objid in objids:
@ -174,9 +174,12 @@ def main(argv):
    elif k == '-T': proc = dumpoutline
    elif k == '-o': outfp = file(v, 'wb')
  #
+  PDFDocument.debug = debug
+  PDFParser.debug = debug
+  #
  for fname in args:
    proc(outfp, fname, objids, pagenos, password=password,
-         dumpall=dumpall, codec=codec, debug=debug)
+         dumpall=dumpall, codec=codec)
  return

 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/tools/prof.py
+++ b/tools/prof.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import sys
+
+def prof_main(argv):
+  import getopt
+  import hotshot, hotshot.stats
+  def usage():
+    print 'usage: %s output.prof mod.func [args ...]' % argv[0]
+    return 100
+  args = argv[1:]
+  if len(args) < 2: return usage()
+  prof = args.pop(0)
+  name = args.pop(0)
+  i = name.rindex('.')
+  (modname, funcname) = (name[:i], name[i+1:])
+  func = getattr(__import__(modname, fromlist=[modname]), funcname)
+  if args:
+    args.insert(0, argv[0])
+    prof = hotshot.Profile(prof)
+    prof.runcall(lambda : func(args))
+    prof.close()
+  else:
+    stats = hotshot.stats.load(prof)
+    stats.strip_dirs()
+    stats.sort_stats('time', 'calls')
+    stats.print_stats(1000)
+  return
+  
+if __name__ == '__main__': sys.exit(prof_main(sys.argv))