outline (TOC) extraction supported.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@42 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-07-09 15:15:32 +00:00 · 2008-07-09 15:15:32 +00:00 · 9740f26cec
parent cb02051481
commit 9740f26cec
10 changed files with 138 additions and 62 deletions
--- a/README.html
+++ b/README.html
@@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
 <h1>PDFMiner</h1>
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Jul  1 00:02:48 JST 2008
+Last Modified: Thu Jul 10 00:14:07 JST 2008
 <!-- hhmts end -->
 </div>
@@ -135,13 +135,13 @@ Unicode Standard.
 <p>
 Examples:
 <blockquote><pre>
-$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
+$ <strong>python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf</strong>
 (extract text as an HTML file whose filename is output.html)
-$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
+$ <strong>python -m tools.pdf2txt -c euc-jp samples/jo.pdf</strong>
 (extract Japanese texts in vertical writing, CMap is required)
-$ <strong>./pdf2txt.py -P mypassword secret.pdf</strong>
+$ <strong>python -m tools.pdf2txt -P mypassword secret.pdf</strong>
 (extract texts from an encrypted PDF file with a password)
 </pre></blockquote>
@@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents
 <p>
 Examples:
 <blockquote><pre>
-$ <strong>./dumppdf.py -a foo.pdf</strong>
+$ <strong>python -m tools.dumppdf -a foo.pdf</strong>
 (dump all the headers and contents, except stream objects)
-$ <strong>./dumppdf.py -r -i6 foo.pdf &gt; pic.jpeg</strong>
+$ <strong>python -m tools.dumppdf -T foo.pdf</strong>
 (dump the table of contents)
 $ <strong>python -m tools.dumppdf -r -i6 foo.pdf &gt; pic.jpeg</strong>
 (extract a JPEG image)
 </pre></blockquote>
--- a/pdflib/arcfour.py
+++ b/pdflib/arcfour.py
@@ -4,7 +4,7 @@
 #  * public domain *
 #
-class Arcfour:
+class Arcfour(object):
  def __init__(self, key):
    s = range(256)
--- a/pdflib/cmap.py
+++ b/pdflib/cmap.py
@@ -17,7 +17,7 @@ class CMapError(Exception): pass
 ##  CMap
 ##
-class CMap:
+class CMap(object):
  def __init__(self, debug=0):
    self.debug = debug
@@ -163,7 +163,7 @@ class CDBCMap(CMap):
 ##  CMapDB
 ##
-class CMapDB:
+class CMapDB(object):
  class CMapNotFound(CMapError): pass
@@ -340,7 +340,7 @@ class CMapParser(PSStackParser):
 ##  FontMetricsDB
 ##
-class FontMetricsDB:
+class FontMetricsDB(object):
  from fontmetrics import FONT_METRICS
  @classmethod
@@ -350,7 +350,7 @@ class FontMetricsDB:
 ##  EncodingDB
 ##
-class EncodingDB:
+class EncodingDB(object):
  from glyphlist import charname2unicode
  from latin_enc import ENCODING
--- a/pdflib/lzw.py
+++ b/pdflib/lzw.py
@@ -4,7 +4,7 @@ stderr = sys.stderr
 ##  LZWDecoder
 ##
-class LZWDecoder:
+class LZWDecoder(object):
  def __init__(self, fp, debug=0):
    self.fp = fp
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -9,7 +9,7 @@ except ImportError:
 from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
     PSStackParser, PSLiteral, PSKeyword, STRICT, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
+from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
     int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
 from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
@@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass
 ##  ColorSpace
 ##
-class ColorSpace:
+class ColorSpace(object):
  def __init__(self, name, ncomponents):
    self.name = name
    self.ncomponents = ncomponents
@@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
 ##
 # PDFFont
-class PDFFont:
+class PDFFont(object):
  def __init__(self, descriptor, widths, default_width=None):
    self.descriptor = descriptor
@@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont):
 ##  TrueTypeFont
 ##
-class TrueTypeFont:
+class TrueTypeFont(object):
  class CMapNotFound(Exception): pass
@@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont):
 ##  Resource Manager
 ##
-class PDFResourceManager:
+class PDFResourceManager(object):
  '''
  ResourceManager facilitates reuse of shared resources
@@ -464,7 +464,7 @@ class PDFResourceManager:
 ##  PDFDevice
 ##
-class PDFDevice:
+class PDFDevice(object):
  def __init__(self, rsrc, debug=0):
    self.rsrc = rsrc
@@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser):
 ##  Interpreter
 ##
-class PDFPageInterpreter:
+class PDFPageInterpreter(object):
-  class TextState:
+  class TextState(object):
    def __init__(self):
      self.font = None
      self.fontsize = 0
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -11,7 +11,7 @@ from utils import choplist, nunpack
 from arcfour import Arcfour
 from lzw import LZWDecoder
 from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
-     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
+     PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, \
     PSStackParser, STRICT
@@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
 KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
 PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
 class PDFObject(PSObject): pass
 ##  PDFObjRef
 ##
-class PDFObjRef:
+class PDFObjRef(PDFObject):
  def __init__(self, doc, objid, _):
    if objid == 0:
@@ -165,7 +167,7 @@ def stream_value(x):
 ##  PDFStream type
 ##
-class PDFStream:
+class PDFStream(PDFObject):
  def __init__(self, dic, rawdata, decipher=None):
    self.dic = dic
@@ -247,11 +249,11 @@ class PDFStream:
 ##  PDFPage
 ##
-class PDFPage:
+class PDFPage(object):
-  def __init__(self, doc, pageidx, attrs):
+  def __init__(self, doc, pageid, attrs):
    self.doc = doc
-    self.pageid = pageidx
+    self.pageid = pageid
    self.attrs = dict_value(attrs)
    self.lastmod = self.attrs.get('LastModified')
    self.resources = resolve1(self.attrs['Resources'])
@@ -397,7 +399,7 @@ class PDFXRefStream(object):
 ##  at once. Rather it is parsed dynamically as processing goes.
 ##  A PDF parser is associated with the document.
 ##
-class PDFDocument:
+class PDFDocument(object):
  def __init__(self, debug=0):
    self.debug = debug
@@ -453,7 +455,6 @@ class PDFDocument:
    if self.catalog.get('Type') != LITERAL_CATALOG:
      if STRICT:
        raise PDFValueError('Catalog not found!')
    self.outline = self.catalog.get('Outline')
    return
  # initialize(password='')
@@ -608,11 +609,54 @@ class PDFDocument:
      elif tree.get('Type') == LITERAL_PAGE:
        if 1 <= debug:
          print >>stderr, 'Page: %r' % tree
-        yield tree
+        yield (obj.objid, tree)
    if 'Pages' not in self.catalog: return
-    for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
+    for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
-      yield PDFPage(self, i, tree)
+      yield PDFPage(self, pageid, tree)
-    return 
+    return
  def get_outlines(self):
    if 'Outlines' not in self.catalog:
      raise PDFException('no /Outlines defined!')
    def search(entry, level):
      entry = dict_value(entry)
      if 'Title' in entry:
        if 'A' in entry or 'Dest' in entry:
          title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
          dest = entry.get('Dest')
          action = entry.get('A')
          se = entry.get('SE')
          yield (level, title, dest, action, se)
      if 'First' in entry and 'Last' in entry:
        for x in search(entry['First'], level+1):
          yield x
      if 'Next' in entry:
        for x in search(entry['Next'], level):
          yield x
      return
    return search(self.catalog['Outlines'], 0)
  def lookup_name(self, cat, key):
    try:
      names = dict_value(self.catalog['Names'])
    except (PDFTypeError, KeyError):
      raise KeyError((cat,key))
    # may raise KeyError
    d0 = dict_value(names[cat])
    def lookup(d):
      if 'Limits' in d:
        (k1,k2) = list_value(d['Limits'])
        if key < k1 or k2 < key: return None
        if 'Names' in d:
          objs = list_value(d['Names'])
          names = dict(choplist(2, objs))
          return names[key]
      if 'Kids' in d:
        for c in list_value(d['Kids']):
          v = lookup(dict_value(c))
          if v: return v
      raise KeyError((cat,key))
    return lookup(d0)
 ##  PDFParser
--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@@ -19,7 +19,9 @@ class PSValueError(PSException): pass
 ##
 # PSLiteral
-class PSLiteral:
+class PSObject(object): pass
 class PSLiteral(PSObject):
  '''
  PS literals (e.g. "/Name").
@@ -35,7 +37,7 @@ class PSLiteral:
    return '/%s' % self.name
 # PSKeyword
-class PSKeyword:
+class PSKeyword(PSObject):
  '''
  PS keywords (e.g. "showpage").
@@ -51,7 +53,7 @@ class PSKeyword:
    return self.name
 # PSSymbolTable
-class PSSymbolTable:
+class PSSymbolTable(object):
  '''
  Symbol table that stores PSLiteral or PSKeyword.
@@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
 END_STRING = re.compile(r'[()\134]')
 OCT_STRING = re.compile(r'[0-7]')
 ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
-class PSBaseParser:
+class PSBaseParser(object):
  '''
  Most basic PostScript parser that performs only basic tokenization.
@@ -129,6 +131,13 @@ class PSBaseParser:
  def __repr__(self):
    return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
  def flush(self):
    return
  def close(self):
    self.flush()
    return
  def tell(self):
    return self.fp.tell()
@@ -463,8 +472,6 @@ class PSStackParser(PSBaseParser):
  def do_keyword(self, pos, token):
    return
  def flush(self):
    return
  def nextobject(self):
    '''
--- a/pdflib/pycdb.py
+++ b/pdflib/pycdb.py
@@ -51,7 +51,7 @@ def cdbiter(fp, eod):
 # CDBReader
-class CDBReader:
+class CDBReader(object):
  def __init__(self, cdbname, docache=1):
    self.name = cdbname
@@ -59,7 +59,7 @@ class CDBReader:
    hash0 = decode(self._fp.read(2048))
    self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
    self._hash1 = [ None ] * 256
-    self._eod = self._hash0[0]
+    self._eod = hash0[0]
    self._docache = docache
    self._cache = {}
    self._keyiter = None
@@ -149,7 +149,7 @@ class CDBReader:
 # CDBMaker
-class CDBMaker:
+class CDBMaker(object):
  def __init__(self, cdbname, tmpname):
    self.fn = cdbname
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -8,7 +8,7 @@
 #
 import sys, re
 from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \
-     PDFObjRef, PSKeyword, PSLiteral
+     PDFObjRef, PSKeyword, PSLiteral, resolve1
 stdout = sys.stdout
 stderr = sys.stderr
@@ -94,8 +94,28 @@ def dumpallobjs(out, doc, codec=None):
  out.write('</pdf>')
  return
 # dumpoutline
 def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, debug=0):
  doc = PDFDocument(debug=debug)
  fp = file(fname, 'rb')
  parser = PDFParser(doc, fp, debug=debug)
  doc.initialize(password)
  pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
  for (level,title,dest,a,se) in doc.get_outlines():
    pageno = None
    if dest:
      dest = resolve1( doc.lookup_name('Dests', dest) )
      if isinstance(dest, dict):
        dest = dest['D']
      pageno = pages[dest[0].objid]
    outfp.write(repr((level,title,dest,pageno))+'\n')
  parser.close()
  fp.close()
  return
 # dumppdf
-def dumppdf(outfp, fname, objids, pageids, password='',
+def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, debug=0):
  doc = PDFDocument(debug=debug)
  fp = file(fname, 'rb')
@@ -110,13 +130,13 @@ def dumppdf(outfp, fname, objids, pageids, password='',
        outfp.write(obj.get_data())
      else:
        dumpxml(outfp, obj, codec=codec)
-  if pageids:
+  if pagenos:
-    for page in doc.get_pages():
+    for (pageno,page) in enumerate(doc.get_pages()):
-      if page.pageid in pageids:
+      if pageno in pagenos:
        dumpxml(outfp, page.attrs)
  if dumpall:
    dumpallobjs(outfp, doc, codec=codec)
-  if (not objids) and (not pageids) and (not dumpall):
+  if (not objids) and (not pagenos) and (not dumpall):
    dumptrailers(outfp, doc)
  fp.close()
  outfp.write('\n')
@@ -127,34 +147,36 @@ def dumppdf(outfp, fname, objids, pageids, password='',
 def main(argv):
  import getopt
  def usage():
-    print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-i objid] file ...' % argv[0]
+    print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
    return 100
  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbti:')
+    (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
  debug = 0
  objids = []
-  pageids = set()
+  pagenos = set()
  codec = None
  password = ''
  dumpall = False
  proc = dumppdf
  outfp = stdout
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-i': objids.extend( int(x) for x in v.split(',') )
-    elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') )
+    elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-P': password = v
    elif k == '-a': dumpall = True
    elif k == '-r': codec = 'raw'
    elif k == '-b': codec = 'binary'
    elif k == '-t': codec = 'text'
    elif k == '-T': proc = dumpoutline
    elif k == '-o': outfp = file(v, 'wb')
  #
  for fname in args:
-    dumppdf(outfp, fname, objids, pageids, password=password,
+    proc(outfp, fname, objids, pagenos, password=password,
-            dumpall=dumpall, codec=codec, debug=debug)
+         dumpall=dumpall, codec=codec, debug=debug)
  return
 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -93,7 +93,7 @@ class TextConverter(PDFDevice):
    return
  def begin_page(self, page):
-    self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate)
+    self.context = PageItem(len(self.pages), page.mediabox, page.rotate)
    return
  def end_page(self, _):
    assert not self.stack
@@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
 # pdf2txt
 class TextExtractionNotAllowed(RuntimeError): pass
-def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
+def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0):
  device = TextConverter(rsrc, debug=debug)
  doc = PDFDocument(debug=debug)
  fp = file(fname, 'rb')
@@ -218,10 +218,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
    raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
  interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
  device.reset()
-  for (i,page) in enumerate(doc.get_pages(debug=debug)):
+  for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
-    if pages and (i not in pages): continue
+    if pagenos and (pageno not in pagenos): continue
    interpreter.process_page(page)
-    if maxpages and maxpages <= i+1: break
+    if maxpages and maxpages <= pageno+1: break
  if html:
    device.dump_html(outfp, codec)
  else:
@@ -235,7 +235,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
 def main(argv):
  import getopt
  def usage():
-    print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
+    print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
    return 100
  try:
    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
@@ -246,14 +246,14 @@ def main(argv):
  cmapdir = 'CMap'
  cdbcmapdir = 'CDBCMap'
  codec = 'ascii'
-  pages = set()
+  pagenos = set()
  maxpages = 0
  html = False
  password = ''
  outfp = stdout
  for (k, v) in opts:
    if k == '-d': debug += 1
-    elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
+    elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-P': password = v
    elif k == '-c': codec = v
    elif k == '-m': maxpages = int(v)
@@ -265,7 +265,7 @@ def main(argv):
  CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
  rsrc = PDFResourceManager(debug=debug)
  for fname in args:
-    pdf2txt(outfp, rsrc, fname, pages, codec, 
+    pdf2txt(outfp, rsrc, fname, pagenos, codec, 
            maxpages=maxpages, html=html, password=password, debug=debug)
  return