better TOC extraction

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@207 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-04-24 01:34:18 +00:00
parent f35ef4b084
commit 9052cd1ea7
2 changed files with 41 additions and 22 deletions

View File

@ -29,6 +29,7 @@ from utils import decode_text, ObjIdRange
##
class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
@ -513,7 +514,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFException('No /Outlines defined!')
raise PDFNoOutlines('No /Outlines defined!')
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:

View File

@ -8,17 +8,21 @@
#
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
def esc(s):
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
def e(s):
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
# dumpxml
def dumpxml(out, obj, codec=None):
if obj is None:
out.write('<null />')
return
if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems():
@ -38,7 +42,7 @@ def dumpxml(out, obj, codec=None):
return
if isinstance(obj, str):
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
return
if isinstance(obj, PDFStream):
@ -52,12 +56,12 @@ def dumpxml(out, obj, codec=None):
out.write('\n</props>\n')
if codec == 'text':
data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
out.write('</stream>')
return
if isinstance(obj, PDFObjRef):
out.write('<ref id="%d"/>' % obj.objid)
out.write('<ref id="%d" />' % obj.objid)
return
if isinstance(obj, PSKeyword):
@ -109,21 +113,35 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
doc.set_parser(parser)
doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines():
pageno = None
if dest:
dest = resolve1( doc.lookup_name('Dests', dest) )
if isinstance(dest, dict):
dest = dest['D']
pageno = pages[dest[0].objid]
elif a:
action = a.resolve()
if isinstance(action, dict):
subtype = action.get('S')
if subtype and repr(subtype) == '/GoTo' and action.get('D'):
dest = action['D']
pageno = pages[dest[0].objid]
outfp.write(repr((level,title,dest,pageno))+'\n')
try:
outlines = doc.get_outlines()
outfp.write('<outlines>\n')
for (level,title,dest,a,se) in outlines:
pageno = None
if dest:
dest = resolve1( doc.lookup_name('Dests', dest) )
if isinstance(dest, dict):
dest = dest['D']
pageno = pages[dest[0].objid]
elif a:
action = a.resolve()
if isinstance(action, dict):
subtype = action.get('S')
if subtype and repr(subtype) == '/GoTo' and action.get('D'):
dest = action['D']
pageno = pages[dest[0].objid]
s = e(title).encode('utf-8', 'xmlcharrefreplace')
outfp.write('<outline level="%r" title="%s">\n' % (level, s))
if dest is not None:
outfp.write('<dest>')
dumpxml(outfp, dest)
outfp.write('</dest>\n')
if pageno is not None:
outfp.write('<pageno>%r</pageno>\n' % pageno)
outfp.write('</outline>\n')
outfp.write('</outlines>\n')
except PDFNoOutlines:
pass
parser.close()
fp.close()
return