better TOC extraction
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@207 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f35ef4b084
commit
9052cd1ea7
|
@ -29,6 +29,7 @@ from utils import decode_text, ObjIdRange
|
|||
##
|
||||
class PDFSyntaxError(PDFException): pass
|
||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||
class PDFNoOutlines(PDFException): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
|
||||
|
@ -513,7 +514,7 @@ class PDFDocument(object):
|
|||
|
||||
def get_outlines(self):
|
||||
if 'Outlines' not in self.catalog:
|
||||
raise PDFException('No /Outlines defined!')
|
||||
raise PDFNoOutlines('No /Outlines defined!')
|
||||
def search(entry, level):
|
||||
entry = dict_value(entry)
|
||||
if 'Title' in entry:
|
||||
|
|
|
@ -8,17 +8,21 @@
|
|||
#
|
||||
import sys, re
|
||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
|
||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||
|
||||
|
||||
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
|
||||
def esc(s):
|
||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||
def e(s):
|
||||
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
||||
|
||||
|
||||
# dumpxml
|
||||
def dumpxml(out, obj, codec=None):
|
||||
if obj is None:
|
||||
out.write('<null />')
|
||||
return
|
||||
|
||||
if isinstance(obj, dict):
|
||||
out.write('<dict size="%d">\n' % len(obj))
|
||||
for (k,v) in obj.iteritems():
|
||||
|
@ -38,7 +42,7 @@ def dumpxml(out, obj, codec=None):
|
|||
return
|
||||
|
||||
if isinstance(obj, str):
|
||||
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
||||
out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
|
||||
return
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
|
@ -52,7 +56,7 @@ def dumpxml(out, obj, codec=None):
|
|||
out.write('\n</props>\n')
|
||||
if codec == 'text':
|
||||
data = obj.get_data()
|
||||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||
out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
|
||||
out.write('</stream>')
|
||||
return
|
||||
|
||||
|
@ -109,7 +113,10 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
doc.set_parser(parser)
|
||||
doc.initialize(password)
|
||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||
for (level,title,dest,a,se) in doc.get_outlines():
|
||||
try:
|
||||
outlines = doc.get_outlines()
|
||||
outfp.write('<outlines>\n')
|
||||
for (level,title,dest,a,se) in outlines:
|
||||
pageno = None
|
||||
if dest:
|
||||
dest = resolve1( doc.lookup_name('Dests', dest) )
|
||||
|
@ -123,7 +130,18 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
if subtype and repr(subtype) == '/GoTo' and action.get('D'):
|
||||
dest = action['D']
|
||||
pageno = pages[dest[0].objid]
|
||||
outfp.write(repr((level,title,dest,pageno))+'\n')
|
||||
s = e(title).encode('utf-8', 'xmlcharrefreplace')
|
||||
outfp.write('<outline level="%r" title="%s">\n' % (level, s))
|
||||
if dest is not None:
|
||||
outfp.write('<dest>')
|
||||
dumpxml(outfp, dest)
|
||||
outfp.write('</dest>\n')
|
||||
if pageno is not None:
|
||||
outfp.write('<pageno>%r</pageno>\n' % pageno)
|
||||
outfp.write('</outline>\n')
|
||||
outfp.write('</outlines>\n')
|
||||
except PDFNoOutlines:
|
||||
pass
|
||||
parser.close()
|
||||
fp.close()
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue