diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
index 1748fad..87e6d9e 100644
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@@ -29,6 +29,7 @@ from utils import decode_text, ObjIdRange
##
class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass
+class PDFNoOutlines(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
@@ -513,7 +514,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
- raise PDFException('No /Outlines defined!')
+ raise PDFNoOutlines('No /Outlines defined!')
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index cd8bb28..3e673ae 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -8,17 +8,21 @@
#
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
-from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
-ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
-def esc(s):
+ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
+def e(s):
return ESC_PAT.sub(lambda m:'%d;' % ord(m.group(0)), s)
# dumpxml
def dumpxml(out, obj, codec=None):
+ if obj is None:
+ out.write('')
+ return
+
if isinstance(obj, dict):
out.write('\n' % len(obj))
for (k,v) in obj.iteritems():
@@ -38,7 +42,7 @@ def dumpxml(out, obj, codec=None):
return
if isinstance(obj, str):
- out.write('%s' % (len(obj), esc(obj)))
+ out.write('%s' % (len(obj), e(obj)))
return
if isinstance(obj, PDFStream):
@@ -52,12 +56,12 @@ def dumpxml(out, obj, codec=None):
out.write('\n\n')
if codec == 'text':
data = obj.get_data()
- out.write('%s\n' % (len(data), esc(data)))
+ out.write('%s\n' % (len(data), e(data)))
out.write('')
return
if isinstance(obj, PDFObjRef):
- out.write('' % obj.objid)
+ out.write('' % obj.objid)
return
if isinstance(obj, PSKeyword):
@@ -109,21 +113,35 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
doc.set_parser(parser)
doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
- for (level,title,dest,a,se) in doc.get_outlines():
- pageno = None
- if dest:
- dest = resolve1( doc.lookup_name('Dests', dest) )
- if isinstance(dest, dict):
- dest = dest['D']
- pageno = pages[dest[0].objid]
- elif a:
- action = a.resolve()
- if isinstance(action, dict):
- subtype = action.get('S')
- if subtype and repr(subtype) == '/GoTo' and action.get('D'):
- dest = action['D']
- pageno = pages[dest[0].objid]
- outfp.write(repr((level,title,dest,pageno))+'\n')
+ try:
+ outlines = doc.get_outlines()
+ outfp.write('\n')
+ for (level,title,dest,a,se) in outlines:
+ pageno = None
+ if dest:
+ dest = resolve1( doc.lookup_name('Dests', dest) )
+ if isinstance(dest, dict):
+ dest = dest['D']
+ pageno = pages[dest[0].objid]
+ elif a:
+ action = a.resolve()
+ if isinstance(action, dict):
+ subtype = action.get('S')
+ if subtype and repr(subtype) == '/GoTo' and action.get('D'):
+ dest = action['D']
+ pageno = pages[dest[0].objid]
+ s = e(title).encode('utf-8', 'xmlcharrefreplace')
+ outfp.write('\n' % (level, s))
+ if dest is not None:
+ outfp.write('')
+ dumpxml(outfp, dest)
+ outfp.write('\n')
+ if pageno is not None:
+ outfp.write('%r\n' % pageno)
+ outfp.write('\n')
+ outfp.write('\n')
+ except PDFNoOutlines:
+ pass
parser.close()
fp.close()
return