diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 1748fad..87e6d9e 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -29,6 +29,7 @@ from utils import decode_text, ObjIdRange ## class PDFSyntaxError(PDFException): pass class PDFNoValidXRef(PDFSyntaxError): pass +class PDFNoOutlines(PDFException): pass class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass @@ -513,7 +514,7 @@ class PDFDocument(object): def get_outlines(self): if 'Outlines' not in self.catalog: - raise PDFException('No /Outlines defined!') + raise PDFNoOutlines('No /Outlines defined!') def search(entry, level): entry = dict_value(entry) if 'Title' in entry: diff --git a/tools/dumppdf.py b/tools/dumppdf.py index cd8bb28..3e673ae 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -8,17 +8,21 @@ # import sys, re from pdfminer.psparser import PSKeyword, PSLiteral -from pdfminer.pdfparser import PDFDocument, PDFParser +from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value -ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') -def esc(s): +ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') +def e(s): return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s) # dumpxml def dumpxml(out, obj, codec=None): + if obj is None: + out.write('') + return + if isinstance(obj, dict): out.write('\n' % len(obj)) for (k,v) in obj.iteritems(): @@ -38,7 +42,7 @@ def dumpxml(out, obj, codec=None): return if isinstance(obj, str): - out.write('%s' % (len(obj), esc(obj))) + out.write('%s' % (len(obj), e(obj))) return if isinstance(obj, PDFStream): @@ -52,12 +56,12 @@ def dumpxml(out, obj, codec=None): out.write('\n\n') if codec == 'text': data = obj.get_data() - out.write('%s\n' % (len(data), esc(data))) + out.write('%s\n' % (len(data), e(data))) out.write('') return if isinstance(obj, PDFObjRef): - out.write('' % obj.objid) + out.write('' % obj.objid) return if isinstance(obj, PSKeyword): @@ -109,21 +113,35 @@ def dumpoutline(outfp, fname, objids, pagenos, password='', doc.set_parser(parser) doc.initialize(password) pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) ) - for (level,title,dest,a,se) in doc.get_outlines(): - pageno = None - if dest: - dest = resolve1( doc.lookup_name('Dests', dest) ) - if isinstance(dest, dict): - dest = dest['D'] - pageno = pages[dest[0].objid] - elif a: - action = a.resolve() - if isinstance(action, dict): - subtype = action.get('S') - if subtype and repr(subtype) == '/GoTo' and action.get('D'): - dest = action['D'] - pageno = pages[dest[0].objid] - outfp.write(repr((level,title,dest,pageno))+'\n') + try: + outlines = doc.get_outlines() + outfp.write('\n') + for (level,title,dest,a,se) in outlines: + pageno = None + if dest: + dest = resolve1( doc.lookup_name('Dests', dest) ) + if isinstance(dest, dict): + dest = dest['D'] + pageno = pages[dest[0].objid] + elif a: + action = a.resolve() + if isinstance(action, dict): + subtype = action.get('S') + if subtype and repr(subtype) == '/GoTo' and action.get('D'): + dest = action['D'] + pageno = pages[dest[0].objid] + s = e(title).encode('utf-8', 'xmlcharrefreplace') + outfp.write('\n' % (level, s)) + if dest is not None: + outfp.write('') + dumpxml(outfp, dest) + outfp.write('\n') + if pageno is not None: + outfp.write('%r\n' % pageno) + outfp.write('\n') + outfp.write('\n') + except PDFNoOutlines: + pass parser.close() fp.close() return