diff --git a/CHANGELOG.md b/CHANGELOG.md index 26bdd5e..6595543 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318)) +### Changed +- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321)) + ### Removed - Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314)) diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index 99f8834..b5febb0 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -8,10 +8,10 @@ def run(filename, options=None): absolute_path = absolute_sample_path(filename) with NamedTemporaryFile() as output_file: if options: - s = 'dumppdf -o%s %s %s' % (output_file.name, options, absolute_path) + s = 'dumppdf -o %s %s %s' % (output_file.name, options, absolute_path) else: - s = 'dumppdf -o%s %s' % (output_file.name, absolute_path) - dumppdf.main(s.split(' ')) + s = 'dumppdf -o %s %s' % (output_file.name, absolute_path) + dumppdf.main(s.split(' ')[1:]) class TestDumpPDF(): diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 110f196..01654d4 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -1,32 +1,29 @@ -#!/usr/bin/env python +"""Extract pdf structure in XML format""" +import logging +import os.path +import re +import sys +from argparse import ArgumentParser + +import six -# -# dumppdf.py - dump pdf contents in XML format. -# -# usage: dumppdf.py [options] [files ...] 
-# options: -# -i objid : object id -# -import sys, os.path, re, logging -from pdfminer.psparser import PSKeyword, PSLiteral, LIT -from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value -from pdfminer.pdfpage import PDFPage +from pdfminer.psparser import PSKeyword, PSLiteral, LIT from pdfminer.utils import isnumber - ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') + + def e(s): - if six.PY3 and isinstance(s,six.binary_type): - s=str(s,'latin-1') - return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s) - -import six # Python 2+3 compatibility + if six.PY3 and isinstance(s, six.binary_type): + s = str(s, 'latin-1') + return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s) -# dumpxml def dumpxml(out, obj, codec=None): if obj is None: out.write('') @@ -34,7 +31,7 @@ def dumpxml(out, obj, codec=None): if isinstance(obj, dict): out.write('\n' % len(obj)) - for (k,v) in six.iteritems(obj): + for (k, v) in six.iteritems(obj): out.write('%s\n' % k) out.write('') dumpxml(out, v) @@ -87,7 +84,7 @@ def dumpxml(out, obj, codec=None): raise TypeError(obj) -# dumptrailers + def dumptrailers(out, doc): for xref in doc.xrefs: out.write('\n') @@ -95,7 +92,7 @@ def dumptrailers(out, doc): out.write('\n\n\n') return -# dumpallobjs + def dumpallobjs(out, doc, codec=None): visited = set() out.write('') @@ -110,19 +107,20 @@ def dumpallobjs(out, doc, codec=None): dumpxml(out, obj, codec=codec) out.write('\n\n\n') except PDFObjectNotFound as e: - print >>sys.stderr, 'not found: %r' % e + print('not found: %r' % e) dumptrailers(out, doc) out.write('') return -# dumpoutline + def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None): fp = open(fname, 'rb') parser = 
PDFParser(fp) doc = PDFDocument(parser, password) - pages = dict( (page.pageid, pageno) for (pageno,page) - in enumerate(PDFPage.create_pages(doc), 1) ) + pages = dict((page.pageid, pageno) for (pageno, page) + in enumerate(PDFPage.create_pages(doc), 1)) + def resolve_dest(dest): if isinstance(dest, str): dest = resolve1(doc.get_dest(dest)) @@ -133,10 +131,11 @@ def dumpoutline(outfp, fname, objids, pagenos, password='', if isinstance(dest, PDFObjRef): dest = dest.resolve() return dest + try: outlines = doc.get_outlines() outfp.write('\n') - for (level,title,dest,a,se) in outlines: + for (level, title, dest, a, se) in outlines: pageno = None if dest: dest = resolve_dest(dest) @@ -145,7 +144,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='', action = a if isinstance(action, dict): subtype = action.get('S') - if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'): + if subtype and repr(subtype) == '/\'GoTo\'' and action.get( + 'D'): dest = resolve_dest(action['D']) pageno = pages[dest[0].objid] s = e(title).encode('utf-8', 'xmlcharrefreplace') @@ -164,9 +164,11 @@ def dumpoutline(outfp, fname, objids, pagenos, password='', fp.close() return -# extractembedded + LITERAL_FILESPEC = LIT('Filespec') LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile') + + def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None): def extract1(obj): @@ -184,8 +186,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='', path = os.path.join(extractdir, filename) if os.path.exists(path): raise IOError('file exists: %r' % path) - print >>sys.stderr, 'extracting: %r' % path - out = file(path, 'wb') + print('extracting: %r' % path) + out = open(path, 'wb') out.write(fileobj.get_data()) out.close() return @@ -201,7 +203,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='', fp.close() return -# dumppdf + def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None): fp 
= open(fname, 'rb') @@ -212,7 +214,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='', obj = doc.getobj(objid) dumpxml(outfp, obj, codec=codec) if pagenos: - for (pageno,page) in enumerate(PDFPage.create_pages(doc)): + for (pageno, page) in enumerate(PDFPage.create_pages(doc)): if pageno in pagenos: if codec: for obj in page.contents: @@ -225,51 +227,119 @@ def dumppdf(outfp, fname, objids, pagenos, password='', if (not objids) and (not pagenos) and (not dumpall): dumptrailers(outfp, doc) fp.close() - if codec not in ('raw','binary'): + if codec not in ('raw', 'binary'): outfp.write('\n') return -# main -def main(argv): - import getopt - def usage(): - print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]) - return 100 - try: - (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:') - except getopt.GetoptError: - return usage() - if not args: return usage() - objids = [] - pagenos = set() - codec = None - password = '' - dumpall = False - proc = dumppdf - outfp = sys.stdout - extractdir = None - for (k, v) in opts: - if k == '-d': logging.getLogger().setLevel(logging.DEBUG) - elif k == '-o': outfp = open(v, 'w') - elif k == '-i': objids.extend( int(x) for x in v.split(',') ) - elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) - elif k == '-P': password = v - elif k == '-a': dumpall = True - elif k == '-r': codec = 'raw' - elif k == '-b': codec = 'binary' - elif k == '-t': codec = 'text' - elif k == '-T': proc = dumpoutline - elif k == '-E': - extractdir = v - proc = extractembedded +def create_parser(): + parser = ArgumentParser(description=__doc__, add_help=True) + parser.add_argument('files', type=str, default=None, nargs='+', + help='One or more paths to PDF files.') + parser.add_argument( + '-d', '--debug', default=False, action='store_true', + help='Use debug logging level.') + procedure_parser = parser.add_mutually_exclusive_group() + procedure_parser.add_argument( + 
'-T', '--extract-toc', default=False, action='store_true', + help='Extract structure of outline') + procedure_parser.add_argument( + '-E', '--extract-embedded', type=str, + help='Extract embedded files') + + parse_params = parser.add_argument_group( + 'Parser', description='Used during PDF parsing') + parse_params.add_argument( + "--page-numbers", type=int, default=None, nargs="+", + help="A space-separated list of page numbers to parse.") + parse_params.add_argument( + "-p", "--pagenos", type=str, + help="A comma-separated list of page numbers to parse. Included for " + "legacy applications, use --page-numbers for more idiomatic " + "argument entry.") + parse_params.add_argument( + '-i', '--objects', type=str, + help='Comma separated list of object numbers to extract') + parse_params.add_argument( + '-a', '--all', default=False, action='store_true', + help='If the structure of all objects should be extracted') + parse_params.add_argument( + '-P', '--password', type=str, default='', + help='The password to use for decrypting PDF file.') + + output_params = parser.add_argument_group( + 'Output', description='Used during output generation.') + output_params.add_argument( + '-o', '--outfile', type=str, default='-', + help='Path to file where output is written. 
Or "-" (default) to ' + 'write to stdout.') + codec_parser = output_params.add_mutually_exclusive_group() + codec_parser.add_argument( + '-r', '--raw-stream', default=False, action='store_true', + help='Write stream objects without encoding') + codec_parser.add_argument( + '-b', '--binary-stream', default=False, action='store_true', + help='Write stream objects with binary encoding') + codec_parser.add_argument( + '-t', '--text-stream', default=False, action='store_true', + help='Write stream objects as plain text') + + return parser + + +def main(argv=None): + parser = create_parser() + args = parser.parse_args(args=argv) + + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + + if args.outfile == '-': + outfp = sys.stdout + else: + outfp = open(args.outfile, 'w') + + if args.objects: + objids = [int(x) for x in args.objects.split(',')] + else: + objids = [] + + if args.page_numbers: + pagenos = {x - 1 for x in args.page_numbers} + elif args.pagenos: + pagenos = {int(x) - 1 for x in args.pagenos.split(',')} + else: + pagenos = set() + + password = args.password if six.PY2 and sys.stdin.encoding: password = password.decode(sys.stdin.encoding) - for fname in args: + if args.raw_stream: + codec = 'raw' + elif args.binary_stream: + codec = 'binary' + elif args.text_stream: + codec = 'text' + else: + codec = None + + if args.extract_toc: + extractdir = None + proc = dumpoutline + elif args.extract_embedded: + extractdir = args.extract_embedded + proc = extractembedded + else: + extractdir = None + proc = dumppdf + + for fname in args.files: proc(outfp, fname, objids, pagenos, password=password, - dumpall=dumpall, codec=codec, extractdir=extractdir) + dumpall=args.all, codec=codec, extractdir=extractdir) outfp.close() -if __name__ == '__main__': sys.exit(main(sys.argv)) + +if __name__ == '__main__': + sys.exit(main())