diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26bdd5e..6595543 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed
- Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
+### Changed
+- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
+
### Removed
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py
index 99f8834..b5febb0 100644
--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@@ -8,10 +8,10 @@ def run(filename, options=None):
absolute_path = absolute_sample_path(filename)
with NamedTemporaryFile() as output_file:
if options:
- s = 'dumppdf -o%s %s %s' % (output_file.name, options, absolute_path)
+ s = 'dumppdf -o %s %s %s' % (output_file.name, options, absolute_path)
else:
- s = 'dumppdf -o%s %s' % (output_file.name, absolute_path)
- dumppdf.main(s.split(' '))
+ s = 'dumppdf -o %s %s' % (output_file.name, absolute_path)
+ dumppdf.main(s.split(' ')[1:])
class TestDumpPDF():
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index 110f196..01654d4 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -1,32 +1,29 @@
-#!/usr/bin/env python
+"""Extract pdf structure in XML format"""
+import logging
+import os.path
+import re
+import sys
+from argparse import ArgumentParser
+
+import six
-#
-# dumppdf.py - dump pdf contents in XML format.
-#
-# usage: dumppdf.py [options] [files ...]
-# options:
-# -i objid : object id
-#
-import sys, os.path, re, logging
-from pdfminer.psparser import PSKeyword, PSLiteral, LIT
-from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
-from pdfminer.pdfpage import PDFPage
+from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.utils import isnumber
-
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
+
+
def e(s):
- if six.PY3 and isinstance(s,six.binary_type):
- s=str(s,'latin-1')
- return ESC_PAT.sub(lambda m:'%d;' % ord(m.group(0)), s)
-
-import six # Python 2+3 compatibility
+ """Return s with every character matched by ESC_PAT replaced by its
+ decimal code point followed by ';'.
+
+ On Python 3, byte strings are first decoded as latin-1.
+ """
+ if six.PY3 and isinstance(s, six.binary_type):
+ s = str(s, 'latin-1')
+ return ESC_PAT.sub(lambda m: '%d;' % ord(m.group(0)), s)
-# dumpxml
def dumpxml(out, obj, codec=None):
if obj is None:
out.write('')
@@ -34,7 +31,7 @@ def dumpxml(out, obj, codec=None):
if isinstance(obj, dict):
out.write('\n' % len(obj))
- for (k,v) in six.iteritems(obj):
+ for (k, v) in six.iteritems(obj):
out.write('%s\n' % k)
out.write('')
dumpxml(out, v)
@@ -87,7 +84,7 @@ def dumpxml(out, obj, codec=None):
raise TypeError(obj)
-# dumptrailers
+
def dumptrailers(out, doc):
for xref in doc.xrefs:
out.write('\n')
@@ -95,7 +92,7 @@ def dumptrailers(out, doc):
out.write('\n\n\n')
return
-# dumpallobjs
+
def dumpallobjs(out, doc, codec=None):
visited = set()
out.write('')
@@ -110,19 +107,20 @@ def dumpallobjs(out, doc, codec=None):
dumpxml(out, obj, codec=codec)
out.write('\n\n\n')
except PDFObjectNotFound as e:
- print >>sys.stderr, 'not found: %r' % e
+ # Diagnostics go to stderr (as before this patch) so they never end up
+ # inside the XML written to stdout.
+ sys.stderr.write('not found: %r\n' % e)
dumptrailers(out, doc)
out.write('')
return
-# dumpoutline
+
def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
fp = open(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
- pages = dict( (page.pageid, pageno) for (pageno,page)
- in enumerate(PDFPage.create_pages(doc), 1) )
+ pages = dict((page.pageid, pageno) for (pageno, page)
+ in enumerate(PDFPage.create_pages(doc), 1))
+
def resolve_dest(dest):
if isinstance(dest, str):
dest = resolve1(doc.get_dest(dest))
@@ -133,10 +131,11 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
if isinstance(dest, PDFObjRef):
dest = dest.resolve()
return dest
+
try:
outlines = doc.get_outlines()
outfp.write('\n')
- for (level,title,dest,a,se) in outlines:
+ for (level, title, dest, a, se) in outlines:
pageno = None
if dest:
dest = resolve_dest(dest)
@@ -145,7 +144,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
action = a
if isinstance(action, dict):
subtype = action.get('S')
- if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'):
+ if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
+ 'D'):
dest = resolve_dest(action['D'])
pageno = pages[dest[0].objid]
s = e(title).encode('utf-8', 'xmlcharrefreplace')
@@ -164,9 +164,11 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
fp.close()
return
-# extractembedded
+
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
+
+
def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
def extract1(obj):
@@ -184,8 +186,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
path = os.path.join(extractdir, filename)
if os.path.exists(path):
raise IOError('file exists: %r' % path)
- print >>sys.stderr, 'extracting: %r' % path
- out = file(path, 'wb')
+ # Progress messages go to stderr (as before this patch), not to stdout.
+ sys.stderr.write('extracting: %r\n' % path)
+ out = open(path, 'wb')
out.write(fileobj.get_data())
out.close()
return
@@ -201,7 +203,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
fp.close()
return
-# dumppdf
+
def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
fp = open(fname, 'rb')
@@ -212,7 +214,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
obj = doc.getobj(objid)
dumpxml(outfp, obj, codec=codec)
if pagenos:
- for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
+ for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
if pageno in pagenos:
if codec:
for obj in page.contents:
@@ -225,51 +227,119 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc)
fp.close()
- if codec not in ('raw','binary'):
+ if codec not in ('raw', 'binary'):
outfp.write('\n')
return
-# main
-def main(argv):
- import getopt
- def usage():
- print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
- return 100
- try:
- (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
- except getopt.GetoptError:
- return usage()
- if not args: return usage()
- objids = []
- pagenos = set()
- codec = None
- password = ''
- dumpall = False
- proc = dumppdf
- outfp = sys.stdout
- extractdir = None
- for (k, v) in opts:
- if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
- elif k == '-o': outfp = open(v, 'w')
- elif k == '-i': objids.extend( int(x) for x in v.split(',') )
- elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
- elif k == '-P': password = v
- elif k == '-a': dumpall = True
- elif k == '-r': codec = 'raw'
- elif k == '-b': codec = 'binary'
- elif k == '-t': codec = 'text'
- elif k == '-T': proc = dumpoutline
- elif k == '-E':
- extractdir = v
- proc = extractembedded
+def create_parser():
+    """Build the argument parser for the dumppdf command line interface.
+
+    :return: an ArgumentParser that accepts one or more PDF paths together
+        with the procedure, parsing and output options defined below.
+    """
+    parser = ArgumentParser(description=__doc__, add_help=True)
+    parser.add_argument('files', type=str, default=None, nargs='+',
+                        help='One or more paths to PDF files.')
+    parser.add_argument(
+        '-d', '--debug', default=False, action='store_true',
+        help='Use debug logging level.')
+    # -T and -E select alternative procedures, so at most one may be given.
+    procedure_parser = parser.add_mutually_exclusive_group()
+    procedure_parser.add_argument(
+        '-T', '--extract-toc', default=False, action='store_true',
+        help='Extract structure of outline')
+    procedure_parser.add_argument(
+        '-E', '--extract-embedded', type=str,
+        help='Extract embedded files')
+
+    parse_params = parser.add_argument_group(
+        'Parser', description='Used during PDF parsing')
+    parse_params.add_argument(
+        "--page-numbers", type=int, default=None, nargs="+",
+        help="A space-separated list of page numbers to parse.")
+    parse_params.add_argument(
+        "-p", "--pagenos", type=str,
+        help="A comma-separated list of page numbers to parse. Included for "
+             "legacy applications, use --page-numbers for more idiomatic "
+             "argument entry.")
+    parse_params.add_argument(
+        '-i', '--objects', type=str,
+        help='Comma separated list of object numbers to extract')
+    parse_params.add_argument(
+        '-a', '--all', default=False, action='store_true',
+        help='If the structure of all objects should be extracted')
+    parse_params.add_argument(
+        '-P', '--password', type=str, default='',
+        help='The password to use for decrypting PDF file.')
+
+    output_params = parser.add_argument_group(
+        'Output', description='Used during output generation.')
+    output_params.add_argument(
+        '-o', '--outfile', type=str, default='-',
+        help='Path to file where output is written. Or "-" (default) to '
+             'write to stdout.')
+    # The stream codecs exclude each other: -r, -b and -t cannot be combined.
+    codec_parser = output_params.add_mutually_exclusive_group()
+    codec_parser.add_argument(
+        '-r', '--raw-stream', default=False, action='store_true',
+        help='Write stream objects without encoding')
+    codec_parser.add_argument(
+        '-b', '--binary-stream', default=False, action='store_true',
+        help='Write stream objects with binary encoding')
+    codec_parser.add_argument(
+        '-t', '--text-stream', default=False, action='store_true',
+        help='Write stream objects as plain text')
+
+    return parser
+
+
+def main(argv=None):
+    """Parse the command line and run the selected procedure on each file.
+
+    :param argv: command line arguments without the program name; passed
+        unchanged to ``ArgumentParser.parse_args``.
+    """
+    parser = create_parser()
+    args = parser.parse_args(args=argv)
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    if args.objects:
+        objids = [int(x) for x in args.objects.split(',')]
+    else:
+        objids = []
+
+    # Page numbers given on the command line are decremented by one before
+    # being handed to the procedures.
+    if args.page_numbers:
+        pagenos = {x - 1 for x in args.page_numbers}
+    elif args.pagenos:
+        pagenos = {int(x) - 1 for x in args.pagenos.split(',')}
+    else:
+        pagenos = set()
+
+    password = args.password
     if six.PY2 and sys.stdin.encoding:
         password = password.decode(sys.stdin.encoding)
-    for fname in args:
-        proc(outfp, fname, objids, pagenos, password=password,
-             dumpall=dumpall, codec=codec, extractdir=extractdir)
-    outfp.close()
+
+    if args.raw_stream:
+        codec = 'raw'
+    elif args.binary_stream:
+        codec = 'binary'
+    elif args.text_stream:
+        codec = 'text'
+    else:
+        codec = None
+
+    if args.extract_toc:
+        extractdir = None
+        proc = dumpoutline
+    elif args.extract_embedded:
+        extractdir = args.extract_embedded
+        proc = extractembedded
+    else:
+        extractdir = None
+        proc = dumppdf
+
+    # Open the output only after every option has been validated, so that a
+    # malformed option value does not leave a truncated output file behind.
+    if args.outfile == '-':
+        outfp = sys.stdout
+    else:
+        outfp = open(args.outfile, 'w')
+
+    try:
+        for fname in args.files:
+            proc(outfp, fname, objids, pagenos, password=password,
+                 dumpall=args.all, codec=codec, extractdir=extractdir)
+    finally:
+        # Close only a file opened here: closing sys.stdout would break any
+        # later output of a caller that runs main() in-process.
+        if outfp is not sys.stdout:
+            outfp.close()
-if __name__ == '__main__': sys.exit(main(sys.argv))
+
+if __name__ == '__main__':
+ sys.exit(main())