Replace opts by argparse in dumppdf.py (#321)
Also add multi-character argument names Fixes #175pull/315/head
parent
347c125fb8
commit
6cc78ee124
|
@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
### Fixed
|
||||
- Unhandled AssertionError when dumping pdf containing reference to object id 0 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
|
||||
|
||||
### Changed
|
||||
- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
|
||||
|
||||
### Removed
|
||||
- Files for external applications such as django, cgi and pyinstaller ([#314](https://github.com/pdfminer/pdfminer.six/issues/314))
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ def run(filename, options=None):
|
|||
s = 'dumppdf -o %s %s %s' % (output_file.name, options, absolute_path)
|
||||
else:
|
||||
s = 'dumppdf -o %s %s' % (output_file.name, absolute_path)
|
||||
dumppdf.main(s.split(' '))
|
||||
dumppdf.main(s.split(' ')[1:])
|
||||
|
||||
|
||||
class TestDumpPDF():
|
||||
|
|
192
tools/dumppdf.py
192
tools/dumppdf.py
|
@ -1,32 +1,29 @@
|
|||
#!/usr/bin/env python
|
||||
"""Extract pdf structure in XML format"""
|
||||
import logging
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import six
|
||||
|
||||
#
|
||||
# dumppdf.py - dump pdf contents in XML format.
|
||||
#
|
||||
# usage: dumppdf.py [options] [files ...]
|
||||
# options:
|
||||
# -i objid : object id
|
||||
#
|
||||
import sys, os.path, re, logging
|
||||
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
|
||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
||||
from pdfminer.utils import isnumber
|
||||
|
||||
|
||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||
|
||||
|
||||
def e(s):
|
||||
if six.PY3 and isinstance(s, six.binary_type):
|
||||
s = str(s, 'latin-1')
|
||||
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
|
||||
# dumpxml
|
||||
def dumpxml(out, obj, codec=None):
|
||||
if obj is None:
|
||||
out.write('<null />')
|
||||
|
@ -87,7 +84,7 @@ def dumpxml(out, obj, codec=None):
|
|||
|
||||
raise TypeError(obj)
|
||||
|
||||
# dumptrailers
|
||||
|
||||
def dumptrailers(out, doc):
|
||||
for xref in doc.xrefs:
|
||||
out.write('<trailer>\n')
|
||||
|
@ -95,7 +92,7 @@ def dumptrailers(out, doc):
|
|||
out.write('\n</trailer>\n\n')
|
||||
return
|
||||
|
||||
# dumpallobjs
|
||||
|
||||
def dumpallobjs(out, doc, codec=None):
|
||||
visited = set()
|
||||
out.write('<pdf>')
|
||||
|
@ -110,12 +107,12 @@ def dumpallobjs(out, doc, codec=None):
|
|||
dumpxml(out, obj, codec=codec)
|
||||
out.write('\n</object>\n\n')
|
||||
except PDFObjectNotFound as e:
|
||||
print >>sys.stderr, 'not found: %r' % e
|
||||
print('not found: %r' % e)
|
||||
dumptrailers(out, doc)
|
||||
out.write('</pdf>')
|
||||
return
|
||||
|
||||
# dumpoutline
|
||||
|
||||
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||
dumpall=False, codec=None, extractdir=None):
|
||||
fp = open(fname, 'rb')
|
||||
|
@ -123,6 +120,7 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
doc = PDFDocument(parser, password)
|
||||
pages = dict((page.pageid, pageno) for (pageno, page)
|
||||
in enumerate(PDFPage.create_pages(doc), 1))
|
||||
|
||||
def resolve_dest(dest):
|
||||
if isinstance(dest, str):
|
||||
dest = resolve1(doc.get_dest(dest))
|
||||
|
@ -133,6 +131,7 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
if isinstance(dest, PDFObjRef):
|
||||
dest = dest.resolve()
|
||||
return dest
|
||||
|
||||
try:
|
||||
outlines = doc.get_outlines()
|
||||
outfp.write('<outlines>\n')
|
||||
|
@ -145,7 +144,8 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
action = a
|
||||
if isinstance(action, dict):
|
||||
subtype = action.get('S')
|
||||
if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'):
|
||||
if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
|
||||
'D'):
|
||||
dest = resolve_dest(action['D'])
|
||||
pageno = pages[dest[0].objid]
|
||||
s = e(title).encode('utf-8', 'xmlcharrefreplace')
|
||||
|
@ -164,9 +164,11 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
|||
fp.close()
|
||||
return
|
||||
|
||||
# extractembedded
|
||||
|
||||
LITERAL_FILESPEC = LIT('Filespec')
|
||||
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
||||
|
||||
|
||||
def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||
dumpall=False, codec=None, extractdir=None):
|
||||
def extract1(obj):
|
||||
|
@ -184,8 +186,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
|||
path = os.path.join(extractdir, filename)
|
||||
if os.path.exists(path):
|
||||
raise IOError('file exists: %r' % path)
|
||||
print >>sys.stderr, 'extracting: %r' % path
|
||||
out = file(path, 'wb')
|
||||
print('extracting: %r' % path)
|
||||
out = open(path, 'wb')
|
||||
out.write(fileobj.get_data())
|
||||
out.close()
|
||||
return
|
||||
|
@ -201,7 +203,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
|||
fp.close()
|
||||
return
|
||||
|
||||
# dumppdf
|
||||
|
||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||
dumpall=False, codec=None, extractdir=None):
|
||||
fp = open(fname, 'rb')
|
||||
|
@ -230,46 +232,114 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
|||
return
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
objids = []
|
||||
pagenos = set()
|
||||
codec = None
|
||||
password = ''
|
||||
dumpall = False
|
||||
proc = dumppdf
|
||||
outfp = sys.stdout
|
||||
extractdir = None
|
||||
for (k, v) in opts:
|
||||
if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
|
||||
elif k == '-o': outfp = open(v, 'w')
|
||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
elif k == '-P': password = v
|
||||
elif k == '-a': dumpall = True
|
||||
elif k == '-r': codec = 'raw'
|
||||
elif k == '-b': codec = 'binary'
|
||||
elif k == '-t': codec = 'text'
|
||||
elif k == '-T': proc = dumpoutline
|
||||
elif k == '-E':
|
||||
extractdir = v
|
||||
proc = extractembedded
|
||||
def create_parser():
|
||||
parser = ArgumentParser(description=__doc__, add_help=True)
|
||||
parser.add_argument('files', type=str, default=None, nargs='+',
|
||||
help='One or more paths to PDF files.')
|
||||
|
||||
parser.add_argument(
|
||||
'-d', '--debug', default=False, action='store_true',
|
||||
help='Use debug logging level.')
|
||||
procedure_parser = parser.add_mutually_exclusive_group()
|
||||
procedure_parser.add_argument(
|
||||
'-T', '--extract-toc', default=False, action='store_true',
|
||||
help='Extract structure of outline')
|
||||
procedure_parser.add_argument(
|
||||
'-E', '--extract-embedded', type=str,
|
||||
help='Extract embedded files')
|
||||
|
||||
parse_params = parser.add_argument_group(
|
||||
'Parser', description='Used during PDF parsing')
|
||||
parse_params.add_argument(
|
||||
"--page-numbers", type=int, default=None, nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.")
|
||||
parse_params.add_argument(
|
||||
"-p", "--pagenos", type=str,
|
||||
help="A comma-separated list of page numbers to parse. Included for "
|
||||
"legacy applications, use --page-numbers for more idiomatic "
|
||||
"argument entry.")
|
||||
parse_params.add_argument(
|
||||
'-i', '--objects', type=str,
|
||||
help='Comma separated list of object numbers to extract')
|
||||
parse_params.add_argument(
|
||||
'-a', '--all', default=False, action='store_true',
|
||||
help='If the structure of all objects should be extracted')
|
||||
parse_params.add_argument(
|
||||
'-P', '--password', type=str, default='',
|
||||
help='The password to use for decrypting PDF file.')
|
||||
|
||||
output_params = parser.add_argument_group(
|
||||
'Output', description='Used during output generation.')
|
||||
output_params.add_argument(
|
||||
'-o', '--outfile', type=str, default='-',
|
||||
help='Path to file where output is written. Or "-" (default) to '
|
||||
'write to stdout.')
|
||||
codec_parser = output_params.add_mutually_exclusive_group()
|
||||
codec_parser.add_argument(
|
||||
'-r', '--raw-stream', default=False, action='store_true',
|
||||
help='Write stream objects without encoding')
|
||||
codec_parser.add_argument(
|
||||
'-b', '--binary-stream', default=False, action='store_true',
|
||||
help='Write stream objects with binary encoding')
|
||||
codec_parser.add_argument(
|
||||
'-t', '--text-stream', default=False, action='store_true',
|
||||
help='Write stream objects as plain text')
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
parser = create_parser()
|
||||
args = parser.parse_args(args=argv)
|
||||
|
||||
if args.debug:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
if args.outfile == '-':
|
||||
outfp = sys.stdout
|
||||
else:
|
||||
outfp = open(args.outfile, 'w')
|
||||
|
||||
if args.objects:
|
||||
objids = [int(x) for x in args.objects.split(',')]
|
||||
else:
|
||||
objids = []
|
||||
|
||||
if args.page_numbers:
|
||||
pagenos = {x - 1 for x in args.page_numbers}
|
||||
elif args.pagenos:
|
||||
pagenos = {int(x) - 1 for x in args.pagenos.split(',')}
|
||||
else:
|
||||
pagenos = set()
|
||||
|
||||
password = args.password
|
||||
if six.PY2 and sys.stdin.encoding:
|
||||
password = password.decode(sys.stdin.encoding)
|
||||
|
||||
for fname in args:
|
||||
if args.raw_stream:
|
||||
codec = 'raw'
|
||||
elif args.binary_stream:
|
||||
codec = 'binary'
|
||||
elif args.text_stream:
|
||||
codec = 'text'
|
||||
else:
|
||||
codec = None
|
||||
|
||||
if args.extract_toc:
|
||||
extractdir = None
|
||||
proc = dumpoutline
|
||||
elif args.extract_embedded:
|
||||
extractdir = args.extract_embedded
|
||||
proc = extractembedded
|
||||
else:
|
||||
extractdir = None
|
||||
proc = dumppdf
|
||||
|
||||
for fname in args.files:
|
||||
proc(outfp, fname, objids, pagenos, password=password,
|
||||
dumpall=dumpall, codec=codec, extractdir=extractdir)
|
||||
dumpall=args.all, codec=codec, extractdir=extractdir)
|
||||
outfp.close()
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
Loading…
Reference in New Issue