Don't dump the fallback xref by default when using dumppdf.py; add a flag to enable it

Fixes #176 * Add failing tests for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a PDF without xrefs * Raise PDFNoValidXRef with an explanation if dumppdf.py is called on a PDF that does not have an xref * Use a warning instead of an error, because not outputting xrefs is just fine (there aren't any), but it is something the user should know * Add changelog entry * Extend help message
2020-05-23 18:04:34 +02:00 · 2020-05-23 18:04:34 +02:00 · 6e05baf0b7
parent 33b60dfd54
commit 6e05baf0b7
4 changed files with 70 additions and 29 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ## [Unreleased]
 ### Changed
 - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
 ## [20200517]
 ### Added
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408)
+- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408))
 ### Fixed
 - Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
 from .pdfparser import PDFSyntaxError, PDFStreamParser
 from .utils import choplist, nunpack, decode_text
 log = logging.getLogger(__name__)
@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError):
    pass
 class PDFNoValidXRefWarning(SyntaxWarning):
    pass
 class PDFNoOutlines(PDFException):
    pass
--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@ -1,6 +1,8 @@
 import warnings
 from tempfile import NamedTemporaryFile
 from helpers import absolute_sample_path
 from pdfminer.pdfdocument import PDFNoValidXRefWarning
 from tools import dumppdf
@ -16,11 +18,23 @@ def run(filename, options=None):
 class TestDumpPDF():
-    def test_1(self):
+    def test_simple1(self):
-        run('jo.pdf', '-t -a')
+        """dumppdf.py simple1.pdf raises a warning because it has no xref"""
-        run('simple1.pdf', '-t -a')
+        with warnings.catch_warnings(record=True) as ws:
            run('simple1.pdf', '-t -a')
            assert any(w.category == PDFNoValidXRefWarning for w in ws)
    def test_simple2(self):
        run('simple2.pdf', '-t -a')
-        run('simple3.pdf', '-t -a')
+
    def test_jo(self):
        run('jo.pdf', '-t -a')
    def test_simple3(self):
        """dumppdf.py simple3.pdf raises a warning because it has no xref"""
        with warnings.catch_warnings(record=True) as ws:
            run('simple3.pdf', '-t -a')
            assert any(w.category == PDFNoValidXRefWarning for w in ws)
    def test_2(self):
        run('nonfree/dmca.pdf', '-t -a')
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -4,10 +4,12 @@ import logging
 import os.path
 import re
 import sys
 import warnings
 from argparse import ArgumentParser
 import pdfminer
-from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
    PDFNoValidXRefWarning
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None):
    raise TypeError(obj)
-def dumptrailers(out, doc):
+def dumptrailers(out, doc, show_fallback_xref=False):
    for xref in doc.xrefs:
-        out.write('<trailer>\n')
+        if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
-        dumpxml(out, xref.trailer)
+            out.write('<trailer>\n')
-        out.write('\n</trailer>\n\n')
+            dumpxml(out, xref.trailer)
            out.write('\n</trailer>\n\n')
    no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
    if no_xrefs and not show_fallback_xref:
        msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
              'you want to display the content of a fallback xref that ' \
              'contains all objects.'
        warnings.warn(msg, PDFNoValidXRefWarning)
    return
-def dumpallobjs(out, doc, codec=None):
+def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
    visited = set()
    out.write('<pdf>')
    for xref in doc.xrefs:
@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None):
                out.write('\n</object>\n\n')
            except PDFObjectNotFound as e:
                print('not found: %r' % e)
-    dumptrailers(out, doc)
+    dumptrailers(out, doc, show_fallback_xref)
    out.write('</pdf>')
    return
@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
    return
-def dumppdf(outfp, fname, objids, pagenos, password='',
+def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
-            dumpall=False, codec=None, extractdir=None):
+            codec=None, extractdir=None, show_fallback_xref=False):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
-        dumpallobjs(outfp, doc, codec=codec)
+        dumpallobjs(outfp, doc, codec, show_fallback_xref)
    if (not objids) and (not pagenos) and (not dumpall):
-        dumptrailers(outfp, doc)
+        dumptrailers(outfp, doc, show_fallback_xref)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
@ -274,6 +283,11 @@ def create_parser():
    parse_params.add_argument(
        '--all', '-a', default=False, action='store_true',
        help='If the structure of all objects should be extracted')
    parse_params.add_argument(
        '--show-fallback-xref', action='store_true',
        help='Additionally show the fallback xref. Use this if the PDF '
             'has zero or only invalid xref\'s. This setting is ignored if '
             '--extract-toc or --extract-embedded is used.')
    parse_params.add_argument(
        '--password', '-P', type=str, default='',
        help='The password to use for decrypting PDF file.')
@ -333,19 +347,24 @@ def main(argv=None):
    else:
        codec = None
    if args.extract_toc:
        extractdir = None
        proc = dumpoutline
    elif args.extract_embedded:
        extractdir = args.extract_embedded
        proc = extractembedded
    else:
        extractdir = None
        proc = dumppdf
    for fname in args.files:
-        proc(outfp, fname, objids, pagenos, password=password,
+        if args.extract_toc:
-             dumpall=args.all, codec=codec, extractdir=extractdir)
+            dumpoutline(
                outfp, fname, objids, pagenos, password=password,
                dumpall=args.all, codec=codec, extractdir=None
            )
        elif args.extract_embedded:
            extractembedded(
                outfp, fname, objids, pagenos, password=password,
                dumpall=args.all, codec=codec, extractdir=args.extract_embedded
            )
        else:
            dumppdf(
                outfp, fname, objids, pagenos, password=password,
                dumpall=args.all, codec=codec, extractdir=None,
                show_fallback_xref=args.show_fallback_xref
            )
    outfp.close()