Don't dump the fallback xref by default when using dumppdf.py; add a flag to enable it

Fixes #176 * Add failing test for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a PDF without xrefs * Raise PDFNoValidXRef with an explanation if dumppdf.py is called on a PDF that does not have an xref * Use a warning instead of an error, because not outputting xrefs is fine (there aren't any), but it is something the user should know * Add changelog entry * Extend help message
2020-05-23 18:04:34 +02:00 · 2020-05-23 18:04:34 +02:00 · 6e05baf0b7
parent 33b60dfd54
commit 6e05baf0b7
4 changed files with 70 additions and 29 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file.

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [Unreleased]
+
+### Changed
+- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
+
 ## [20200517]

 ### Added
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408)
+- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408))

 ### Fixed
 - Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
 from .pdfparser import PDFSyntaxError, PDFStreamParser
 from .utils import choplist, nunpack, decode_text

-
 log = logging.getLogger(__name__)


@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError):
    pass


+class PDFNoValidXRefWarning(SyntaxWarning):
+    pass
+
+
 class PDFNoOutlines(PDFException):
    pass

--- a/tests/test_tools_dumppdf.py
+++ b/tests/test_tools_dumppdf.py
@ -1,6 +1,8 @@
+import warnings
 from tempfile import NamedTemporaryFile

 from helpers import absolute_sample_path
+from pdfminer.pdfdocument import PDFNoValidXRefWarning
 from tools import dumppdf


@ -16,11 +18,23 @@ def run(filename, options=None):


 class TestDumpPDF():
-    def test_1(self):
-        run('jo.pdf', '-t -a')
-        run('simple1.pdf', '-t -a')
+    def test_simple1(self):
+        """dumppdf.py simple1.pdf raises a warning because it has no xref"""
+        with warnings.catch_warnings(record=True) as ws:
+            run('simple1.pdf', '-t -a')
+            assert any(w.category == PDFNoValidXRefWarning for w in ws)
+
+    def test_simple2(self):
        run('simple2.pdf', '-t -a')
-        run('simple3.pdf', '-t -a')
+
+    def test_jo(self):
+        run('jo.pdf', '-t -a')
+
+    def test_simple3(self):
+        """dumppdf.py simple3.pdf raises a warning because it has no xref"""
+        with warnings.catch_warnings(record=True) as ws:
+            run('simple3.pdf', '-t -a')
+            assert any(w.category == PDFNoValidXRefWarning for w in ws)

    def test_2(self):
        run('nonfree/dmca.pdf', '-t -a')
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -4,10 +4,12 @@ import logging
 import os.path
 import re
 import sys
+import warnings
 from argparse import ArgumentParser

 import pdfminer
-from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
+    PDFNoValidXRefWarning
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None):
    raise TypeError(obj)


-def dumptrailers(out, doc):
+def dumptrailers(out, doc, show_fallback_xref=False):
    for xref in doc.xrefs:
-        out.write('<trailer>\n')
-        dumpxml(out, xref.trailer)
-        out.write('\n</trailer>\n\n')
+        if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
+            out.write('<trailer>\n')
+            dumpxml(out, xref.trailer)
+            out.write('\n</trailer>\n\n')
+    no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
+    if no_xrefs and not show_fallback_xref:
+        msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
+              'you want to display the content of a fallback xref that ' \
+              'contains all objects.'
+        warnings.warn(msg, PDFNoValidXRefWarning)
    return


-def dumpallobjs(out, doc, codec=None):
+def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
    visited = set()
    out.write('<pdf>')
    for xref in doc.xrefs:
@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None):
                out.write('\n</object>\n\n')
            except PDFObjectNotFound as e:
                print('not found: %r' % e)
-    dumptrailers(out, doc)
+    dumptrailers(out, doc, show_fallback_xref)
    out.write('</pdf>')
    return

@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
    return


-def dumppdf(outfp, fname, objids, pagenos, password='',
-            dumpall=False, codec=None, extractdir=None):
+def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
+            codec=None, extractdir=None, show_fallback_xref=False):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
-        dumpallobjs(outfp, doc, codec=codec)
+        dumpallobjs(outfp, doc, codec, show_fallback_xref)
    if (not objids) and (not pagenos) and (not dumpall):
-        dumptrailers(outfp, doc)
+        dumptrailers(outfp, doc, show_fallback_xref)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
@ -274,6 +283,11 @@ def create_parser():
    parse_params.add_argument(
        '--all', '-a', default=False, action='store_true',
        help='If the structure of all objects should be extracted')
+    parse_params.add_argument(
+        '--show-fallback-xref', action='store_true',
+        help='Additionally show the fallback xref. Use this if the PDF '
+             'has zero or only invalid xref\'s. This setting is ignored if '
+             '--extract-toc or --extract-embedded is used.')
    parse_params.add_argument(
        '--password', '-P', type=str, default='',
        help='The password to use for decrypting PDF file.')
@ -333,19 +347,24 @@ def main(argv=None):
    else:
        codec = None

-    if args.extract_toc:
-        extractdir = None
-        proc = dumpoutline
-    elif args.extract_embedded:
-        extractdir = args.extract_embedded
-        proc = extractembedded
-    else:
-        extractdir = None
-        proc = dumppdf
-
    for fname in args.files:
-        proc(outfp, fname, objids, pagenos, password=password,
-             dumpall=args.all, codec=codec, extractdir=extractdir)
+        if args.extract_toc:
+            dumpoutline(
+                outfp, fname, objids, pagenos, password=password,
+                dumpall=args.all, codec=codec, extractdir=None
+            )
+        elif args.extract_embedded:
+            extractembedded(
+                outfp, fname, objids, pagenos, password=password,
+                dumpall=args.all, codec=codec, extractdir=args.extract_embedded
+            )
+        else:
+            dumppdf(
+                outfp, fname, objids, pagenos, password=password,
+                dumpall=args.all, codec=codec, extractdir=None,
+                show_fallback_xref=args.show_fallback_xref
+            )
+
    outfp.close()