From 6e05baf0b7b77f3e0ed9e30d3dfe7808673f1fab Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sat, 23 May 2020 18:04:34 +0200 Subject: [PATCH] Don't dump fallback xref by default when using dumppdf.py, adding a flag to enable it Fixes #176 * Add failing test for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a pdf without xrefs * Raise PDFNoValidXRef with explanation if dumppdf.py is called on a pdf that does not have an xref * Use warning instead of error, because not outputting xrefs is just fine (there aren't any) but it is something the user should know * Adding changelog * Extend help message --- CHANGELOG.md | 7 +++- pdfminer/pdfdocument.py | 5 ++- tests/test_tools_dumppdf.py | 22 ++++++++++--- tools/dumppdf.py | 65 ++++++++++++++++++++++++------------- 4 files changed, 70 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb90242..6606198 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+## [Unreleased] + +### Changed +- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431)) + ## [20200517] ### Added -- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408) +- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408)) ### Fixed - Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411)) diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index d8be5c6..71852ee 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \ from .pdfparser import PDFSyntaxError, PDFStreamParser from .utils import choplist, nunpack, decode_text - log = logging.getLogger(__name__) @@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError): pass +class PDFNoValidXRefWarning(SyntaxWarning): + pass + + class PDFNoOutlines(PDFException): pass diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index bee7e3b..34b8bfe 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -1,6 +1,8 @@ +import warnings from tempfile import NamedTemporaryFile from helpers import absolute_sample_path +from pdfminer.pdfdocument import PDFNoValidXRefWarning from tools import dumppdf @@ -16,11 +18,23 @@ def run(filename, options=None): class TestDumpPDF(): - def test_1(self): - run('jo.pdf', '-t -a') - run('simple1.pdf', '-t -a') + def test_simple1(self): + """dumppdf.py simple1.pdf raises a warning because it has no xref""" + with warnings.catch_warnings(record=True) as ws: + run('simple1.pdf', '-t -a') + assert any(w.category == PDFNoValidXRefWarning for w in ws) + + def test_simple2(self): run('simple2.pdf', '-t -a') - run('simple3.pdf', '-t -a') + + def test_jo(self): + run('jo.pdf', '-t -a') + + def test_simple3(self): + """dumppdf.py 
simple3.pdf raises a warning because it has no xref""" + with warnings.catch_warnings(record=True) as ws: + run('simple3.pdf', '-t -a') + assert any(w.category == PDFNoValidXRefWarning for w in ws) def test_2(self): run('nonfree/dmca.pdf', '-t -a') diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 7e2808c..0aa7b45 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -4,10 +4,12 @@ import logging import os.path import re import sys +import warnings from argparse import ArgumentParser import pdfminer -from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines +from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \ + PDFNoValidXRefWarning from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError @@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None): raise TypeError(obj) -def dumptrailers(out, doc): +def dumptrailers(out, doc, show_fallback_xref=False): for xref in doc.xrefs: - out.write('\n') - dumpxml(out, xref.trailer) - out.write('\n\n\n') + if not isinstance(xref, PDFXRefFallback) or show_fallback_xref: + out.write('\n') + dumpxml(out, xref.trailer) + out.write('\n\n\n') + no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs) + if no_xrefs and not show_fallback_xref: + msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \ + 'you want to display the content of a fallback xref that ' \ + 'contains all objects.' 
+ warnings.warn(msg, PDFNoValidXRefWarning) return -def dumpallobjs(out, doc, codec=None): +def dumpallobjs(out, doc, codec=None, show_fallback_xref=False): visited = set() out.write('') for xref in doc.xrefs: @@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None): out.write('\n\n\n') except PDFObjectNotFound as e: print('not found: %r' % e) - dumptrailers(out, doc) + dumptrailers(out, doc, show_fallback_xref) out.write('') return @@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='', return -def dumppdf(outfp, fname, objids, pagenos, password='', - dumpall=False, codec=None, extractdir=None): +def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, + codec=None, extractdir=None, show_fallback_xref=False): fp = open(fname, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser, password) @@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='', else: dumpxml(outfp, page.attrs) if dumpall: - dumpallobjs(outfp, doc, codec=codec) + dumpallobjs(outfp, doc, codec, show_fallback_xref) if (not objids) and (not pagenos) and (not dumpall): - dumptrailers(outfp, doc) + dumptrailers(outfp, doc, show_fallback_xref) fp.close() if codec not in ('raw', 'binary'): outfp.write('\n') @@ -274,6 +283,11 @@ def create_parser(): parse_params.add_argument( '--all', '-a', default=False, action='store_true', help='If the structure of all objects should be extracted') + parse_params.add_argument( + '--show-fallback-xref', action='store_true', + help='Additionally show the fallback xref. Use this if the PDF ' + 'has zero or only invalid xref\'s. 
This setting is ignored if ' + '--extract-toc or --extract-embedded is used.') parse_params.add_argument( '--password', '-P', type=str, default='', help='The password to use for decrypting PDF file.') @@ -333,19 +347,24 @@ def main(argv=None): else: codec = None - if args.extract_toc: - extractdir = None - proc = dumpoutline - elif args.extract_embedded: - extractdir = args.extract_embedded - proc = extractembedded - else: - extractdir = None - proc = dumppdf - for fname in args.files: - proc(outfp, fname, objids, pagenos, password=password, - dumpall=args.all, codec=codec, extractdir=extractdir) + if args.extract_toc: + dumpoutline( + outfp, fname, objids, pagenos, password=password, + dumpall=args.all, codec=codec, extractdir=None + ) + elif args.extract_embedded: + extractembedded( + outfp, fname, objids, pagenos, password=password, + dumpall=args.all, codec=codec, extractdir=args.extract_embedded + ) + else: + dumppdf( + outfp, fname, objids, pagenos, password=password, + dumpall=args.all, codec=codec, extractdir=None, + show_fallback_xref=args.show_fallback_xref + ) + outfp.close()