Don't dump the fallback xref by default when using dumppdf.py; add a flag to enable it

Fixes #176 

* Add failing tests for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a PDF without xrefs

* Raise PDFNoValidXRef with explanation if dumppdf.py is called on a pdf that does not have an xref

* Use a warning instead of an error, because not outputting xrefs is fine (there aren't any), but it is something the user should know about

* Adding changelog

* Extend help message
pull/436/head
Pieter Marsman 2020-05-23 18:04:34 +02:00 committed by GitHub
parent 33b60dfd54
commit 6e05baf0b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 70 additions and 29 deletions

View File

@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
## Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
## [20200517]
### Added
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408)
- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408)
### Fixed
- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))

View File

@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .utils import choplist, nunpack, decode_text
log = logging.getLogger(__name__)
@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError):
pass
class PDFNoValidXRefWarning(SyntaxWarning):
pass
class PDFNoOutlines(PDFException):
pass

View File

@ -1,6 +1,8 @@
import warnings
from tempfile import NamedTemporaryFile
from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFNoValidXRefWarning
from tools import dumppdf
@ -16,11 +18,23 @@ def run(filename, options=None):
class TestDumpPDF():
def test_1(self):
run('jo.pdf', '-t -a')
run('simple1.pdf', '-t -a')
def test_simple1(self):
"""dumppdf.py simple1.pdf raises a warning because it has no xref"""
with warnings.catch_warnings(record=True) as ws:
run('simple1.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_simple2(self):
run('simple2.pdf', '-t -a')
run('simple3.pdf', '-t -a')
def test_jo(self):
run('jo.pdf', '-t -a')
def test_simple3(self):
"""dumppdf.py simple3.pdf raises a warning because it has no xref"""
with warnings.catch_warnings(record=True) as ws:
run('simple3.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_2(self):
run('nonfree/dmca.pdf', '-t -a')

View File

@ -4,10 +4,12 @@ import logging
import os.path
import re
import sys
import warnings
from argparse import ArgumentParser
import pdfminer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
PDFNoValidXRefWarning
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None):
raise TypeError(obj)
def dumptrailers(out, doc):
def dumptrailers(out, doc, show_fallback_xref=False):
for xref in doc.xrefs:
out.write('<trailer>\n')
dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n')
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
out.write('<trailer>\n')
dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n')
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
if no_xrefs and not show_fallback_xref:
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
'you want to display the content of a fallback xref that ' \
'contains all objects.'
warnings.warn(msg, PDFNoValidXRefWarning)
return
def dumpallobjs(out, doc, codec=None):
def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
visited = set()
out.write('<pdf>')
for xref in doc.xrefs:
@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None):
out.write('\n</object>\n\n')
except PDFObjectNotFound as e:
print('not found: %r' % e)
dumptrailers(out, doc)
dumptrailers(out, doc, show_fallback_xref)
out.write('</pdf>')
return
@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
return
def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
codec=None, extractdir=None, show_fallback_xref=False):
fp = open(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
else:
dumpxml(outfp, page.attrs)
if dumpall:
dumpallobjs(outfp, doc, codec=codec)
dumpallobjs(outfp, doc, codec, show_fallback_xref)
if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc)
dumptrailers(outfp, doc, show_fallback_xref)
fp.close()
if codec not in ('raw', 'binary'):
outfp.write('\n')
@ -274,6 +283,11 @@ def create_parser():
parse_params.add_argument(
'--all', '-a', default=False, action='store_true',
help='If the structure of all objects should be extracted')
parse_params.add_argument(
'--show-fallback-xref', action='store_true',
help='Additionally show the fallback xref. Use this if the PDF '
'has zero or only invalid xref\'s. This setting is ignored if '
'--extract-toc or --extract-embedded is used.')
parse_params.add_argument(
'--password', '-P', type=str, default='',
help='The password to use for decrypting PDF file.')
@ -333,19 +347,24 @@ def main(argv=None):
else:
codec = None
if args.extract_toc:
extractdir = None
proc = dumpoutline
elif args.extract_embedded:
extractdir = args.extract_embedded
proc = extractembedded
else:
extractdir = None
proc = dumppdf
for fname in args.files:
proc(outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=extractdir)
if args.extract_toc:
dumpoutline(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None
)
elif args.extract_embedded:
extractembedded(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=args.extract_embedded
)
else:
dumppdf(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None,
show_fallback_xref=args.show_fallback_xref
)
outfp.close()