Don't dump the fallback xref by default when using dumppdf.py; add a flag to enable it

Fixes #176 

* Add failing tests for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a PDF without xrefs

* Raise PDFNoValidXRef with an explanation if dumppdf.py is called on a PDF that does not have an xref

* Use a warning instead of an error, because not outputting xrefs is fine (there aren't any), but it is something the user should know about

* Adding changelog

* Extend help message
pull/436/head
Pieter Marsman 2020-05-23 18:04:34 +02:00 committed by GitHub
parent 33b60dfd54
commit 6e05baf0b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 70 additions and 29 deletions

View File

@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
## [20200517] ## [20200517]
### Added ### Added
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408) - Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408)
### Fixed ### Fixed
- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411)) - Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))

View File

@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
from .pdfparser import PDFSyntaxError, PDFStreamParser from .pdfparser import PDFSyntaxError, PDFStreamParser
from .utils import choplist, nunpack, decode_text from .utils import choplist, nunpack, decode_text
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError):
pass pass
class PDFNoValidXRefWarning(SyntaxWarning):
pass
class PDFNoOutlines(PDFException): class PDFNoOutlines(PDFException):
pass pass

View File

@ -1,6 +1,8 @@
import warnings
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from helpers import absolute_sample_path from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFNoValidXRefWarning
from tools import dumppdf from tools import dumppdf
@ -16,11 +18,23 @@ def run(filename, options=None):
class TestDumpPDF(): class TestDumpPDF():
def test_1(self): def test_simple1(self):
run('jo.pdf', '-t -a') """dumppdf.py simple1.pdf raises a warning because it has no xref"""
run('simple1.pdf', '-t -a') with warnings.catch_warnings(record=True) as ws:
run('simple1.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_simple2(self):
run('simple2.pdf', '-t -a') run('simple2.pdf', '-t -a')
run('simple3.pdf', '-t -a')
def test_jo(self):
run('jo.pdf', '-t -a')
def test_simple3(self):
"""dumppdf.py simple3.pdf raises a warning because it has no xref"""
with warnings.catch_warnings(record=True) as ws:
run('simple3.pdf', '-t -a')
assert any(w.category == PDFNoValidXRefWarning for w in ws)
def test_2(self): def test_2(self):
run('nonfree/dmca.pdf', '-t -a') run('nonfree/dmca.pdf', '-t -a')

View File

@ -4,10 +4,12 @@ import logging
import os.path import os.path
import re import re
import sys import sys
import warnings
from argparse import ArgumentParser from argparse import ArgumentParser
import pdfminer import pdfminer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
PDFNoValidXRefWarning
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None):
raise TypeError(obj) raise TypeError(obj)
def dumptrailers(out, doc): def dumptrailers(out, doc, show_fallback_xref=False):
for xref in doc.xrefs: for xref in doc.xrefs:
out.write('<trailer>\n') if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
dumpxml(out, xref.trailer) out.write('<trailer>\n')
out.write('\n</trailer>\n\n') dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n')
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
if no_xrefs and not show_fallback_xref:
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
'you want to display the content of a fallback xref that ' \
'contains all objects.'
warnings.warn(msg, PDFNoValidXRefWarning)
return return
def dumpallobjs(out, doc, codec=None): def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
visited = set() visited = set()
out.write('<pdf>') out.write('<pdf>')
for xref in doc.xrefs: for xref in doc.xrefs:
@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None):
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except PDFObjectNotFound as e: except PDFObjectNotFound as e:
print('not found: %r' % e) print('not found: %r' % e)
dumptrailers(out, doc) dumptrailers(out, doc, show_fallback_xref)
out.write('</pdf>') out.write('</pdf>')
return return
@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
return return
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
dumpall=False, codec=None, extractdir=None): codec=None, extractdir=None, show_fallback_xref=False):
fp = open(fname, 'rb') fp = open(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
else: else:
dumpxml(outfp, page.attrs) dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpallobjs(outfp, doc, codec=codec) dumpallobjs(outfp, doc, codec, show_fallback_xref)
if (not objids) and (not pagenos) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc, show_fallback_xref)
fp.close() fp.close()
if codec not in ('raw', 'binary'): if codec not in ('raw', 'binary'):
outfp.write('\n') outfp.write('\n')
@ -274,6 +283,11 @@ def create_parser():
parse_params.add_argument( parse_params.add_argument(
'--all', '-a', default=False, action='store_true', '--all', '-a', default=False, action='store_true',
help='If the structure of all objects should be extracted') help='If the structure of all objects should be extracted')
parse_params.add_argument(
'--show-fallback-xref', action='store_true',
help='Additionally show the fallback xref. Use this if the PDF '
'has zero or only invalid xref\'s. This setting is ignored if '
'--extract-toc or --extract-embedded is used.')
parse_params.add_argument( parse_params.add_argument(
'--password', '-P', type=str, default='', '--password', '-P', type=str, default='',
help='The password to use for decrypting PDF file.') help='The password to use for decrypting PDF file.')
@ -333,19 +347,24 @@ def main(argv=None):
else: else:
codec = None codec = None
if args.extract_toc:
extractdir = None
proc = dumpoutline
elif args.extract_embedded:
extractdir = args.extract_embedded
proc = extractembedded
else:
extractdir = None
proc = dumppdf
for fname in args.files: for fname in args.files:
proc(outfp, fname, objids, pagenos, password=password, if args.extract_toc:
dumpall=args.all, codec=codec, extractdir=extractdir) dumpoutline(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None
)
elif args.extract_embedded:
extractembedded(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=args.extract_embedded
)
else:
dumppdf(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None,
show_fallback_xref=args.show_fallback_xref
)
outfp.close() outfp.close()