Don't dump the fallback xref by default when using dumppdf.py; add a flag to enable it
Fixes #176 * Add failing test for dumping simple1.pdf and simple3.pdf, because they should raise an error when dumppdf.py tries to dump a PDF without xrefs * Raise PDFNoValidXRef with an explanation if dumppdf.py is called on a PDF that does not have an xref * Use a warning instead of an error, because not outputting xrefs is just fine (there aren't any), but it is something the user should know * Add changelog * Extend help message
pull/436/head
parent
33b60dfd54
commit
6e05baf0b7
|
@ -3,10 +3,15 @@ All notable changes in pdfminer.six will be documented in this file.
|
||||||
|
|
||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
## Changed
|
||||||
|
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
|
||||||
|
|
||||||
## [20200517]
|
## [20200517]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- Python3 shebang line to script in tools ([408](https://github.com/pdfminer/pdfminer.six/pull/408)
|
- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408)
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))
|
- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#411](https://github.com/pdfminer/pdfminer.six/issues/411))
|
||||||
|
|
|
@ -17,7 +17,6 @@ from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
||||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
||||||
from .utils import choplist, nunpack, decode_text
|
from .utils import choplist, nunpack, decode_text
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,6 +24,10 @@ class PDFNoValidXRef(PDFSyntaxError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFNoValidXRefWarning(SyntaxWarning):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PDFNoOutlines(PDFException):
|
class PDFNoOutlines(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
import warnings
|
||||||
from tempfile import NamedTemporaryFile
|
from tempfile import NamedTemporaryFile
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
|
from pdfminer.pdfdocument import PDFNoValidXRefWarning
|
||||||
from tools import dumppdf
|
from tools import dumppdf
|
||||||
|
|
||||||
|
|
||||||
|
@ -16,11 +18,23 @@ def run(filename, options=None):
|
||||||
|
|
||||||
|
|
||||||
class TestDumpPDF():
|
class TestDumpPDF():
|
||||||
def test_1(self):
|
def test_simple1(self):
|
||||||
run('jo.pdf', '-t -a')
|
"""dumppdf.py simple1.pdf raises a warning because it has no xref"""
|
||||||
run('simple1.pdf', '-t -a')
|
with warnings.catch_warnings(record=True) as ws:
|
||||||
|
run('simple1.pdf', '-t -a')
|
||||||
|
assert any(w.category == PDFNoValidXRefWarning for w in ws)
|
||||||
|
|
||||||
|
def test_simple2(self):
|
||||||
run('simple2.pdf', '-t -a')
|
run('simple2.pdf', '-t -a')
|
||||||
run('simple3.pdf', '-t -a')
|
|
||||||
|
def test_jo(self):
|
||||||
|
run('jo.pdf', '-t -a')
|
||||||
|
|
||||||
|
def test_simple3(self):
|
||||||
|
"""dumppdf.py simple3.pdf raises a warning because it has no xref"""
|
||||||
|
with warnings.catch_warnings(record=True) as ws:
|
||||||
|
run('simple3.pdf', '-t -a')
|
||||||
|
assert any(w.category == PDFNoValidXRefWarning for w in ws)
|
||||||
|
|
||||||
def test_2(self):
|
def test_2(self):
|
||||||
run('nonfree/dmca.pdf', '-t -a')
|
run('nonfree/dmca.pdf', '-t -a')
|
||||||
|
|
|
@ -4,10 +4,12 @@ import logging
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import warnings
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
import pdfminer
|
import pdfminer
|
||||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
|
||||||
|
PDFNoValidXRefWarning
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
|
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
|
||||||
|
@ -87,15 +89,22 @@ def dumpxml(out, obj, codec=None):
|
||||||
raise TypeError(obj)
|
raise TypeError(obj)
|
||||||
|
|
||||||
|
|
||||||
def dumptrailers(out, doc):
|
def dumptrailers(out, doc, show_fallback_xref=False):
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
out.write('<trailer>\n')
|
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
|
||||||
dumpxml(out, xref.trailer)
|
out.write('<trailer>\n')
|
||||||
out.write('\n</trailer>\n\n')
|
dumpxml(out, xref.trailer)
|
||||||
|
out.write('\n</trailer>\n\n')
|
||||||
|
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
|
||||||
|
if no_xrefs and not show_fallback_xref:
|
||||||
|
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
|
||||||
|
'you want to display the content of a fallback xref that ' \
|
||||||
|
'contains all objects.'
|
||||||
|
warnings.warn(msg, PDFNoValidXRefWarning)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def dumpallobjs(out, doc, codec=None):
|
def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
|
||||||
visited = set()
|
visited = set()
|
||||||
out.write('<pdf>')
|
out.write('<pdf>')
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
|
@ -112,7 +121,7 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except PDFObjectNotFound as e:
|
except PDFObjectNotFound as e:
|
||||||
print('not found: %r' % e)
|
print('not found: %r' % e)
|
||||||
dumptrailers(out, doc)
|
dumptrailers(out, doc, show_fallback_xref)
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -211,8 +220,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
|
||||||
dumpall=False, codec=None, extractdir=None):
|
codec=None, extractdir=None, show_fallback_xref=False):
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
|
@ -230,9 +239,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
else:
|
else:
|
||||||
dumpxml(outfp, page.attrs)
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpallobjs(outfp, doc, codec=codec)
|
dumpallobjs(outfp, doc, codec, show_fallback_xref)
|
||||||
if (not objids) and (not pagenos) and (not dumpall):
|
if (not objids) and (not pagenos) and (not dumpall):
|
||||||
dumptrailers(outfp, doc)
|
dumptrailers(outfp, doc, show_fallback_xref)
|
||||||
fp.close()
|
fp.close()
|
||||||
if codec not in ('raw', 'binary'):
|
if codec not in ('raw', 'binary'):
|
||||||
outfp.write('\n')
|
outfp.write('\n')
|
||||||
|
@ -274,6 +283,11 @@ def create_parser():
|
||||||
parse_params.add_argument(
|
parse_params.add_argument(
|
||||||
'--all', '-a', default=False, action='store_true',
|
'--all', '-a', default=False, action='store_true',
|
||||||
help='If the structure of all objects should be extracted')
|
help='If the structure of all objects should be extracted')
|
||||||
|
parse_params.add_argument(
|
||||||
|
'--show-fallback-xref', action='store_true',
|
||||||
|
help='Additionally show the fallback xref. Use this if the PDF '
|
||||||
|
'has zero or only invalid xref\'s. This setting is ignored if '
|
||||||
|
'--extract-toc or --extract-embedded is used.')
|
||||||
parse_params.add_argument(
|
parse_params.add_argument(
|
||||||
'--password', '-P', type=str, default='',
|
'--password', '-P', type=str, default='',
|
||||||
help='The password to use for decrypting PDF file.')
|
help='The password to use for decrypting PDF file.')
|
||||||
|
@ -333,19 +347,24 @@ def main(argv=None):
|
||||||
else:
|
else:
|
||||||
codec = None
|
codec = None
|
||||||
|
|
||||||
if args.extract_toc:
|
|
||||||
extractdir = None
|
|
||||||
proc = dumpoutline
|
|
||||||
elif args.extract_embedded:
|
|
||||||
extractdir = args.extract_embedded
|
|
||||||
proc = extractembedded
|
|
||||||
else:
|
|
||||||
extractdir = None
|
|
||||||
proc = dumppdf
|
|
||||||
|
|
||||||
for fname in args.files:
|
for fname in args.files:
|
||||||
proc(outfp, fname, objids, pagenos, password=password,
|
if args.extract_toc:
|
||||||
dumpall=args.all, codec=codec, extractdir=extractdir)
|
dumpoutline(
|
||||||
|
outfp, fname, objids, pagenos, password=password,
|
||||||
|
dumpall=args.all, codec=codec, extractdir=None
|
||||||
|
)
|
||||||
|
elif args.extract_embedded:
|
||||||
|
extractembedded(
|
||||||
|
outfp, fname, objids, pagenos, password=password,
|
||||||
|
dumpall=args.all, codec=codec, extractdir=args.extract_embedded
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
dumppdf(
|
||||||
|
outfp, fname, objids, pagenos, password=password,
|
||||||
|
dumpall=args.all, codec=codec, extractdir=None,
|
||||||
|
show_fallback_xref=args.show_fallback_xref
|
||||||
|
)
|
||||||
|
|
||||||
outfp.close()
|
outfp.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue