2020-04-28 08:58:42 +00:00
|
|
|
#!/usr/bin/env python3
|
2019-10-27 20:40:04 +00:00
|
|
|
"""Extract pdf structure in XML format"""
|
|
|
|
import logging
|
|
|
|
import os.path
|
|
|
|
import re
|
|
|
|
import sys
|
2020-05-23 16:04:34 +00:00
|
|
|
import warnings
|
2019-10-27 20:40:04 +00:00
|
|
|
from argparse import ArgumentParser
|
|
|
|
|
2020-05-17 15:48:06 +00:00
|
|
|
import pdfminer
|
2020-05-23 16:04:34 +00:00
|
|
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines, PDFXRefFallback, \
|
|
|
|
PDFNoValidXRefWarning
|
2019-10-27 20:40:04 +00:00
|
|
|
from pdfminer.pdfpage import PDFPage
|
|
|
|
from pdfminer.pdfparser import PDFParser
|
2013-10-18 23:41:01 +00:00
|
|
|
from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
|
2010-01-30 07:30:01 +00:00
|
|
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
2019-10-27 20:40:04 +00:00
|
|
|
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
2014-03-28 08:50:59 +00:00
|
|
|
from pdfminer.utils import isnumber
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2019-11-06 20:47:19 +00:00
|
|
|
# Give the root logger a default handler so that log records emitted while
# this script runs are actually displayed (main() only adjusts the level).
logging.basicConfig()
|
|
|
|
|
2010-04-24 01:34:18 +00:00
|
|
|
# Characters that are never written literally into the XML output: control
# characters, XML markup characters, parentheses, both quote characters, the
# backslash, and everything from DEL (\177) up to \377.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def e(s):
    """Escape *s* for inclusion in the XML output.

    Every character matched by ``ESC_PAT`` is replaced by its decimal
    character reference (``&#NN;``).  ``bytes`` input is first decoded as
    latin-1, so each byte maps to the code point with the same value.

    :param s: text (``str``) or raw data (``bytes``) to escape.
    :return: the escaped text as ``str``.
    """
    text = s.decode('latin-1') if isinstance(s, bytes) else s

    def as_charref(match):
        # The pattern is a character class, so each match is one character.
        return '&#%d;' % ord(match.group(0))

    return ESC_PAT.sub(as_charref, text)
|
2014-09-03 11:17:41 +00:00
|
|
|
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2008-04-27 09:34:48 +00:00
|
|
|
def _xml_text(value):
    """Return *value* escaped so it can be embedded in the XML output.

    ``str`` and ``bytes`` go straight through ``e()`` (which decodes bytes
    as latin-1); anything else is converted with ``str()`` first.
    """
    if isinstance(value, (str, bytes)):
        return e(value)
    return e(str(value))


def _write_stream_payload(out, payload):
    """Write the ``bytes`` of a stream to *out*.

    A binary file accepts the bytes directly.  A text file rejects them
    with ``TypeError``; in that case the bytes are written to the
    underlying binary buffer, after flushing pending text so the output
    order is preserved.  If there is no such buffer the ``TypeError`` is
    re-raised.
    """
    try:
        out.write(payload)
    except TypeError:
        binary_out = getattr(out, 'buffer', None)
        if binary_out is None:
            raise
        out.flush()
        binary_out.write(payload)


def dumpxml(out, obj, codec=None):
    """Serialize a PDF object as XML and write it to *out*.

    :param out: writable file object receiving the output.
    :param obj: ``None``, ``dict``, ``list``, ``str``/``bytes``,
        ``PDFStream``, ``PDFObjRef``, ``PSKeyword``, ``PSLiteral`` or a
        number.  Dicts and lists are dumped recursively.
    :param codec: only affects ``PDFStream`` objects: ``'raw'`` writes the
        result of ``get_rawdata()`` and ``'binary'`` the result of
        ``get_data()`` with no XML around them; ``'text'`` writes the
        stream properties plus the escaped data; anything else writes the
        stream properties only.  It is not passed on to nested objects.
    :raises TypeError: if *obj* is of none of the supported types.
    """
    if obj is None:
        out.write('<null />')
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            # Keys are escaped like every other piece of text; an unescaped
            # '<' or '&' in a key would make the document ill-formed.
            out.write('<key>%s</key>\n' % _xml_text(k))
            out.write('<value>')
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return

    if isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            _write_stream_payload(out, obj.get_rawdata())
        elif codec == 'binary':
            _write_stream_payload(out, obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n'
                          % (len(data), e(data)))
            out.write('</stream>')
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        # Escaping also copes with a bytes name, which plain %s formatting
        # would render as "b'...'".
        out.write('<keyword>%s</keyword>' % _xml_text(obj.name))
        return

    if isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % _xml_text(obj.name))
        return

    if isnumber(obj):
        out.write('<number>%s</number>' % obj)
        return

    raise TypeError(obj)
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
2020-05-23 16:04:34 +00:00
|
|
|
def dumptrailers(out, doc, show_fallback_xref=False):
    """Write the trailer of each xref of *doc* as a ``<trailer>`` element.

    Trailers belonging to a ``PDFXRefFallback`` are skipped unless
    *show_fallback_xref* is true.  If every xref is a fallback and they
    are being skipped, a ``PDFNoValidXRefWarning`` is issued instead.

    :param out: writable file object receiving the XML.
    :param doc: document object; only its ``xrefs`` attribute is read here.
    :param show_fallback_xref: also dump trailers of fallback xrefs.
    """
    only_fallbacks = True
    for xref in doc.xrefs:
        is_fallback = isinstance(xref, PDFXRefFallback)
        only_fallbacks = only_fallbacks and is_fallback
        if is_fallback and not show_fallback_xref:
            continue
        out.write('<trailer>\n')
        dumpxml(out, xref.trailer)
        out.write('\n</trailer>\n\n')

    if only_fallbacks and not show_fallback_xref:
        # Nothing was written above, so tell the user how to get output.
        warnings.warn(
            'This PDF does not have an xref. Use --show-fallback-xref if '
            'you want to display the content of a fallback xref that '
            'contains all objects.',
            PDFNoValidXRefWarning)
    return
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
2020-05-23 16:04:34 +00:00
|
|
|
def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
    """Write every object of *doc*, followed by its trailers, as XML.

    The output is wrapped in a ``<pdf>`` element.  Each object id listed by
    the document's xrefs is dumped at most once; ids whose object is
    ``None`` are skipped.

    :param out: writable file object receiving the XML.
    :param doc: document object providing ``xrefs`` and ``getobj()``.
    :param codec: stream codec, passed on to ``dumpxml``.
    :param show_fallback_xref: passed on to ``dumptrailers``.
    """
    visited = set()
    out.write('<pdf>')
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, obj, codec=codec)
                out.write('\n</object>\n\n')
            except PDFObjectNotFound as not_found:
                # Report on stderr: *out* may be stdout, and a diagnostic
                # line in the middle of the XML would corrupt it.  The name
                # also avoids shadowing the module-level e() helper.
                print('not found: %r' % not_found, file=sys.stderr)
    dumptrailers(out, doc, show_fallback_xref)
    out.write('</pdf>')
    return
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
2008-07-09 15:15:32 +00:00
|
|
|
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (table of contents) of a PDF file as XML.

    Each outline entry becomes an ``<outline>`` element carrying its level
    and title, with the resolved destination and the 1-based page number
    when they can be determined.  Nothing is written if the document
    raises ``PDFNoOutlines``.

    :param outfp: writable text file object receiving the XML.
    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param objids, pagenos, dumpall, codec, extractdir: accepted so that
        the signature matches the other procedures; not used here.
    """
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            # Map page object ids to 1-based page numbers.
            pages = {page.pageid: pageno for (pageno, page)
                     in enumerate(PDFPage.create_pages(doc), 1)}

            def resolve_dest(dest):
                # NOTE(review): bytes are looked up as named destinations
                # too; previously a bytes destination fell through
                # unresolved and crashed at ``dest[0].objid`` below.
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' \
                                    and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # Keep the title a str: formatting bytes into the
                    # element would emit title="b'...'".  Characters that
                    # e() leaves alone (above \377) become character
                    # references, so the output is pure ASCII.
                    s = e(title).encode('ascii', 'xmlcharrefreplace') \
                        .decode('ascii')
                    outfp.write('<outline level="{!r}" title="{}">\n'
                                .format(level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines simply produces no output.
                pass
        finally:
            # Close the parser even when dumping fails; the with-statement
            # takes care of the file itself.
            parser.close()
    return
|
2008-07-09 15:15:32 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
2013-10-18 23:41:01 +00:00
|
|
|
# Values of the 'Type' entry that identify file specification dictionaries
# and embedded file streams; compared by identity below.
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')


def _filespec_name(value):
    """Return a file name entry of a Filespec dictionary as text.

    ``bytes`` starting with the UTF-16BE byte order mark are decoded as
    UTF-16BE; other ``bytes`` are decoded as UTF-8, falling back to latin-1
    when that fails.  Any other value is returned unchanged.
    """
    if isinstance(value, bytes):
        if value.startswith(b'\xfe\xff'):
            return value[2:].decode('utf-16-be', 'replace')
        try:
            return value.decode()
        except UnicodeDecodeError:
            return value.decode('latin-1')
    return value


def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract the files embedded in a PDF into *extractdir*.

    Every dictionary object whose ``Type`` is ``Filespec`` is extracted to
    ``<extractdir>/<objid>-<filename>`` (object id zero-padded to six
    digits).  The directory is created if needed.

    :param fname: path of the PDF file to read.
    :param password: password handed to ``PDFDocument``.
    :param extractdir: directory the files are written to.
    :param outfp, objids, pagenos, dumpall, codec: accepted so that the
        signature matches the other procedures; not used here.
    :raises PDFValueError: if a Filespec has no file name, or its file
        reference is not an EmbeddedFile ``PDFStream``.
    :raises IOError: if the target file already exists.
    """
    def extract1(objid, obj):
        raw_name = obj.get('UF') or obj.get('F')
        if raw_name is None:
            raise PDFValueError(
                'unable to process PDF: Filespec %r has no file name'
                % objid)
        # Decode bytes names; formatting bytes into the path below would
        # produce a file literally called "b'name'".  basename() drops any
        # directory part stored in the PDF.
        filename = os.path.basename(_filespec_name(raw_name))
        fileref = obj['EF'].get('UF') or obj['EF'].get('F')
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = 'unable to process PDF: reference for %r is not a ' \
                        'PDFStream' % filename
            raise PDFValueError(error_msg)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r '
                'is not an EmbeddedFile' % (filename))
        path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The with-statement closes the file even if decoding the stream
        # or writing it fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if objid not in extracted_objids and isinstance(obj, dict) \
                        and obj.get('Type') is LITERAL_FILESPEC:
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
|
2013-01-20 03:23:58 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
2020-05-23 16:04:34 +00:00
|
|
|
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None, show_fallback_xref=False):
    """Dump selected parts of a PDF file to *outfp*.

    The selections are independent and are processed in this order:
    the objects in *objids*, the pages in *pagenos*, then all objects if
    *dumpall* is true.  If nothing is selected, only the trailers are
    dumped.

    :param outfp: writable file object receiving the output.
    :param fname: path of the PDF file to read.
    :param objids: iterable of object ids to dump.
    :param pagenos: collection of page indexes, compared against the
        zero-based position of each page in the document.
    :param password: password handed to ``PDFDocument``.
    :param dumpall: dump every object (see ``dumpallobjs``).
    :param codec: stream codec passed to ``dumpxml``.  For a selected page
        a truthy codec dumps the page's content streams, otherwise the
        page attributes are dumped.
    :param extractdir: accepted so that the signature matches the other
        procedures; not used here.
    :param show_fallback_xref: passed on to ``dumpallobjs`` and
        ``dumptrailers``.
    """
    # The with-statement closes the input file even if dumping fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec, show_fallback_xref)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc, show_fallback_xref)
    # Raw and binary dumps are stream data, so no newline is appended.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
|
2007-12-31 02:40:32 +00:00
|
|
|
|
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
def create_parser():
    """Build the command-line argument parser of this script.

    The options fall into three groups: the procedure to run
    (``--extract-toc`` and ``--extract-embedded`` are mutually exclusive),
    the "Parser" options selecting what is read from the PDF, and the
    "Output" options (the three stream codecs are mutually exclusive).

    :return: the configured ``ArgumentParser``.
    """
    parser = ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument('files', type=str, default=None, nargs='+',
                        help='One or more paths to PDF files.')

    parser.add_argument(
        "--version", "-v", action="version",
        version="pdfminer.six v{}".format(pdfminer.__version__))
    parser.add_argument(
        '--debug', '-d', default=False, action='store_true',
        help='Use debug logging level.')
    procedure_parser = parser.add_mutually_exclusive_group()
    procedure_parser.add_argument(
        '--extract-toc', '-T', default=False, action='store_true',
        help='Extract structure of outline')
    procedure_parser.add_argument(
        '--extract-embedded', '-E', type=str,
        help='Extract embedded files')

    parse_params = parser.add_argument_group(
        'Parser', description='Used during PDF parsing')
    parse_params.add_argument(
        '--page-numbers', type=int, default=None, nargs='+',
        help='A space-separated list of page numbers to parse.')
    parse_params.add_argument(
        '--pagenos', '-p', type=str,
        help='A comma-separated list of page numbers to parse. Included for '
             'legacy applications, use --page-numbers for more idiomatic '
             'argument entry.')
    parse_params.add_argument(
        '--objects', '-i', type=str,
        help='Comma separated list of object numbers to extract')
    parse_params.add_argument(
        '--all', '-a', default=False, action='store_true',
        help='If the structure of all objects should be extracted')
    parse_params.add_argument(
        '--show-fallback-xref', action='store_true',
        help='Additionally show the fallback xref. Use this if the PDF '
             'has zero or only invalid xref\'s. This setting is ignored if '
             '--extract-toc or --extract-embedded is used.')
    parse_params.add_argument(
        '--password', '-P', type=str, default='',
        help='The password to use for decrypting PDF file.')

    output_params = parser.add_argument_group(
        'Output', description='Used during output generation.')
    output_params.add_argument(
        '--outfile', '-o', type=str, default='-',
        help='Path to file where output is written. Or "-" (default) to '
             'write to stdout.')
    codec_parser = output_params.add_mutually_exclusive_group()
    codec_parser.add_argument(
        '--raw-stream', '-r', default=False, action='store_true',
        help='Write stream objects without encoding')
    codec_parser.add_argument(
        '--binary-stream', '-b', default=False, action='store_true',
        help='Write stream objects with binary encoding')
    codec_parser.add_argument(
        '--text-stream', '-t', default=False, action='store_true',
        help='Write stream objects as plain text')

    return parser
|
|
|
|
|
2014-09-03 11:17:41 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
def main(argv=None):
    """Run the command-line tool.

    Parses the arguments, then runs the selected procedure
    (``dumpoutline``, ``extractembedded`` or ``dumppdf``) on every input
    file, writing to stdout or to the file given with ``--outfile``.

    :param argv: argument list handed to ``parse_args``; ``None`` makes
        argparse read the process arguments.
    :return: ``None``.
    """
    parser = create_parser()
    args = parser.parse_args(args=argv)

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.objects:
        objids = [int(x) for x in args.objects.split(',')]
    else:
        objids = []

    # Page numbers are 1-based on the command line and 0-based internally.
    if args.page_numbers:
        pagenos = {x - 1 for x in args.page_numbers}
    elif args.pagenos:
        pagenos = {int(x) - 1 for x in args.pagenos.split(',')}
    else:
        pagenos = set()

    password = args.password

    if args.raw_stream:
        codec = 'raw'
    elif args.binary_stream:
        codec = 'binary'
    elif args.text_stream:
        codec = 'text'
    else:
        codec = None

    # The output file is opened only after the argument values have been
    # validated, so a malformed --objects/--pagenos does not truncate it.
    to_stdout = args.outfile == '-'
    outfp = sys.stdout if to_stdout else open(args.outfile, 'w')
    try:
        for fname in args.files:
            if args.extract_toc:
                dumpoutline(
                    outfp, fname, objids, pagenos, password=password,
                    dumpall=args.all, codec=codec, extractdir=None
                )
            elif args.extract_embedded:
                extractembedded(
                    outfp, fname, objids, pagenos, password=password,
                    dumpall=args.all, codec=codec,
                    extractdir=args.extract_embedded
                )
            else:
                dumppdf(
                    outfp, fname, objids, pagenos, password=password,
                    dumpall=args.all, codec=codec, extractdir=None,
                    show_fallback_xref=args.show_fallback_xref
                )
    finally:
        if to_stdout:
            # stdout belongs to the process: closing it would break any
            # later output of a program that calls main() as a function.
            outfp.flush()
        else:
            outfp.close()
|
2007-12-31 02:40:32 +00:00
|
|
|
|
2019-10-27 20:40:04 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # main() has no return statement, so the status is None, which
    # sys.exit() treats as a successful exit.
    exit_status = main()
    sys.exit(exit_status)
|