Merge pull request #173 from wlbentley/pdf2txt-maketheparser

Move argparse object setup out of pdf2txt.py main to separate function.
pull/204/head
Tata Ganesh 2018-08-21 23:40:50 +05:30 committed by GitHub
commit 1ea9446bd6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 34 additions and 25 deletions

View File

@ -3,9 +3,10 @@
""" """
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags". Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
""" """
import sys import argparse
import logging import logging
import six import six
import sys
import pdfminer.settings import pdfminer.settings
pdfminer.settings.STRICT = False pdfminer.settings.STRICT = False
import pdfminer.high_level import pdfminer.high_level
@ -62,32 +63,40 @@ def extract_text(files=[], outfile='-',
pdfminer.high_level.extract_text_to_fp(fp, **locals()) pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp return outfp
def maketheparser():
    """Build the command-line argument parser used by ``main``.

    The parser's description is taken from this module's docstring.
    Option names, destinations, types and defaults are kept exactly as
    callers expect them on the parsed namespace.

    Returns:
        argparse.ArgumentParser: a parser accepting one or more input
        files plus the output, page-selection and layout options below.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)

    # Input files and general behaviour.
    # nargs="+" accepts one or more paths, so the help text is plural.
    parser.add_argument(
        "files", type=str, default=None, nargs="+",
        help="Files to process.")
    parser.add_argument(
        "-d", "--debug", default=False, action="store_true",
        help="Debug output.")

    # Page selection.
    parser.add_argument(
        "-p", "--pagenos", type=str,
        help="Comma-separated list of page numbers to parse. "
             "Included for legacy applications, use --page-numbers "
             "for more idiomatic argument entry.")
    parser.add_argument(
        "--page-numbers", type=int, default=None, nargs="+",
        help="Alternative to --pagenos with space-separated numbers; "
             "supersedes --pagenos where it is used.")
    parser.add_argument(
        "-m", "--maxpages", type=int, default=0,
        help="Maximum pages to parse")

    # Decryption and output.
    parser.add_argument(
        "-P", "--password", type=str, default="",
        help="Decryption password for PDF")
    parser.add_argument(
        "-o", "--outfile", type=str, default="-",
        help="Output file (default \"-\" is stdout)")
    parser.add_argument(
        "-t", "--output_type", type=str, default="text",
        help="Output type: text|html|xml|tag (default is text)")
    parser.add_argument(
        "-c", "--codec", type=str, default="utf-8",
        help="Text encoding")
    parser.add_argument(
        "-s", "--scale", type=float, default=1.0,
        help="Scale")

    # Layout-analysis options. These default to None rather than
    # False/0, so an option that was not given stays distinguishable
    # from one that was given explicitly.
    parser.add_argument(
        "-A", "--all-texts", default=None, action="store_true",
        help="LAParams all texts")
    parser.add_argument(
        "-V", "--detect-vertical", default=None, action="store_true",
        help="LAParams detect vertical")
    parser.add_argument(
        "-W", "--word-margin", type=float, default=None,
        help="LAParams word margin")
    parser.add_argument(
        "-M", "--char-margin", type=float, default=None,
        help="LAParams char margin")
    parser.add_argument(
        "-L", "--line-margin", type=float, default=None,
        help="LAParams line margin")
    parser.add_argument(
        "-F", "--boxes-flow", type=float, default=None,
        help="LAParams boxes flow")
    parser.add_argument(
        "-Y", "--layoutmode", default="normal", type=str,
        help="HTML Layout Mode")
    parser.add_argument(
        "-n", "--no-laparams", default=False, action="store_true",
        help="Pass None as LAParams")

    # Miscellaneous conversion switches.
    parser.add_argument(
        "-R", "--rotation", default=0, type=int,
        help="Rotation")
    parser.add_argument(
        "-O", "--output-dir", default=None,
        help="Output directory for images")
    parser.add_argument(
        "-C", "--disable-caching", default=False, action="store_true",
        help="Disable caching")
    parser.add_argument(
        "-S", "--strip-control", default=False, action="store_true",
        help="Strip control in XML mode")

    return parser
# main # main
def main(args=None): def main(args=None):
import argparse
P = argparse.ArgumentParser(description=__doc__) P = maketheparser()
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args) A = P.parse_args(args=args)
if A.page_numbers: if A.page_numbers: