# pdfminer.six/tools/pdf2txt.py

"""A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags."""
import argparse
import logging
import sys
import six

import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter

# Configure the logging system with its defaults as an import-time side effect
# of this module (no level, format or handler arguments are supplied here).
logging.basicConfig()


def extract_text(files=None, outfile='-',
                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
                 word_margin=None, char_margin=None, line_margin=None,  # LAParams
                 boxes_flow=None,  # LAParams
                 output_type='text', codec='utf-8', strip_control=False,
                 maxpages=0, page_numbers=None, password="", scale=1.0,
                 rotation=0, layoutmode='normal', output_dir=None, debug=False,
                 disable_caching=False, **kwargs):
    """Extract the content of one or more PDF files into a single output stream.

    Every file in ``files`` is opened in binary mode and handed to
    ``pdfminer.high_level.extract_text_to_fp`` together with *all* local
    variables of this function (see the note next to that call).

    :param files: Paths of the PDF files to process. Must be non-empty.
    :param outfile: Path of the output file, or ``"-"`` to use ``sys.stdout``.
    :param no_laparams: When true, no ``LAParams`` object is created and
        ``laparams`` is passed on as ``None``.
    :param all_texts: Copied onto the ``LAParams`` object when not ``None``.
    :param detect_vertical: Copied onto the ``LAParams`` object when not
        ``None``.
    :param word_margin: Copied onto the ``LAParams`` object when not ``None``.
    :param char_margin: Copied onto the ``LAParams`` object when not ``None``.
    :param line_margin: Copied onto the ``LAParams`` object when not ``None``.
    :param boxes_flow: Copied onto the ``LAParams`` object when not ``None``.
    :param output_type: Requested output type. If it is ``"text"`` and
        ``outfile`` ends in ``.htm``, ``.html``, ``.xml`` or ``.tag`` it is
        replaced by ``"html"``, ``"html"``, ``"xml"`` or ``"tag"``.
    :param codec: Output codec. Replaced by ``"utf-8"`` when writing to a
        ``sys.stdout`` that reports an encoding.
    :param output_dir: When truthy, an ``ImageWriter`` is created for it.
    :param strip_control: Forwarded unchanged to ``extract_text_to_fp``.
    :param maxpages: Forwarded unchanged to ``extract_text_to_fp``.
    :param page_numbers: Forwarded unchanged to ``extract_text_to_fp``.
    :param password: Forwarded unchanged to ``extract_text_to_fp``.
    :param scale: Forwarded unchanged to ``extract_text_to_fp``.
    :param rotation: Forwarded unchanged to ``extract_text_to_fp``.
    :param layoutmode: Forwarded unchanged to ``extract_text_to_fp``.
    :param debug: Forwarded unchanged to ``extract_text_to_fp``.
    :param disable_caching: Forwarded unchanged to ``extract_text_to_fp``.
    :param kwargs: Extra keyword arguments. Only inspected for the
        deprecated ``_py2_no_more_posargs`` key.
    :return: The output stream that was written to: ``sys.stdout`` or the
        file opened for ``outfile``. It is returned open; closing it is the
        caller's responsibility.
    :raises DeprecationWarning: If ``_py2_no_more_posargs`` is passed.
    :raises ValueError: If ``files`` is empty or ``None``.
    """
    if '_py2_no_more_posargs' in kwargs:
        raise DeprecationWarning(
            'The `_py2_no_more_posargs` will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer.six/issues/194')

    if not files:
        raise ValueError("Must provide files to work upon!")

    # Unless layout analysis is disabled, build an LAParams object and
    # override only the settings that were explicitly given (not None).
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param, paramv in (("all_texts", all_texts),
                              ("detect_vertical", detect_vertical),
                              ("word_margin", word_margin),
                              ("char_margin", char_margin),
                              ("line_margin", line_margin),
                              ("boxes_flow", boxes_flow)):
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    # With the default "text" type, infer the real type from the extension
    # of the output file name.
    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                # NOTE: every local variable of this function is forwarded by
                # name. Local names such as outfp, laparams, output_type and
                # codec are therefore part of the call and must not be
                # renamed; the callee is assumed to accept and ignore the
                # names it does not use (it already receives files, fname...).
                pdfminer.high_level.extract_text_to_fp(fp, **locals())
    except BaseException:
        # Nothing is returned on failure, so nobody else can close a file
        # that was opened here. sys.stdout is left alone.
        if outfp is not sys.stdout:
            outfp.close()
        raise
    return outfp


def maketheparser():
    """Build the command line argument parser of this tool.

    The parser takes one or more positional ``files`` plus options grouped
    into "Parser", "Layout analysis" and "Output" sections. The module
    docstring is used as the program description.

    :return: The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument("files", type=str, default=None, nargs="+", help="One or more paths to PDF files.")

    parser.add_argument("--debug", "-d", default=False, action="store_true",
                        help="Use debug logging level.")
    parser.add_argument("--disable-caching", "-C", default=False, action="store_true",
                        help="If caching of resources, such as fonts, should be disabled.")

    parse_params = parser.add_argument_group('Parser', description='Used during PDF parsing')
    parse_params.add_argument("--page-numbers", type=int, default=None, nargs="+",
                              help="A space-separated list of page numbers to parse.")
    parse_params.add_argument("--pagenos", "-p", type=str,
                              help="A comma-separated list of page numbers to parse. Included for legacy applications, "
                                   "use --page-numbers for more idiomatic argument entry.")
    parse_params.add_argument("--maxpages", "-m", type=int, default=0,
                              help="The maximum number of pages to parse.")
    parse_params.add_argument("--password", "-P", type=str, default="",
                              help="The password to use for decrypting PDF file.")
    parse_params.add_argument("--rotation", "-R", default=0, type=int,
                              help="The number of degrees to rotate the PDF before other types of processing.")

    la_params = parser.add_argument_group('Layout analysis', description='Used during layout analysis.')
    la_params.add_argument("--no-laparams", "-n", default=False, action="store_true",
                           help="If layout analysis parameters should be ignored.")
    la_params.add_argument("--detect-vertical", "-V", default=False, action="store_true",
                           help="If vertical text should be considered during layout analysis")
    la_params.add_argument("--char-margin", "-M", type=float, default=2.0,
                           help="If two characters are closer together than this margin they are considered to be part "
                                "of the same word. The margin is specified relative to the width of the character.")
    la_params.add_argument("--word-margin", "-W", type=float, default=0.1,
                           help="If two words are closer together than this margin they are considered to be part "
                                "of the same line. A space is added in between for readability. The margin is "
                                "specified relative to the width of the word.")
    la_params.add_argument("--line-margin", "-L", type=float, default=0.5,
                           help="If two lines are close together they are considered to be part of the same "
                                "paragraph. The margin is specified relative to the height of a line.")
    la_params.add_argument("--boxes-flow", "-F", type=float, default=0.5,
                           help="Specifies how much a horizontal and vertical position of a text matters when "
                                "determining the order of lines. The value should be within the range of -1.0 (only "
                                "horizontal position matters) to +1.0 (only vertical position matters).")
    # A store_true flag has to default to False: with default=True the value
    # is True whether or not -A is given, which makes the option a no-op.
    la_params.add_argument("--all-texts", "-A", default=False, action="store_true",
                           help="If layout analysis should be performed on text in figures.")

    output_params = parser.add_argument_group('Output', description='Used during output generation.')
    output_params.add_argument("--outfile", "-o", type=str, default="-",
                               help="Path to file where output is written. Or \"-\" (default) to write to stdout.")
    output_params.add_argument("--output_type", "-t", type=str, default="text",
                               help="Type of output to generate {text,html,xml,tag}.")
    output_params.add_argument("--codec", "-c", type=str, default="utf-8",
                               help="Text encoding to use in output file.")
    output_params.add_argument("--output-dir", "-O", default=None,
                               help="The output directory to put extracted images in. If not given, images are not "
                                    "extracted.")
    output_params.add_argument("--layoutmode", "-Y", default="normal", type=str,
                               help="Type of layout to use when generating html {normal,exact,loose}. If normal, "
                                    "each line is positioned separately in the html. If exact, each character is "
                                    "positioned separately in the html. If loose, same result as normal but with an "
                                    "additional newline after each text line. Only used when output_type is html.")
    output_params.add_argument("--scale", "-s", type=float, default=1.0,
                               help="The amount of zoom to use when generating html file. Only used when output_type "
                                    "is html.")
    output_params.add_argument("--strip-control", "-S", default=False, action="store_true",
                               help="Remove control statement from text. Only used when output_type is xml.")
    return parser


# main


def main(args=None):
    """Run the command line tool: parse arguments, extract, close the output.

    :param args: Argument list handed to the parser's ``parse_args``;
        ``None`` (the default) is passed through unchanged.
    :return: ``0`` once extraction has finished.
    """
    parsed = maketheparser().parse_args(args=args)

    # Both page options are shifted down by one before extraction
    # (presumably command-line pages are 1-based and pdfminer's are 0-based).
    # When both options are given, --pagenos wins.
    if parsed.page_numbers:
        parsed.page_numbers = {x - 1 for x in parsed.page_numbers}
    if parsed.pagenos:
        parsed.page_numbers = {int(x) - 1 for x in parsed.pagenos.split(",")}

    if six.PY2 and sys.stdin.encoding:
        parsed.password = parsed.password.decode(sys.stdin.encoding)

    # extract_text() creates the ImageWriter, infers the output type from the
    # file extension, selects the codec and opens the output itself, so none
    # of that is repeated here. Opening the output file here as well would
    # only create a second handle that is never closed.
    outfp = extract_text(**vars(parsed))

    if outfp is sys.stdout:
        # stdout belongs to the process, not to this call: flush it, but
        # leave it open for whoever invoked main().
        outfp.flush()
    else:
        outfp.close()
    return 0


# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())