"""A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags.""" import argparse import logging import sys import six import pdfminer.high_level import pdfminer.layout from pdfminer.image import ImageWriter logging.basicConfig() OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag")) def extract_text(files=[], outfile='-', no_laparams=False, all_texts=None, detect_vertical=None, # LAParams word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams output_type='text', codec='utf-8', strip_control=False, maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, layoutmode='normal', output_dir=None, debug=False, disable_caching=False, **kwargs): if '_py2_no_more_posargs' in kwargs is not None: raise DeprecationWarning( 'The `_py2_no_more_posargs will be removed on January, 2020. At ' 'that moment pdfminer.six will stop supporting Python 2. Please ' 'upgrade to Python 3. For more information see ' 'https://github.com/pdfminer/pdfminer .six/issues/194') if not files: raise ValueError("Must provide files to work upon!") # If any LAParams group arguments were passed, create an LAParams object and # populate with given args. Otherwise, set it to None. if not no_laparams: laparams = pdfminer.layout.LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) if paramv is not None: setattr(laparams, param, paramv) else: laparams = None if output_type == "text" and outfile != "-": for override, alttype in OUTPUT_TYPES: if outfile.endswith(override): output_type = alttype if outfile == "-": outfp = sys.stdout if outfp.encoding is not None: codec = 'utf-8' else: outfp = open(outfile, "wb") for fname in files: with open(fname, "rb") as fp: pdfminer.high_level.extract_text_to_fp(fp, **locals()) return outfp def maketheparser(): parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument("files", type=str, default=None, nargs="+", help="One or more paths to PDF files.") parser.add_argument("--debug", "-d", default=False, action="store_true", help="Use debug logging level.") parser.add_argument("--disable-caching", "-C", default=False, action="store_true", help="If caching or resources, such as fonts, should be disabled.") parse_params = parser.add_argument_group('Parser', description='Used during PDF parsing') parse_params.add_argument("--page-numbers", type=int, default=None, nargs="+", help="A space-seperated list of page numbers to parse.") parse_params.add_argument("--pagenos", "-p", type=str, help="A comma-separated list of page numbers to parse. Included for legacy applications, " "use --page-numbers for more idiomatic argument entry.") parse_params.add_argument("--maxpages", "-m", type=int, default=0, help="The maximum number of pages to parse.") parse_params.add_argument("--password", "-P", type=str, default="", help="The password to use for decrypting PDF file.") parse_params.add_argument("--rotation", "-R", default=0, type=int, help="The number of degrees to rotate the PDF before other types of processing.") la_params = parser.add_argument_group('Layout analysis', description='Used during layout analysis.') la_params.add_argument("--no-laparams", "-n", default=False, action="store_true", help="If layout analysis parameters should be ignored.") la_params.add_argument("--detect-vertical", "-V", default=False, action="store_true", help="If vertical text should be considered during layout analysis") la_params.add_argument("--char-margin", "-M", type=float, default=2.0, help="If two characters are closer together than this margin they are considered to be part " "of the same word. The margin is specified relative to the width of the character.") la_params.add_argument("--word-margin", "-W", type=float, default=0.1, help="If two words are are closer together than this margin they are considered to be part " "of the same line. A space is added in between for readability. The margin is " "specified relative to the width of the word.") la_params.add_argument("--line-margin", "-L", type=float, default=0.5, help="If two lines are are close together they are considered to be part of the same " "paragraph. The margin is specified relative to the height of a line.") la_params.add_argument("--boxes-flow", "-F", type=float, default=0.5, help="Specifies how much a horizontal and vertical position of a text matters when " "determining the order of lines. The value should be within the range of -1.0 (only " "horizontal position matters) to +1.0 (only vertical position matters).") la_params.add_argument("--all-texts", "-A", default=True, action="store_true", help="If layout analysis should be performed on text in figures.") output_params = parser.add_argument_group('Output', description='Used during output generation.') output_params.add_argument("--outfile", "-o", type=str, default="-", help="Path to file where output is written. Or \"-\" (default) to write to stdout.") output_params.add_argument("--output_type", "-t", type=str, default="text", help="Type of output to generate {text,html,xml,tag}.") output_params.add_argument("--codec", "-c", type=str, default="utf-8", help="Text encoding to use in output file.") output_params.add_argument("--output-dir", "-O", default=None, help="The output directory to put extracted images in. If not given, images are not " "extracted.") output_params.add_argument("--layoutmode", "-Y", default="normal", type=str, help="Type of layout to use when generating html {normal,exact,loose}. If normal, " "each line is positioned separately in the html. If exact, each character is " "positioned separately in the html. If loose, same result as normal but with an " "additional newline after each text line. Only used when output_type is html.") output_params.add_argument("--scale", "-s", type=float, default=1.0, help="The amount of zoom to use when generating html file. Only used when output_type " "is html.") output_params.add_argument("--strip-control", "-S", default=False, action="store_true", help="Remove control statement from text. Only used when output_type is xml.") return parser # main def main(args=None): P = maketheparser() A = P.parse_args(args=args) if A.page_numbers: A.page_numbers = set([x-1 for x in A.page_numbers]) if A.pagenos: A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) if six.PY2 and sys.stdin.encoding: A.password = A.password.decode(sys.stdin.encoding) if A.output_type == "text" and A.outfile != "-": for override, alttype in OUTPUT_TYPES: if A.outfile.endswith(override): A.output_type = alttype ## Test Code outfp = extract_text(**vars(A)) outfp.close() return 0 if __name__ == '__main__': sys.exit(main())