# combine_training_corpus.py | Explore in Territory

# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
r"""Combine multiple training corpus into a single training corpus.

Currently this only supports the case where the corpora share the same
configurables, except for the "modules" field.

Usage: to combine the training corpora corpus1 and corpus2 into
combinedcorpus, first structure the files as follows:

combinedcorpus
combinedcorpus/corpus1
combinedcorpus/corpus2

Running this script with

python3 \
compiler_opt/tools/combine_training_corpus.py \
  --root_dir=$PATH_TO_combinedcorpus

generates combinedcorpus/corpus_description.json file. In this way corpus1
and corpus2 are combined into combinedcorpus.
"""

import argparse
import logging

from mlgo.corpus import combine_training_corpus_lib


def parse_args_and_run():
    """Parse the command-line arguments and pass them to `main`.

    Options:
      --root_dir: Required. The root dir of module paths to combine.
      --verbosity: Logging level name, one of DEBUG, INFO, WARNING or ERROR.
        Defaults to INFO, both when the flag is absent and when it is given
        without a value.
    """
    parser = argparse.ArgumentParser(
        description="A tool for combining multiple training corpora"
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        # Without `required`, omitting the flag would pass None on to main()
        # rather than produce a usage error.
        required=True,
        help="The root dir of module paths to combine.",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        # nargs="?" allows the flag to appear without a value, in which case
        # argparse stores `const`. Left unset, that is None, which would
        # silently drop the INFO default.
        const="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    args = parser.parse_args()
    main(args)


def main(args):
    """Set up logging, then run the corpus combination.

    Args:
      args: Parsed command-line arguments. `args.verbosity` is used as the
        logging level and `args.root_dir` is passed to
        `combine_training_corpus_lib.combine_corpus`.
    """
    log_level = args.verbosity
    logging.basicConfig(level=log_level)

    corpus_root = args.root_dir
    combine_training_corpus_lib.combine_corpus(corpus_root)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    parse_args_and_run()
# llvm/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py