# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
r"""Combine multiple training corpora into a single training corpus.
Currently this only supports the case where all of the corpora share the same
configurables except for the "modules" field.
Usage: to combine the training corpora corpus1 and corpus2 into
combinedcorpus, first structure the files as follows:
combinedcorpus
combinedcorpus/corpus1
combinedcorpus/corpus2
Running this script with
python3 \
compiler_opt/tools/combine_training_corpus.py \
--root_dir=$PATH_TO_combinedcorpus
generates the combinedcorpus/corpus_description.json file. In this way corpus1
and corpus2 are combined into combinedcorpus.
"""
import argparse
import logging
from mlgo.corpus import combine_training_corpus_lib
def parse_args_and_run():
    """Parse the command-line flags and pass the result to ``main``.

    Flags:
        --root_dir: root directory of the corpora to combine. Mandatory:
            without ``required=True`` argparse would silently store ``None``
            for an omitted flag and that ``None`` would be forwarded to
            ``combine_corpus`` by ``main``; rejecting it here yields a clear
            usage error instead.
        --verbosity: logging level name, one of DEBUG/INFO/WARNING/ERROR
            (default "INFO").

    Raises:
        SystemExit: raised by argparse for ``--help`` or for missing/invalid
            arguments (exit status 2 on errors).
    """
    parser = argparse.ArgumentParser(
        description="A tool for combining multiple training corpora"
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        required=True,
        help="The root dir of module paths to combine.",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    args = parser.parse_args()
    main(args)
def main(args):
    """Set up logging and combine the corpora found under ``args.root_dir``.

    Args:
        args: parsed command-line namespace; ``args.verbosity`` is used as the
            logging level and ``args.root_dir`` is passed to
            ``combine_training_corpus_lib.combine_corpus``.
    """
    log_level = args.verbosity
    logging.basicConfig(level=log_level)

    corpus_root = args.root_dir
    combine_training_corpus_lib.combine_corpus(corpus_root)
if __name__ == "__main__":
    # Only run the CLI when executed as a script, not when imported.
    parse_args_and_run()