llvm/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py

# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Extract IR for training.

Extract IR for training, either from a compile_commands.json file produced by
cmake, or a linker parameter list file.

Only run with
'python compiler_opt/tools/extract_ir.py ...'

The compilation is assumed to have been performed with clang, using
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)

In a distributed ThinLTO case, the compilation is assumed to have been performed
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.

In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files

To change the logging verbosity, set the --verbosity flag to the desired level.
Setting it to a specific level will enable all messages at that level and
higher. Exact values can be found by invoking the script with --help.
"""

import argparse
import json
import logging

from mlgo.corpus import extract_ir_lib


def _build_arg_parser():
    """Build the argument parser describing every flag this tool accepts.

    Returns:
      An argparse.ArgumentParser. Every option is declared with nargs="?", so
      an option given without a value parses to None rather than being
      rejected.
    """
    parser = argparse.ArgumentParser(
        description="A tool for making a corpus from build artifacts"
    )
    parser.add_argument(
        "--input",
        type=str,
        help="Input file or directory - either compile_commands.json, a linker "
        "parameter list, or a path to a directory containing object files.",
    )
    parser.add_argument(
        "--input_type",
        type=str,
        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
        choices=["json", "params", "directory", "bazel_aquery"],
        default="json",
        nargs="?",
    )
    parser.add_argument("--output_dir", type=str, help="Output directory")
    parser.add_argument(
        "--num_workers",
        type=int,
        help="Number of parallel workers for objcopy. `None` for maximum "
        "available.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--llvm_objcopy_path",
        type=str,
        help="Path to llvm-objcopy",
        default="llvm-objcopy",
        nargs="?",
    )
    parser.add_argument(
        "--obj_base_dir",
        type=str,
        help="Base directory for object files. Defaults to current working dir.",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "--cmd_filter",
        type=str,
        help="Include only those modules with a command line matching this regular "
        "expression. Set it to None to not perform any filtering. Note that the "
        "regular expression is applied independently for each separate command line "
        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
        "with thinlto_build=local.",
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--thinlto_build",
        type=str,
        help="Set if the build was performed with either 'distributed' or 'local' "
        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
        "the distributed case or -Wl,--save-temps=import and "
        "-Wl,--thinlto-emit-index-files passed in the local case",
        choices=["distributed", "local"],
        default=None,
        nargs="?",
    )
    parser.add_argument(
        "--cmd_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmcmd is correct. For Mach-O object files, one should use "
        "something like __LLVM,__cmdline",
        default=".llvmcmd",
        nargs="?",
    )
    parser.add_argument(
        "--bitcode_section_name",
        type=str,
        help="The section name passed to llvm-objcopy. For ELF object files, the "
        "default .llvmbc is correct. For Mach-O object files, one should use "
        "__LLVM,__bitcode",
        default=".llvmbc",
        nargs="?",
    )
    # TODO(#107898): Refactor this into a common location.
    parser.add_argument(
        "--verbosity",
        type=str,
        help="The verbosity level to use for logging",
        default="INFO",
        nargs="?",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    return parser


def parse_args_and_run(argv=None):
    """Parse the command line and hand the resulting namespace to main().

    Args:
      argv: Optional list of argument strings to parse. When None (the
        default), argparse falls back to sys.argv[1:], which keeps the
        zero-argument call used by the script entry point working unchanged.

    Raises:
      SystemExit: Raised by argparse on --help or on invalid arguments.
    """
    args = _build_arg_parser().parse_args(argv)
    main(args)


def main(args):
    """Load the object list for the chosen input type and extract a corpus.

    Configures logging from args.verbosity, builds the list of objects with
    the extract_ir_lib loader matching args.input_type (or the local ThinLTO
    loader when no --input is given), runs the extraction over them and then
    passes the resulting relative paths to
    extract_ir_lib.write_corpus_manifest.

    Args:
      args: Parsed command-line namespace. The attributes read here are
        verbosity, input, input_type, output_dir, obj_base_dir, thinlto_build,
        num_workers, llvm_objcopy_path, cmd_filter, cmd_section_name and
        bitcode_section_name.

    Raises:
      ValueError: If --input is combined with --thinlto_build=local, if
        neither --input nor --thinlto_build=local is provided, or if
        args.input_type is not one of the supported input types.
    """
    logging.basicConfig(level=args.verbosity)

    if args.input is not None and args.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")

    if args.input is None:
        if args.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
    elif args.input_type == "json":
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_compile_commands(
                json.load(f), args.output_dir
            )
    elif args.input_type == "params":
        if not args.obj_base_dir:
            logging.info(
                "--obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(args.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], args.obj_base_dir, args.output_dir
            )
    elif args.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
    elif args.input_type == "bazel_aquery":
        with open(args.input, encoding="utf-8") as aquery_json_handle:
            objs = extract_ir_lib.load_bazel_aquery(
                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
            )
    else:
        # Fail loudly instead of continuing with an empty object list, which
        # would run the extraction on nothing and still write a manifest.
        # This is reachable from the CLI: "--input_type" given without a value
        # parses to None because the option is declared with nargs="?".
        raise ValueError("Unknown input type: {}".format(args.input_type))

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        args.num_workers,
        args.llvm_objcopy_path,
        args.cmd_filter,
        args.thinlto_build,
        args.cmd_section_name,
        args.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        args.thinlto_build, relative_output_paths, args.output_dir
    )

    # None entries in relative_output_paths are subtracted from the total, so
    # they presumably mark objects that were not converted - confirm against
    # extract_ir_lib.run_extraction.
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    parse_args_and_run()