# llvm/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py

# REQUIRES: system-linux

## Test the functionality of extract_ir_lib

import sys

from mlgo.corpus import extract_ir_lib

## Test that we can convert a compilation database with a single compilation
## command in it.

# RUN: %python %s test_one_conversion | FileCheck %s --check-prefix CHECK-ONE-CONVERSION


def test_one_conversion():
    """Print the paths derived from one "command"-style compile command.

    A single compilation database entry is converted with
    ``convert_compile_command_to_objectfile`` and each derived path is
    printed on its own line so FileCheck can match it against the
    directives listed at the end of this function.
    """
    compile_command = {
        "directory": "/output/directory",
        "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
        "file": "/some/path/lib/foo/bar.cc",
    }
    extractor = extract_ir_lib.convert_compile_command_to_objectfile(
        compile_command, "/corpus/destination/path"
    )
    # The accessors are invoked in exactly the order the directives expect.
    path_accessors = (
        extractor.input_obj,
        extractor.relative_output_path,
        extractor.cmd_file,
        extractor.bc_file,
        extractor.thinlto_index_file,
    )
    for accessor in path_accessors:
        print(accessor())
    # CHECK-ONE-CONVERSION: /output/directory/lib/bar.o
    # CHECK-ONE-CONVERSION: lib/bar.o
    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc


## Test that we can convert an arguments style compilation database

# RUN: %python %s test_one_conversion_arguments_style | FileCheck %s --check-prefix CHECK-ARGUMENTS-STYLE


def test_one_conversion_arguments_style():
    """Print the paths derived from one "arguments"-style compile command.

    Same scenario as the "command"-style test, except that the compiler
    invocation is supplied as a list under the "arguments" key.
    """
    compiler_arguments = [
        "-cc1",
        "-c",
        "/some/path/lib/foo/bar.cc",
        "-o",
        "lib/bar.o",
    ]
    database_entry = {
        "directory": "/output/directory",
        "arguments": compiler_arguments,
        "file": "/some/path/lib/foo/bar.cc",
    }
    extractor = extract_ir_lib.convert_compile_command_to_objectfile(
        database_entry, "/corpus/destination/path"
    )

    input_object = extractor.input_obj()
    print(input_object)
    # CHECK-ARGUMENTS-STYLE: /output/directory/lib/bar.o
    relative_output = extractor.relative_output_path()
    print(relative_output)
    # CHECK-ARGUMENTS-STYLE: lib/bar.o
    command_file = extractor.cmd_file()
    print(command_file)
    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.cmd
    bitcode_file = extractor.bc_file()
    print(bitcode_file)
    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.bc
    index_file = extractor.thinlto_index_file()
    print(index_file)
    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.thinlto.bc


## Test that converting multiple files works as well

# RUN: %python %s test_multiple_conversion | FileCheck %s --check-prefix CHECK-MULTIPLE-CONVERSION


def test_multiple_conversion():
    """Print the paths derived from a two-entry compilation database.

    Both entries are loaded with ``load_from_compile_commands`` and the
    derived paths of each resulting object are printed, one per line, for
    FileCheck to verify.

    Only the values returned by the library are printed. Printing the
    expected literal next to the actual value would let the directive match
    the literal and hide a wrong result.
    """
    res = extract_ir_lib.load_from_compile_commands(
        [
            {
                "directory": "/output/directory",
                "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
                "file": "/some/path/lib/foo/bar.cc",
            },
            {
                "directory": "/output/directory",
                "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o",
                "file": "/some/path/lib/foo/baz.cc",
            },
        ],
        "/corpus/destination/path",
    )
    # Materialize the result so that it can be indexed below.
    res = list(res)
    print(res[0].input_obj())
    # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/bar.o
    print(res[0].relative_output_path())
    # CHECK-MULTIPLE-CONVERSION: lib/bar.o
    print(res[0].cmd_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
    print(res[0].bc_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
    print(res[0].thinlto_index_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc

    print(res[1].input_obj())
    # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/other/baz.o
    print(res[1].relative_output_path())
    # CHECK-MULTIPLE-CONVERSION: lib/other/baz.o
    print(res[1].cmd_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.cmd
    print(res[1].bc_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.bc
    print(res[1].thinlto_index_file())
    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.thinlto.bc


## Test that we generate the correct objcopy commands for extracting commands

# RUN: %python %s test_command_extraction | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT


def test_command_extraction():
    """Print the objcopy commands built for an extractor with a base dir.

    The extractor is created with an explicit ``obj_base_dir``. The command
    for the ".llvmcmd" section is printed first, followed by the command for
    the ".llvmbc" section, one command element per line.
    """
    extractor = extract_ir_lib.TrainingIRExtractor(
        obj_relative_path="lib/obj_file.o",
        output_base_dir="/where/corpus/goes",
        obj_base_dir="/foo/bar",
    )
    command_builders = (
        (extractor._get_extraction_cmd_command, ".llvmcmd"),
        (extractor._get_extraction_bc_command, ".llvmbc"),
    )
    for build_command, section_name in command_builders:
        for element in build_command("/bin/llvm_objcopy_path", section_name):
            print(element)
    # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
    # CHECK-COMMAND-EXTRACT: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
    # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
    # CHECK-COMMAND-EXTRACT: /dev/null

    # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
    # CHECK-COMMAND-EXTRACT: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
    # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
    # CHECK-COMMAND-EXTRACT: /dev/null


## Test that we generate the correct extraction commands without specifying
## an output base directory.

# RUN: %python %s test_command_extraction_no_basedir | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT-NOBASEDIR


def test_command_extraction_no_basedir():
    """Print the objcopy commands built for an extractor without a base dir.

    The extractor receives only the relative object path and the output
    directory, so the object path is expected to appear unchanged in the
    printed commands.
    """

    def show(command):
        # Emit one command element per line for FileCheck.
        for element in command:
            print(element)

    objcopy = "/bin/llvm_objcopy_path"
    extractor = extract_ir_lib.TrainingIRExtractor(
        "lib/obj_file.o", "/where/corpus/goes"
    )

    show(extractor._get_extraction_cmd_command(objcopy, ".llvmcmd"))
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null

    show(extractor._get_extraction_bc_command(objcopy, ".llvmbc"))
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null


## Test that we can extract a corpus from lld parameters

# RUN: %python %s test_lld_params | FileCheck %s --check-prefix CHECK-LLD-PARAMS


def test_lld_params():
    """Print the paths of the objects discovered from lld parameters.

    The linker options mix the output binary, two object files, an archive
    and an unrelated flag. The derived paths of the first returned object
    and the input path of the second one are printed for FileCheck.
    """
    lld_opts = [
        "-o",
        "output/dir/exe",
        "lib/obj1.o",
        "somelib.a",
        "-W,blah",
        "lib/dir/obj2.o",
    ]
    obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out")
    print(obj[0].input_obj())
    # CHECK-LLD-PARAMS: /some/path/lib/obj1.o
    print(obj[0].relative_output_path())
    # CHECK-LLD-PARAMS: lib/obj1.o
    print(obj[0].cmd_file())
    # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.cmd
    print(obj[0].thinlto_index_file())
    # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.thinlto.bc
    print(obj[1].input_obj())
    # The prefix on the next line must match the one given to FileCheck;
    # a misspelled prefix is silently ignored.
    # CHECK-LLD-PARAMS: /some/path/lib/dir/obj2.o


## Test that we can load a corpus from a directory containing object files

# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: mkdir %t.dir/subdir
# RUN: touch %t.dir/subdir/test1.o
# RUN: touch %t.dir/subdir/test2.o
# RUN: %python %s test_load_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD-DIR


def test_load_from_directory(tempdir):
    """Print the objects discovered by scanning a directory tree.

    Args:
        tempdir: Directory to scan, passed on the command line by the lit
            invocation that runs this function.

    The objects are sorted by relative path so the output order is stable.
    Only values returned by the library are printed, so that a wrong
    relative path cannot be masked by an expected literal on the same line.
    """
    objs = extract_ir_lib.load_from_directory(tempdir, "/output")
    for obj in sorted(objs, key=lambda x: x._obj_relative_path):
        print(obj._obj_relative_path)
        # Explicitly check for equality here as we can not check within
        # FileCheck the exact value as lit substitutions do not work in
        # FileCheck lines.
        print(obj._obj_base_dir == tempdir)
        print(obj._output_base_dir)
    # CHECK-LOAD-DIR: subdir/test1.o
    # CHECK-LOAD-DIR: True
    # CHECK-LOAD-DIR: /output
    # CHECK-LOAD-DIR: subdir/test2.o
    # CHECK-LOAD-DIR: True
    # CHECK-LOAD-DIR: /output


## Test that we can load a corpus in the lld thinLTO case

# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: touch %t.dir/1.3.import.bc
# RUN: touch %t.dir/2.3.import.bc
# RUN: touch %t.dir/3.3.import.bc
# RUN: touch %t.dir/1.thinlto.bc
# RUN: touch %t.dir/2.thinlto.bc
# RUN: touch %t.dir/3.thinlto.bc
# RUN: %python %s test_lld_thinlto_discovery %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY


def test_lld_thinlto_discovery(tempdir):
    """Print the modules discovered in a flat lld ThinLTO directory.

    Args:
        tempdir: Directory handed to ``load_for_lld_thinlto``, passed on the
            command line by the lit invocation.

    For every discovered object, visited in order of relative path, three
    lines are printed: the relative path, whether its base directory equals
    ``tempdir``, and its output base directory.
    """
    discovered = extract_ir_lib.load_for_lld_thinlto(tempdir, "/output")
    ordered = sorted(discovered, key=lambda item: item._obj_relative_path)
    for module in ordered:
        print(module._obj_relative_path)
        # The temp dir is compared here because the lit substitution for it
        # can not be used inside a FileCheck directive.
        base_dir_matches = module._obj_base_dir == tempdir
        print(base_dir_matches)
        print(module._output_base_dir)  # outdir
    # CHECK-LLD-THINLTO-DISCOVERY: 1
    # CHECK-LLD-THINLTO-DISCOVERY: True
    # CHECK-LLD-THINLTO-DISCOVERY: /output
    # CHECK-LLD-THINLTO-DISCOVERY: 2
    # CHECK-LLD-THINLTO-DISCOVERY: True
    # CHECK-LLD-THINLTO-DISCOVERY: /output
    # CHECK-LLD-THINLTO-DISCOVERY: 3
    # CHECK-LLD-THINLTO-DISCOVERY: True
    # CHECK-LLD-THINLTO-DISCOVERY: /output


## Test that we can load a corpus in the nested lld thinLTO case

# RUN: mkdir %t.dir/nest
# RUN: mv %t.dir/*.bc %t.dir/nest
# RUN: %python %s test_lld_thinlto_discovery_nested %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY-NESTED


def test_lld_thinlto_discovery_nested(outer):
    """Print the modules discovered when the ThinLTO files sit in a subdir.

    Args:
        outer: Top-level directory handed to ``load_for_lld_thinlto``,
            passed on the command line by the lit invocation.

    Output format per object, in order of relative path: the relative path,
    whether the base directory equals ``outer``, and the output base dir.
    """

    def by_relative_path(candidate):
        return candidate._obj_relative_path

    found = extract_ir_lib.load_for_lld_thinlto(outer, "/output")
    for entry in sorted(found, key=by_relative_path):
        fields = (
            entry._obj_relative_path,
            entry._obj_base_dir == outer,
            entry._output_base_dir,
        )
        for field in fields:
            print(field)
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/1
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/2
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/3
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output


## Test the lld extraction works as expected

# RUN: rm -rf  %t.dir.out && mkdir %t.dir.out
# RUN: %python %s test_lld_thinlto_extraction %t.dir %t.dir.out | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION-PY
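# NOTE: the following line has no lit RUN prefix, so lit never executes it and
# the CHECK-LLD-THINLTO-EXTRACTION directives below are currently unchecked.
# ls %t.dir.out/nest | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION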

# CHECK-LLD-THINLTO-EXTRACTION: 1
# CHECK-LLD-THINLTO-EXTRACTION: 2
# CHECK-LLD-THINLTO-EXTRACTION: 3
# CHECK-LLD-THINLTO-EXTRACTION: 1.bc
# CHECK-LLD-THINLTO-EXTRACTION: 2.bc
# CHECK-LLD-THINLTO-EXTRACTION: 3.bc
# CHECK-LLD-THINLTO-EXTRACTION: 1.thinlto.bc
# CHECK-LLD-THINLTO-EXTRACTION: 2.thinlto.bc
# CHECK-LLD-THINLTO-EXTRACTION: 3.thinlto.bc


def test_lld_thinlto_extraction(outer, outdir):
    """Extract every discovered ThinLTO module and print what extract returns.

    Args:
        outer: Directory handed to ``load_for_lld_thinlto`` as the input.
        outdir: Directory handed to ``load_for_lld_thinlto`` as the output.

    Objects are processed in order of relative path, and ``extract`` is
    called with ``thinlto_build="local"`` on each of them.
    """
    candidates = extract_ir_lib.load_for_lld_thinlto(outer, outdir)
    ordered = sorted(candidates, key=lambda item: item._obj_relative_path)
    for module in ordered:
        print(module.extract(thinlto_build="local"))
    # CHECK-LLD-THINLTO-EXTRACTION-PY: 1
    # CHECK-LLD-THINLTO-EXTRACTION-PY: 2
    # CHECK-LLD-THINLTO-EXTRACTION-PY: 3


## Test that we can load a bazel query JSON as expected.

# RUN: %python %s test_load_bazel_aquery | FileCheck %s --check-prefix CHECK-TEST-LOAD-BAZEL-AQUERY


def test_load_bazel_aquery():
    """Print the paths of the objects loaded from a bazel aquery result.

    The query holds one action with the "not-link" mnemonic and one
    "CppLink" action whose arguments name two object files. The derived
    paths of the first two returned objects are printed for FileCheck.
    """
    unrelated_action = {"mnemonic": "not-link", "arguments": []}
    link_action = {
        "mnemonic": "CppLink",
        "arguments": ["clang", "-o", "output_binary", "test1.o", "test2.o"],
    }
    aquery_result = {"actions": [unrelated_action, link_action]}
    extractors = extract_ir_lib.load_bazel_aquery(
        aquery_result,
        "/some/path",
        "/tmp/out",
    )
    for position in (0, 1):
        print(extractors[position].input_obj())
        print(extractors[position].relative_output_path())
        print(extractors[position].cmd_file())
        print(extractors[position].bc_file())
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test1.o
    # CHECK-TEST-LOAD-BAZEL-AQUERY: test1.o
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.cmd
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.bc
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test2.o
    # CHECK-TEST-LOAD-BAZEL-AQUERY: test2.o
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.cmd
    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.bc


## Test that filtering works correctly

# RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING


def test_filtering():
    """Print the result of should_include_module for several filters.

    The same NUL-separated command line is tested against no filter, a
    match-everything pattern, a pattern for a flag that is present, and a
    pattern for a flag that is absent.
    """
    cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
    filters = (None, ".*", "^-Oz$", "^-O3$")
    for module_filter in filters:
        decision = extract_ir_lib.should_include_module(cmdline, module_filter)
        print(decision)
    # CHECK-TEST-FILTERING: True
    # CHECK-TEST-FILTERING: True
    # CHECK-TEST-FILTERING: True
    # CHECK-TEST-FILTERING: False


## Test that we extract the thinLTO index correctly

# RUN: %python %s test_thinlto_index_extractor | FileCheck %s --check-prefix CHECK-THINLTO-INDEX-EXTRACTOR


def test_thinlto_index_extractor():
    """Print the ThinLTO index path resolved from a command line.

    The NUL-separated command line carries a ``-fthinlto-index=`` option
    with a relative path; ``get_thinlto_index`` is called with
    "/the/base/dir" as its second argument and the result is printed.
    """
    cmdline = (
        "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
        "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
    )
    base_dir = "/the/base/dir"
    index_path = extract_ir_lib.get_thinlto_index(cmdline, base_dir)
    print(index_path)
    # CHECK-THINLTO-INDEX-EXTRACTOR: /the/base/dir/foo/bar.thinlto.bc


if __name__ == "__main__":
    # The first command-line argument names the test function to run; any
    # remaining arguments are forwarded to it positionally.
    selected_test = globals()[sys.argv[1]]
    forwarded_args = sys.argv[2:]
    selected_test(*forwarded_args)