# llvm/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py

# REQUIRES: system-linux

## Test the functionality of combine_training_corpus_lib

import json
import os
import sys

from mlgo.corpus import combine_training_corpus_lib

## Test that combining two training corpora works as expected

# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: mkdir %t.dir/subcorpus1
# RUN: mkdir %t.dir/subcorpus2
# RUN: %python %s test_combine_corpus %t.dir | FileCheck %s --check-prefix CHECK-COMBINE-CORPUS


def test_combine_corpus(corpus_dir):
    """Combine two sub-corpora and print the merged description.

    Writes a ``corpus_description.json`` into the ``subcorpus1`` and
    ``subcorpus2`` directories under ``corpus_dir`` (both directories are
    created by the lit setup commands preceding this function), calls
    ``combine_corpus`` on ``corpus_dir``, and then reads back the
    ``corpus_description.json`` expected at the root of ``corpus_dir``.
    The combined ``has_thinlto`` value and the sorted module names are
    printed so that FileCheck can verify them.

    Args:
        corpus_dir: Root directory holding the two sub-corpus directories.
    """
    subcorpus_descriptions = {
        "subcorpus1": {
            "has_thinlto": False,
            "modules": ["test1.o", "test2.o"],
        },
        "subcorpus2": {
            "has_thinlto": False,
            "modules": ["test3.o", "test4.o"],
        },
    }
    for subcorpus_name, description in subcorpus_descriptions.items():
        description_path = os.path.join(
            corpus_dir, subcorpus_name, "corpus_description.json"
        )
        # Write with an explicit encoding so the inputs do not depend on the
        # locale and match the UTF-8 read of the combined description below.
        with open(description_path, "w", encoding="utf-8") as description_file:
            json.dump(description, description_file)
    combine_training_corpus_lib.combine_corpus(corpus_dir)
    with open(
        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
    ) as combined_corpus_description_file:
        combined_corpus_description = json.load(combined_corpus_description_file)
    print(combined_corpus_description["has_thinlto"])
    # CHECK-COMBINE-CORPUS: False
    # Sort before printing so the output order is deterministic.
    for module in sorted(combined_corpus_description["modules"]):
        print(module)
    # CHECK-COMBINE-CORPUS: subcorpus1/test1.o
    # CHECK-COMBINE-CORPUS: subcorpus1/test2.o
    # CHECK-COMBINE-CORPUS: subcorpus2/test3.o
    # CHECK-COMBINE-CORPUS: subcorpus2/test4.o


## Test that we handle the empty folder case gracefully

# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: mkdir %t.dir/subcorpus1
# RUN: mkdir %t.dir/empty_dir
# RUN: %python %s test_empty_folder %t.dir | FileCheck %s --check-prefix CHECK-EMPTY-DIR


def test_empty_folder(corpus_dir):
    """Combine a corpus whose root also contains an empty directory.

    Only ``subcorpus1`` receives a ``corpus_description.json``; the
    ``empty_dir`` directory created by the lit setup commands preceding this
    function is left without one. After calling ``combine_corpus`` on
    ``corpus_dir``, the number of modules in the combined description is
    printed so that FileCheck can verify that it is 2.

    Args:
        corpus_dir: Root directory holding ``subcorpus1`` and ``empty_dir``.
    """
    subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
    subcorpus1_description_path = os.path.join(
        corpus_dir, "subcorpus1", "corpus_description.json"
    )
    # Write with an explicit encoding so the input does not depend on the
    # locale and matches the UTF-8 read of the combined description below.
    with open(
        subcorpus1_description_path, "w", encoding="utf-8"
    ) as subcorpus1_description_handle:
        json.dump(subcorpus1_description, subcorpus1_description_handle)
    combine_training_corpus_lib.combine_corpus(corpus_dir)
    with open(
        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
    ) as combined_corpus_description_file:
        combined_corpus_description = json.load(combined_corpus_description_file)
    print(len(combined_corpus_description["modules"]))
    # CHECK-EMPTY-DIR: 2


## Test that we ignore extra files that will not end up contributing to the
## corpus.

# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: mkdir %t.dir/subcorpus1
# RUN: touch %t.dir/empty.log
# RUN: %python %s test_ignore_extra_file %t.dir | FileCheck %s --check-prefix CHECK-IGNORE-EXTRA-FILE


def test_ignore_extra_file(corpus_dir):
    """Combine a corpus whose root also contains an unrelated file.

    Only ``subcorpus1`` receives a ``corpus_description.json``; the
    ``empty.log`` file created in ``corpus_dir`` by the lit setup commands
    preceding this function is not a sub-corpus. After calling
    ``combine_corpus`` on ``corpus_dir``, the number of modules in the
    combined description is printed so that FileCheck can verify that it
    is 2.

    Args:
        corpus_dir: Root directory holding ``subcorpus1`` and ``empty.log``.
    """
    subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
    subcorpus1_description_path = os.path.join(
        corpus_dir, "subcorpus1", "corpus_description.json"
    )
    # Write with an explicit encoding so the input does not depend on the
    # locale and matches the UTF-8 read of the combined description below.
    with open(
        subcorpus1_description_path, "w", encoding="utf-8"
    ) as subcorpus1_description_handle:
        json.dump(subcorpus1_description, subcorpus1_description_handle)
    combine_training_corpus_lib.combine_corpus(corpus_dir)
    with open(
        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
    ) as combined_corpus_description_file:
        combined_corpus_description = json.load(combined_corpus_description_file)
    print(len(combined_corpus_description["modules"]))
    # CHECK-IGNORE-EXTRA-FILE: 2


## Test that we raise an error in the case where the corpora differ in a
## substantial way.

# RUN: rm -rf  %t.dir && mkdir %t.dir
# RUN: mkdir %t.dir/subcorpus1
# RUN: mkdir %t.dir/subcorpus2
# RUN: %python %s test_different_corpora %t.dir | FileCheck %s --check-prefix CHECK-DIFFERENT-CORPORA


def test_different_corpora(corpus_dir):
    """Check that combining mismatched sub-corpora raises ``ValueError``.

    Writes descriptions into ``subcorpus1`` and ``subcorpus2`` that disagree
    on ``has_thinlto`` and then calls ``combine_corpus`` on ``corpus_dir``.
    When the call raises ``ValueError`` the exception name is printed so
    that FileCheck can verify it; otherwise a diagnostic line is printed so
    that the resulting FileCheck failure is easy to interpret.

    Args:
        corpus_dir: Root directory holding the two sub-corpus directories,
            which are created by the lit setup commands preceding this
            function.
    """
    subcorpus_descriptions = {
        "subcorpus1": {"has_thinlto": False, "modules": ["test1.o"]},
        "subcorpus2": {"has_thinlto": True, "modules": ["test2.o"]},
    }
    for subcorpus_name, description in subcorpus_descriptions.items():
        description_path = os.path.join(
            corpus_dir, subcorpus_name, "corpus_description.json"
        )
        # Write with an explicit encoding so the inputs do not depend on the
        # locale.
        with open(description_path, "w", encoding="utf-8") as description_file:
            json.dump(description, description_file)
    try:
        combine_training_corpus_lib.combine_corpus(corpus_dir)
    except ValueError:
        print("ValueError")
        # CHECK-DIFFERENT-CORPORA: ValueError
    else:
        # Without this branch the script would print nothing and FileCheck
        # would only complain about empty input. The message deliberately
        # avoids the expected exception name so the check above cannot match
        # it by accident.
        print("combine_corpus did not reject the mismatched corpora")


if __name__ == "__main__":
    # The first command-line argument names the test function to run; any
    # remaining arguments are forwarded to it unchanged.
    selected_test = globals()[sys.argv[1]]
    selected_test(*sys.argv[2:])