// llvm/openmp/runtime/test/tasking/task_teams_stress_test.cpp

// RUN: %libomp-cxx-compile
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
//
// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run

// This test stresses the task team mechanism by running a simple
// increment task over and over with varying number of threads and nesting.
// The test covers nested serial teams and mixing serial teams with
// normal active teams.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

// Number of back-to-back repetitions of each test configuration; main()
// passes this as the repeat count to run_ntimes().
#define NTIMES 5

// Plain increment task: spawns one explicit task whose entire body is an
// atomic "+1" on the counter that `a` points to.
void task_inc_a(int *a) {
#pragma omp task
  {
#pragma omp atomic
    *a += 1;
  }
}

// Divide-and-conquer increment task over the inclusive range [low, high].
// Every call spawns one task. A one-element range performs a single atomic
// increment of *a; a larger range is cut in half and each half is handed to a
// child task; an empty range (low > high) does nothing. Once all generated
// tasks have run, *a has grown by (high - low + 1) for low <= high.
void task_inc_split_a(int *a, int low, int high) {
#pragma omp task firstprivate(low, high)
  {
    if (low < high) {
      const int mid = low + (high - low) / 2;
      task_inc_split_a(a, low, mid);
      task_inc_split_a(a, mid + 1, high);
    } else if (low == high) {
#pragma omp atomic
      *a += 1;
    }
  }
}

#ifdef USE_HIDDEN_HELPERS
// Hidden-helper variant of the increment: the atomic "+1" is issued from an
// asynchronous (nowait) target region that maps the counter to and from.
// Hidden helper tasks force serial regions to create task teams.
void task_inc_a_hidden_helper(int *a) {
#pragma omp target map(tofrom : a[0]) nowait
  {
#pragma omp atomic
    a[0]++;
  }
}
#else
// Detached-task variant of the increment: the task body does the atomic "+1"
// and then fulfills the event named in its own detach clause.
// Detached tasks force serial regions to create task teams.
void task_inc_a_detached(int *a, omp_event_handle_t handle) {
#pragma omp task detach(handle)
  {
#pragma omp atomic
    *a += 1;
    omp_fulfill_event(handle);
  }
}
#endif

// Verify that the counter behind `a` equals `expected`. On a mismatch, report
// both values on stderr and terminate the whole process with EXIT_FAILURE.
void check_a(int *a, int expected) {
  if (*a == expected) {
    return;
  }

  fprintf(stderr,
          "FAIL: a = %d instead of expected = %d. Compile with "
          "-DVERBOSE for more verbose output.\n",
          *a, expected);
  exit(EXIT_FAILURE);
}

// Every thread creates a single "increment" task
//
// Per-thread body of the non-splitting stress test; meant to be called by
// every thread of a parallel region. Runs three rounds in which each thread
// creates exactly one increment task and then checks the shared counter.
//   handles  - per-thread event handle slots, indexed by thread number
//              (only read when USE_HIDDEN_HELPERS is not defined)
//   expected - amount the counter must grow per round; the caller in this
//              file passes the requested team size
//   a        - shared counter; the first check requires it to start at 0
void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
  int tid = omp_get_thread_num();

  // Round 1: one plain task per thread.
  task_inc_a(a);

  // The check relies on every task created before the barrier having run by
  // the time the barrier is passed.
#pragma omp barrier
  check_a(a, expected);
  // Check again after another barrier: with no new tasks created, the counter
  // must not have moved.
#pragma omp barrier
  check_a(a, expected);
  // Keep any thread from starting round 2 while another is still checking.
#pragma omp barrier

  // Round 2: one hidden-helper (target nowait) or detached task per thread,
  // depending on how the test was compiled.
#ifdef USE_HIDDEN_HELPERS
  task_inc_a_hidden_helper(a);
#else
  task_inc_a_detached(a, handles[tid]);
#endif

#pragma omp barrier
  check_a(a, 2 * expected);
  // Separate the round-2 check from the round-3 task creation.
#pragma omp barrier
  // Round 3: plain tasks again, after the special tasks of round 2.
  task_inc_a(a);
#pragma omp barrier
  check_a(a, 3 * expected);
}

// Testing single level of parallelism with increment tasks.
// Starts one parallel region of `nthreads` requested threads that share a
// zero-initialized counter, and has every thread run test_tasks() on it.
//   nthreads - requested team size; also passed to test_tasks() as the
//              per-round expected increment
// Terminates the process with EXIT_FAILURE if the event-handle array cannot
// be allocated.
void test_base(int nthreads) {
#ifdef VERBOSE
#pragma omp master
  printf("    test_base(%d)\n", nthreads);
#endif
  int a = 0;
  // One event-handle slot per thread. calloc() zero-fills the slots so that
  // the by-value read of handles[tid] in test_tasks() never touches
  // indeterminate memory; the detach clause supplies the actual handle.
  omp_event_handle_t *handles =
      (omp_event_handle_t *)calloc((size_t)nthreads, sizeof(*handles));
  if (handles == NULL) {
    fprintf(stderr, "FAIL: could not allocate %d event handles\n", nthreads);
    exit(EXIT_FAILURE);
  }
#pragma omp parallel num_threads(nthreads) shared(a)
  { test_tasks(handles, nthreads, &a); }
  free(handles);
}

// Two levels of parallelism: every thread of an outer team runs the
// single-level increment test (test_base) as a nested region.
//   first  = thread count requested for the outer parallel region
//   second = thread count requested for the nested parallel region
void test_nest(int first, int second) {
#ifdef VERBOSE
#pragma omp master
  printf("   test_nest(%d, %d)\n", first, second);
#endif
#pragma omp parallel num_threads(first)
  {
    test_base(second);
  }
}

// Adds one more enclosing parallel region around test_nest, so the increment
// test ends up two nesting levels below the outermost team.
//   first  = thread count requested for the outermost parallel region
//   second = thread count requested for the first nested region
//   third  = thread count requested for the second nested region
void test_nest2(int first, int second, int third) {
#ifdef VERBOSE
#pragma omp master
  printf("  test_nest2(%d, %d, %d)\n", first, second, third);
#endif
#pragma omp parallel num_threads(first)
  {
    test_nest(second, third);
  }
}

// Adds one more enclosing parallel region around test_nest2, so the increment
// test ends up three nesting levels below the outermost team.
//   first  = thread count requested for the outermost parallel region
//   second = thread count requested for the first nested region
//   third  = thread count requested for the second nested region
//   fourth = thread count requested for the third nested region
void test_nest3(int first, int second, int third, int fourth) {
#ifdef VERBOSE
#pragma omp master
  printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
#endif
#pragma omp parallel num_threads(first)
  {
    test_nest2(second, third, fourth);
  }
}

// Adds one more enclosing parallel region around test_nest3, so the increment
// test ends up four nesting levels below the outermost team.
//   first  = thread count requested for the outermost parallel region
//   second = thread count requested for the first nested region
//   third  = thread count requested for the second nested region
//   fourth = thread count requested for the third nested region
//   fifth  = thread count requested for the fourth nested region
void test_nest4(int first, int second, int third, int fourth, int fifth) {
#ifdef VERBOSE
#pragma omp master
  printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth,
         fifth);
#endif
#pragma omp parallel num_threads(first)
  {
    test_nest3(second, third, fourth, fifth);
  }
}

// Single thread starts a binary splitting "increment" task
// Detached tasks are still single "increment" task
//
// Per-thread body of the splitting stress test; meant to be called by every
// thread of a parallel region. Rounds 1 and 3 are seeded by a single thread
// with a splitting task over [1, expected]; round 2 has every thread create
// one hidden-helper or detached increment task.
//   handles  - per-thread event handle slots, indexed by thread number
//              (only read when USE_HIDDEN_HELPERS is not defined)
//   expected - amount the counter must grow per round; the caller in this
//              file passes the requested team size
//   a        - shared counter; the first check requires it to start at 0
void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
  int tid = omp_get_thread_num();

  // Round 1: one thread seeds the splitting task tree, which performs
  // `expected` increments in total.
#pragma omp single
  task_inc_split_a(a, 1, expected); // task team A

  // The check relies on every task created before the barrier having run by
  // the time the barrier is passed.
#pragma omp barrier
  check_a(a, expected);
  // Check again after another barrier: with no new tasks created, the counter
  // must not have moved.
#pragma omp barrier
  check_a(a, expected);
  // Keep any thread from starting round 2 while another is still checking.
#pragma omp barrier

  // Round 2: one hidden-helper (target nowait) or detached task per thread,
  // depending on how the test was compiled.
#ifdef USE_HIDDEN_HELPERS
  task_inc_a_hidden_helper(a);
#else
  task_inc_a_detached(a, handles[tid]);
#endif

#pragma omp barrier
  check_a(a, 2 * expected);
  // Separate the round-2 check from the round-3 task creation.
#pragma omp barrier
  // Round 3: splitting tasks again, after the special tasks of round 2.
#pragma omp single
  task_inc_split_a(a, 1, expected); // task team B
#pragma omp barrier
  check_a(a, 3 * expected);
}

// Testing single level of parallelism with splitting incrementing tasks.
// Starts one parallel region of `nthreads` requested threads that share a
// zero-initialized counter, and has every thread run test_tasks_split() on it.
//   nthreads - requested team size; also passed to test_tasks_split() as the
//              per-round expected increment
// Terminates the process with EXIT_FAILURE if the event-handle array cannot
// be allocated.
void test_base_split(int nthreads) {
#ifdef VERBOSE
#pragma omp master
  printf("  test_base_split(%d)\n", nthreads);
#endif
  int a = 0;
  // One event-handle slot per thread. calloc() zero-fills the slots so that
  // the by-value read of handles[tid] in test_tasks_split() never touches
  // indeterminate memory; the detach clause supplies the actual handle.
  omp_event_handle_t *handles =
      (omp_event_handle_t *)calloc((size_t)nthreads, sizeof(*handles));
  if (handles == NULL) {
    fprintf(stderr, "FAIL: could not allocate %d event handles\n", nthreads);
    exit(EXIT_FAILURE);
  }
#pragma omp parallel num_threads(nthreads) shared(a)
  { test_tasks_split(handles, nthreads, &a); }
  free(handles);
}

// Two levels of parallelism with splitting tasks: every thread of an outer
// team runs the single-level splitting test (test_base_split) as a nested
// region.
//   first  = thread count requested for the outer parallel region
//   second = thread count requested for the nested parallel region
void test_nest_split(int first, int second) {
#ifdef VERBOSE
#pragma omp master
  printf(" test_nest_split(%d, %d)\n", first, second);
#endif
#pragma omp parallel num_threads(first)
  {
    test_base_split(second);
  }
}

// Adds one more enclosing parallel region around test_nest_split, so the
// splitting test ends up two nesting levels below the outermost team.
//   first  = thread count requested for the outermost parallel region
//   second = thread count requested for the first nested region
//   third  = thread count requested for the second nested region
void test_nest2_split(int first, int second, int third) {
#ifdef VERBOSE
#pragma omp master
  printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
#endif
#pragma omp parallel num_threads(first)
  {
    test_nest_split(second, third);
  }
}

// Call func(args...) n times in a row; nothing is called when n <= 0.
template <typename... Args>
void run_ntimes(int n, void (*func)(Args...), Args... args) {
  for (int remaining = n; remaining > 0; --remaining) {
    func(args...);
  }
}

// Test driver: runs every configuration NTIMES times in a fixed order that
// keeps changing team sizes and nesting depth from one configuration to the
// next. Prints "PASS" and returns EXIT_SUCCESS only if no check_a() call
// terminated the process first.
int main() {
  // Allow five nested active parallel levels, the deepest nesting used below
  // (test_nest4 -> test_nest3 -> test_nest2 -> test_nest -> test_base).
  omp_set_max_active_levels(5);

  // Plain increment tasks: single level, then 2 to 5 levels of nesting.
  run_ntimes(NTIMES, test_base, 4);
  run_ntimes(NTIMES, test_base, 1);
  run_ntimes(NTIMES, test_base, 8);
  run_ntimes(NTIMES, test_base, 2);
  run_ntimes(NTIMES, test_base, 6);
  run_ntimes(NTIMES, test_nest, 1, 1);
  run_ntimes(NTIMES, test_nest, 1, 5);
  run_ntimes(NTIMES, test_nest, 2, 6);
  run_ntimes(NTIMES, test_nest, 1, 1);
  run_ntimes(NTIMES, test_nest, 4, 3);
  run_ntimes(NTIMES, test_nest, 3, 2);
  run_ntimes(NTIMES, test_nest, 1, 1);
  run_ntimes(NTIMES, test_nest2, 1, 1, 2);
  run_ntimes(NTIMES, test_nest2, 1, 2, 1);
  run_ntimes(NTIMES, test_nest2, 2, 2, 1);
  run_ntimes(NTIMES, test_nest2, 2, 1, 1);
  run_ntimes(NTIMES, test_nest2, 4, 2, 1);
  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
  run_ntimes(NTIMES, test_nest2, 1, 1, 1);
  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1);
  run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1);
  run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1);
  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2);
  run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1);
  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
  run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1);
  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1);
  run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1);
  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1);
  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2);
  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1);

  // Splitting increment tasks: single level.
  run_ntimes(NTIMES, test_base_split, 4);
  run_ntimes(NTIMES, test_base_split, 2);

  run_ntimes(NTIMES, test_base_split, 7);

  // Splitting increment tasks: two levels of parallelism (after one more
  // single-level run with a one-thread team).
  run_ntimes(NTIMES, test_base_split, 1);
  run_ntimes(NTIMES, test_nest_split, 4, 2);
  run_ntimes(NTIMES, test_nest_split, 2, 1);

  run_ntimes(NTIMES, test_nest_split, 7, 2);
  run_ntimes(NTIMES, test_nest_split, 1, 1);
  run_ntimes(NTIMES, test_nest_split, 1, 4);

  // Splitting increment tasks: three levels of parallelism.
  run_ntimes(NTIMES, test_nest2_split, 1, 1, 2);
  run_ntimes(NTIMES, test_nest2_split, 1, 2, 1);
  run_ntimes(NTIMES, test_nest2_split, 2, 2, 1);
  run_ntimes(NTIMES, test_nest2_split, 2, 1, 1);
  run_ntimes(NTIMES, test_nest2_split, 4, 2, 1);
  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
  run_ntimes(NTIMES, test_nest2_split, 1, 1, 1);
  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);

  // Reached only if every check above succeeded.
  printf("PASS\n");
  return EXIT_SUCCESS;
}