llvm/openmp/runtime/test/worksharing/for/collapse_test.inc

#include <omp.h>
#include <malloc.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>

// Type aliases for the three loop levels (suffix 0 goes with i, 1 with j,
// 2 with k). Every alias expands to LOOP_TYPES, which is not defined in this
// file and has to be provided before this file is included.
//   LOOP_IV_TYPEn - type of the iteration variable and of the matching
//                   spaceType field
//   LOOP_TYPEn    - type of the globals iLB/iUB, jA0/jB0, kA0/kB0
//   LOOP_STYPEn   - type of the globals iStep, jA1/jB1/jStep, kA1/kB1/kStep
#define LOOP_IV_TYPE0 LOOP_TYPES
#define LOOP_TYPE0 LOOP_TYPES
#define LOOP_STYPE0 LOOP_TYPES

#define LOOP_IV_TYPE1 LOOP_TYPES
#define LOOP_TYPE1 LOOP_TYPES
#define LOOP_STYPE1 LOOP_TYPES

#define LOOP_IV_TYPE2 LOOP_TYPES
#define LOOP_TYPE2 LOOP_TYPES
#define LOOP_STYPE2 LOOP_TYPES

// Upper limit that test() applies to the number of threads it requests.
#define MAX_THREADS 256

// Diagnostic output: PRINTF forwards to printf only when VERBOSE is defined.
// Otherwise it expands to nothing, so its arguments are not evaluated and the
// failure messages are not printed.
#if defined VERBOSE
#define PRINTF(...) printf(__VA_ARGS__)
#else
#define PRINTF(...)
#endif

// Global loop parameters. Nothing in the code visible here reads or writes
// them directly; presumably the LOOP macro supplied by the including test
// uses them as bounds/coefficients and its main() assigns them before
// calling test() -- TODO confirm against the including file.
LOOP_TYPE0 iLB, iUB;
LOOP_TYPE1 jA0, jB0;
LOOP_TYPE2 kA0, kB0;

// Step-typed parameters (LOOP_STYPEn); same caveat as above.
LOOP_STYPE0 iStep;
LOOP_STYPE1 jA1, jB1, jStep;
LOOP_STYPE2 kA1, kB1, kStep;

// Comparison operator selection for each loop level.
// The supported operators are <=, <, >= and > (!= follows a different
// pattern and is not covered). For level n, defining one of LOOP_LEn,
// LOOP_LTn, LOOP_GEn or LOOP_GTn selects the operator that COMPAREn expands
// to; these extra LOOP_* definitions make it easy to build the test calls
// from main.
// NOTE(review): if none of the four macros is defined for a level, COMPAREn
// stays undefined. COMPAREn is not used in the code visible here; presumably
// the LOOP macro of the including file uses it -- confirm there.

#if defined LOOP_LE0
#define COMPARE0 <=
#elif defined LOOP_LT0
#define COMPARE0 <
#elif defined LOOP_GE0
#define COMPARE0 >=
#elif defined LOOP_GT0
#define COMPARE0 >
#endif

#if defined LOOP_LE1
#define COMPARE1 <=
#elif defined LOOP_LT1
#define COMPARE1 <
#elif defined LOOP_GE1
#define COMPARE1 >=
#elif defined LOOP_GT1
#define COMPARE1 >
#endif

#if defined LOOP_LE2
#define COMPARE2 <=
#elif defined LOOP_LT2
#define COMPARE2 <
#elif defined LOOP_GE2
#define COMPARE2 >=
#elif defined LOOP_GT2
#define COMPARE2 >
#endif

// One recorded loop iteration: the values of the three iteration variables
// as stored by Set(). AllocSpace() creates arrays of these records.
typedef struct {
  LOOP_IV_TYPE0 i; // recorded value of iteration variable i
  LOOP_IV_TYPE1 j; // recorded value of iteration variable j
  LOOP_IV_TYPE2 k; // recorded value of iteration variable k
} spaceType;

// Allocate a zero-filled array of `size` iteration records.
// Returns NULL if the allocation fails (the C library may also return NULL
// for size == 0). The caller owns the result and releases it with
// FreeSpace().
spaceType *AllocSpace(unsigned size) {
  // calloc zero-fills the storage and checks size * sizeof(spaceType) for
  // overflow; unlike a malloc + memset pair it never writes through a NULL
  // pointer when the allocation fails.
  return (spaceType *)calloc(size, sizeof(spaceType));
}

// Release an array of iteration records obtained from AllocSpace().
void FreeSpace(spaceType *space) {
  free(space);
}

// record an iteration
void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
         LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
  if (count > trueCount) {
    // number of iterations exceeded
    // will be reported with checks
    return;
  }
  space[count - 1].i = i;
  space[count - 1].j = j;
  space[count - 1].k = k;
}
int test() {
  int pass = 1;
  LOOP_IV_TYPE0 i;
  LOOP_IV_TYPE1 j;
  LOOP_IV_TYPE2 k;

  spaceType *openmpSpace;
  spaceType *scalarSpace;

  unsigned trueCount = 0;
  unsigned openmpCount = 0;
  unsigned scalarCount = 0;
  unsigned uselessThreadsOpenMP = 0;
  unsigned usefulThreadsOpenMP = 0;

  // Use half of the available threads/logical processors.
  unsigned num_threads = omp_get_max_threads() / 2;

  // Make sure num_threads is not 0 after the division in case
  // omp_get_max_threads() returns 1.
  if (num_threads == 0)
    num_threads = 1;

  if (num_threads > MAX_THREADS)
    num_threads = MAX_THREADS;

  unsigned long *chunkSizesOpenmp =
      (unsigned long *)malloc(sizeof(unsigned long) * num_threads);
  memset(chunkSizesOpenmp, 0, sizeof(unsigned long) * num_threads);

  // count iterations and allocate space
  LOOP { ++trueCount; }

  openmpSpace = AllocSpace(trueCount);
  scalarSpace = AllocSpace(trueCount);

  // fill the scalar (compare) space
  LOOP {
    ++scalarCount;
    Set(scalarSpace, scalarCount, trueCount, i, j, k);
  }

  // test run body:
  // perform and record OpenMP iterations and thread use
#pragma omp parallel num_threads(num_threads)
  {
    unsigned gtid = omp_get_thread_num();
#pragma omp for collapse(3) private(i, j, k)
    LOOP {
      unsigned count;
#pragma omp atomic update
      ++chunkSizesOpenmp[gtid];
#pragma omp atomic capture
      count = ++openmpCount;
      Set(openmpSpace, count, trueCount, i, j, k);
    }
  }

  // check for the right number of iterations processed
  // (only need to check for less, greater is checked when recording)
  if (openmpCount < trueCount) {
    PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
           openmpCount, trueCount);
    pass = 0;
  } else if (openmpCount > trueCount) {
    PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
           openmpCount, trueCount);
    pass = 0;
  }

  // check openMP for iteration correctnes against scalar
  for (unsigned i = 0; i < trueCount; i++) {
    unsigned j;
    for (j = 0; j < openmpCount; j++) {
      if ((scalarSpace[i].i == openmpSpace[j].i) &&
          (scalarSpace[i].j == openmpSpace[j].j) &&
          (scalarSpace[i].k == openmpSpace[j].k)) {
        break;
      }
    }
    if (j == openmpCount) {
      PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
             scalarSpace[i].j, scalarSpace[i].k);
      pass = 0;
    }
  }

  // check for efficient thread use
  for (unsigned i = 0; i < num_threads; ++i) {
    if (chunkSizesOpenmp[i] == 0) {
      ++uselessThreadsOpenMP;
    }
  }

  // a check to see if at least more than one thread was used (weakish)
  if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
    PRINTF("OpenMP FAILURE: threads are not used\n");
    pass = 0;
  }

#if 0
    // a check to see if the load was spread more or less evenly so that
    // when there was more work than threads each one got at least something 
    // (stronger, but may currently fail for a general collapse case)
    if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
       PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n", 
           uselessThreadsOpenMP, openmpCount);
       pass = 0;
    }
#endif

  // clean up space
  FreeSpace(openmpSpace);
  FreeSpace(scalarSpace);
  free(chunkSizesOpenmp);
  return pass;
}