// llvm/libc/benchmarks/gpu/LibcGpuBenchmark.h

#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H

#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/rand.h"
#include "src/time/clock.h"

#include <stdint.h>

namespace LIBC_NAMESPACE_DECL {

namespace benchmarks {

// Knobs that bound a benchmark run: iteration and sample counts, wall-clock
// budget, and the convergence/scaling parameters.
// NOTE(review): how each knob is applied is decided by the definition of
// benchmark(), which is not visible in this header.
struct BenchmarkOptions {
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};
  uint32_t min_samples{4};
  uint32_t max_samples{1'000};
  int64_t min_duration{500'000};       // nanoseconds, i.e. 500 us
  int64_t max_duration{1'000'000'000}; // nanoseconds, i.e. 1 second
  double epsilon{0.0001};
  double scaling_factor{1.4};
};

// One timed sample: the number of iterations that were run and the number of
// cycles that elapsed while running them.
struct Measurement {
  uint32_t iterations{0};
  uint64_t elapsed_cycles{0};
};

// Running estimate of the cost of a single iteration. Every measurement is
// folded into cumulative totals, so the estimate is the average over all
// samples seen so far rather than over the latest sample only.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Held in 64 bits: each sample contributes a 32-bit iteration count, and the
  // sum over many samples can exceed UINT32_MAX, which would silently wrap a
  // 32-bit accumulator and corrupt the average.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns the refined estimate, i.e. the
  // total number of cycles divided by the total number of iterations (integer
  // division, so the result is truncated).
  // Returns 0 while no iterations have been recorded instead of dividing by
  // zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};

// Tracks the progression of the runtime estimation: every new measurement
// refines the estimate and reports how far the estimate moved relative to the
// one that preceded it.
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Estimate produced by the most recent call to compute_improvement();
  // 0 until the first measurement has been processed.
  uint64_t current_estimation = 0;

  // Feeds M into the underlying estimation, stores the refined estimate in
  // current_estimation and returns |previous / refined - 1|, the relative
  // change between the previous and the refined estimate.
  //
  // Special cases:
  //   - previous == 0 and refined == 0: returns 0.0 (the estimate did not
  //     move). Evaluating the formula would give 0 / 0 = NaN, and NaN compares
  //     false against every threshold.
  //   - previous != 0 and refined == 0: the floating-point division yields
  //     +infinity, which is returned as is.
  double compute_improvement(const Measurement &M) {
    const uint64_t previous_estimation = current_estimation;
    const uint64_t new_estimation = rre.update(M);
    current_estimation = new_estimation;

    if (previous_estimation == 0 && new_estimation == 0)
      return 0.0;

    double ratio = (static_cast<double>(previous_estimation) /
                    static_cast<double>(new_estimation)) -
                   1.0;

    // Get absolute value
    if (ratio < 0)
      ratio = -ratio;

    return ratio;
  }
};

// Aggregated statistics reported for one benchmark.
// NOTE(review): the fields are filled in by code outside this header; the
// units of `cycles`, `min` and `max` are presumably clock cycles — confirm
// against the definition of benchmark().
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0.0};
  // `min` starts at the largest representable value and `max` at the
  // smallest, presumably so that the first recorded sample replaces both.
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};

// Entry point that produces a BenchmarkResult for `wrapper_func` under the
// limits given in `options`. Only the declaration is provided here; the
// definition lives outside this header.
// NOTE(review): `wrapper_func` takes no arguments and returns a uint64_t,
// presumably the cycle count of one timed invocation — confirm against the
// definition.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func);

// One registered benchmark: the callable to time, the suite and test names it
// is reported under, and the requested number of threads. Constructing an
// instance hands it to add_benchmark(), so defining a Benchmark object is what
// registers it.
class Benchmark {
  const cpp::function<uint64_t(void)> func;
  // The names are stored as views built from the pointers given to the
  // constructor (presumably without copying the characters), so the pointed-to
  // strings should outlive the Benchmark.
  const cpp::string_view suite_name;
  const cpp::string_view test_name;
  const uint32_t num_threads;

public:
  // Stores the arguments and registers this object through add_benchmark().
  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
            char const *test_name, uint32_t num_threads)
      : func(func), suite_name(suite_name), test_name(test_name),
        num_threads(num_threads) {
    Benchmark::add_benchmark(this);
  }

  // Accessors for the names supplied at construction.
  const cpp::string_view get_suite_name() const { return this->suite_name; }
  const cpp::string_view get_test_name() const { return this->test_name; }

  // Declared here, defined elsewhere.
  static void run_benchmarks();

protected:
  // Declared here, defined elsewhere; receives every Benchmark as it is
  // constructed.
  static void add_benchmark(Benchmark *benchmark);

private:
  // Runs the stored callable through benchmark() with default-constructed
  // options.
  BenchmarkResult run() { return benchmark(BenchmarkOptions{}, func); }
};

// We want our random values to be approximately
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
// Caveats:
//   -EXP_BIAS corresponding to denormal values,
//   EXP_BIAS + 1 corresponding to inf or nan.
template <typename T>
static T
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;

  // Required to correctly instantiate FPBits for floats and doubles.
  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
                                               uint64_t, uint32_t>;
  RandType bits;
  if constexpr (cpp::is_same_v<T, uint64_t>)
    bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
           static_cast<uint64_t>(LIBC_NAMESPACE::rand());
  else
    bits = LIBC_NAMESPACE::rand();
  double scale =
      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
  FPBits fp(bits);
  fp.set_biased_exponent(
      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
  return fp.get_val();
}

// Helpers that time a math function of floating-point type T on randomly
// generated inputs whose exponents are drawn from a caller-chosen range.
template <typename T> class MathPerf {
  using FPBits = fputil::FPBits<T>;
  using StorageType = typename FPBits::StorageType;
  static constexpr StorageType UIntMax =
      cpp::numeric_limits<StorageType>::max();

public:
  // Throughput benchmarking for functions that take 1 input.
  // Fills N inputs with exponents between min_exp and max_exp, passes them to
  // LIBC_NAMESPACE::throughput() and returns the reported total divided by N.
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
    static_assert(N > 0, "at least one input is required");

    cpp::array<T, N> inputs;
    // get_rand_input() takes the upper bound first: (max_exp, min_exp).
    for (size_t i = 0; i < N; ++i)
      inputs[i] = get_rand_input<T>(/*max_exp=*/max_exp, /*min_exp=*/min_exp);

    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);

    return total_time / N;
  }

  // Throughput benchmarking for functions that take 2 inputs.
  // Each argument has its own exponent range; otherwise identical to the
  // single-input overload.
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                          int arg1_max_exp, int arg2_min_exp,
                                          int arg2_max_exp) {
    static_assert(N > 0, "at least one input is required");

    cpp::array<T, N> inputs1;
    cpp::array<T, N> inputs2;
    // get_rand_input() takes the upper bound first: (max_exp, min_exp).
    for (size_t i = 0; i < N; ++i) {
      inputs1[i] = get_rand_input<T>(/*max_exp=*/arg1_max_exp,
                                     /*min_exp=*/arg1_min_exp);
      inputs2[i] = get_rand_input<T>(/*max_exp=*/arg2_max_exp,
                                     /*min_exp=*/arg2_min_exp);
    }

    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);

    return total_time / N;
  }
};

} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL

// Benchmark registration macros. Each one defines a
// LIBC_NAMESPACE::benchmarks::Benchmark object named
// <SuiteName>_<TestName>_Instance; the Benchmark constructor registers the
// object through add_benchmark(). SuiteName and TestName are stringified to
// provide the names stored in the object.

// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
// The constructor's num_threads parameter is a uint32_t, so the -1 is
// converted to UINT32_MAX on the way in.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Same as BENCHMARK, but with an explicit thread count (NumThreads).
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// BENCHMARK_N_THREADS with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// BENCHMARK_N_THREADS with a thread count of
// LIBC_NAMESPACE::gpu::get_lane_size().
// NOTE(review): none of the headers included directly by this file is visibly
// the one declaring gpu::get_lane_size(); presumably it arrives transitively —
// confirm.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
#endif