#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
// Settings handed to benchmark(). Every member has a default, so a
// default-constructed BenchmarkOptions can be passed as-is.
struct BenchmarkOptions {
  // Iteration-count settings: starting value and lower/upper bounds.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};

  // Lower/upper bounds on the number of samples.
  uint32_t min_samples{4};
  uint32_t max_samples{1'000};

  // Duration bounds, expressed in nanoseconds.
  int64_t min_duration{500 * 1000};         // 500 microseconds
  int64_t max_duration{1000 * 1000 * 1000}; // 1 second

  // NOTE(review): presumably the convergence threshold and the growth factor
  // applied to the iteration count; their use is not visible in this header,
  // confirm against the definition of benchmark().
  double epsilon{0.0001};
  double scaling_factor{1.4};
};
// Pairs an iteration count with an elapsed cycle count. Both members start at
// zero.
struct Measurement {
  uint32_t iterations{0};
  uint64_t elapsed_cycles{0};
};
// Running per-iteration cost estimate that is refined by every measurement
// passed to update().
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept in 64 bits so that summing many 32-bit iteration counts cannot wrap
  // around; a wrap to zero would make the division in update() divide by zero.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns the accumulated cycles divided
  // by the accumulated iterations (integer division, truncating).
  // Returns 0 while no iterations have been recorded instead of dividing by
  // zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};
// Follows how the runtime estimate moves from one measurement to the next.
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Estimate produced by the most recent compute_improvement() call; 0 before
  // the first call.
  uint64_t current_estimation = 0;

  // Feeds M into the estimator and returns |previous / refreshed - 1|, where
  // `previous` is current_estimation on entry and `refreshed` is the value the
  // estimator returns for M. current_estimation is then overwritten with the
  // refreshed value.
  // If the refreshed estimate is 0 the floating-point division is by zero, so
  // the returned value is not a finite number.
  double compute_improvement(const Measurement &M) {
    const uint64_t refreshed = rre.update(M);
    const double previous = static_cast<double>(current_estimation);
    const double delta = (previous / refreshed) - 1.0;
    current_estimation = refreshed;
    // Magnitude only: the direction of the change does not matter here.
    return delta < 0 ? -delta : delta;
  }
};
// Statistics returned by benchmark().
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0.0};
  // min starts at the largest representable value and max at zero, so any
  // recorded sample can replace both.
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
// Entry point of the measurement routine. Only the declaration lives in this
// header; the definition is provided elsewhere.
//   options      - settings for the run (see BenchmarkOptions).
//   wrapper_func - callable taking no arguments and returning a uint64_t.
//                  NOTE(review): presumably the returned value is the cost of
//                  one invocation in cycles - confirm against the definition.
// Returns the collected statistics as a BenchmarkResult.
BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
// One benchmark entry: the callable to measure together with the suite name,
// test name and thread count it was created with. Constructing an instance
// passes it to add_benchmark().
class Benchmark {
  const cpp::function<uint64_t(void)> func;
  const cpp::string_view suite_name;
  const cpp::string_view test_name;
  const uint32_t num_threads;

public:
  // Stores the arguments and hands `this` to add_benchmark(). The name
  // pointers are wrapped in string_views, not copied.
  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
            char const *test_name, uint32_t num_threads)
      : func(func), suite_name(suite_name), test_name(test_name),
        num_threads(num_threads) {
    add_benchmark(this);
  }

  // Read-only access to the names given at construction.
  const cpp::string_view get_suite_name() const { return suite_name; }
  const cpp::string_view get_test_name() const { return test_name; }

  // Declared here, defined out of line.
  static void run_benchmarks();

protected:
  // Declared here, defined out of line; called once per constructed instance.
  static void add_benchmark(Benchmark *benchmark);

private:
  // Measures `func` through benchmark() using default BenchmarkOptions.
  BenchmarkResult run() { return benchmark(BenchmarkOptions{}, func); }
};
// We want our random values to be approximately
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
// Caveats:
// -EXP_BIAS corresponding to denormal values,
// EXP_BIAS + 1 corresponding to inf or nan.
template <typename T>
static T
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
uint64_t, uint32_t>;
RandType bits;
if constexpr (cpp::is_same_v<T, uint64_t>)
bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
double scale =
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
// Throughput helpers for math functions operating on floating-point type T.
// Inputs are generated with get_rand_input<T>() and timed with
// LIBC_NAMESPACE::throughput().
template <typename T> class MathPerf {
  using FPBits = fputil::FPBits<T>;
  using StorageType = typename FPBits::StorageType;
  static constexpr StorageType UIntMax =
      cpp::numeric_limits<StorageType>::max();

public:
  // Throughput benchmarking for functions that take 1 input.
  // Fills N inputs whose exponents are drawn from [min_exp, max_exp] and
  // returns the value reported by throughput() divided by N.
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
    cpp::array<T, N> inputs;
    // get_rand_input takes (max_exp, min_exp), in that order.
    for (size_t i = 0; i < N; ++i)
      inputs[i] = get_rand_input<T>(max_exp, min_exp);
    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
    return total_time / N;
  }

  // Throughput benchmarking for functions that take 2 inputs.
  // Each argument has its own exponent range; otherwise identical to the
  // single-input overload.
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                          int arg1_max_exp, int arg2_min_exp,
                                          int arg2_max_exp) {
    cpp::array<T, N> inputs1;
    cpp::array<T, N> inputs2;
    // get_rand_input takes (max_exp, min_exp), in that order.
    for (size_t i = 0; i < N; ++i) {
      inputs1[i] = get_rand_input<T>(arg1_max_exp, arg1_min_exp);
      inputs2[i] = get_rand_input<T>(arg2_max_exp, arg2_min_exp);
    }
    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
    return total_time / N;
  }
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
// Registration macros. Each one defines an object of type
// LIBC_NAMESPACE::benchmarks::Benchmark named <SuiteName>_<TestName>_Instance;
// the suite and test names are also passed to the constructor as string
// literals.
//
// BENCHMARK: passing -1 indicates the benchmark should be run with as many
// threads as allocated by the user in the benchmark's CMake. The constructor
// parameter is a uint32_t, so the -1 is converted to the largest uint32_t
// value.
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, -1)
// BENCHMARK_N_THREADS: same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, NumThreads)
// SINGLE_THREADED_BENCHMARK: BENCHMARK_N_THREADS with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
// SINGLE_WAVE_BENCHMARK: BENCHMARK_N_THREADS with the thread count taken from
// LIBC_NAMESPACE::gpu::get_lane_size(), which must be declared at the point
// where the macro is expanded.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
#endif