folly/folly/lang/test/BitsClassBenchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <random>
#include <type_traits>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/lang/BitsClass.h>

// Entropy source; benchmarkSet draws one seed from it per invocation.
std::random_device rd;

// Number of bytes addressed by the bit offsets the benchmarks generate
// (offsets are taken modulo kBufferSize * 8).
constexpr size_t kBufferSize = size_t{1} << 10;
// Shared scratch storage. 16 bytes are allocated beyond kBufferSize, so an
// access that starts at the last addressable bit still lands inside the
// allocation.
std::vector<uint8_t> buffer(kBufferSize + 16);

template <class T>
void benchmarkSet(size_t n, T) {
  size_t size = folly::to_integral(sizeof(T) * 6.9); // use 6.9 bits/byte
  const size_t k = 16;
  T values[k];
  BENCHMARK_SUSPEND {
    std::mt19937 gen(rd());
    T max, min;
    if (std::is_signed<T>::value) {
      max = (T(1) << (size - 1)) - 1;
      min = -(T(1) << (size - 1));
    } else {
      max = (T(1) << size) - 1;
      min = 0;
    }
    CHECK_LE(folly::findLastSet(max), size);
    CHECK_LE(folly::findLastSet(-min), size);
    std::uniform_int_distribution<T> dis(min, max);
    for (size_t i = 0; i < k; ++i) {
      values[i] = dis(gen);
    }
  }

  for (size_t i = 0; i < n; ++i) {
    size_t bit = (i * 2973) % (kBufferSize * 8);
    size_t drop = i % size;
    folly::Bits<T>::set(
        reinterpret_cast<T*>(buffer.data()),
        bit,
        size - drop,
        values[i % k] >> drop);
  }

  folly::doNotOptimizeAway(
      folly::Bits<T>::test(reinterpret_cast<T*>(buffer.data()), 512));
}

// Register benchmarkSet once per integer type. Each signed variant is
// registered as "relative", so the report shows it as a percentage of the
// unsigned benchmark of the same width registered just before it.
BENCHMARK_NAMED_PARAM(benchmarkSet, u16, uint16_t())
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkSet, i16, int16_t())
BENCHMARK_NAMED_PARAM(benchmarkSet, u32, uint32_t())
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkSet, i32, int32_t())
BENCHMARK_NAMED_PARAM(benchmarkSet, u64, uint64_t())
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkSet, i64, int64_t())

// Separator row between the set and get groups in the report.
BENCHMARK_DRAW_LINE();

// Global counter, initialised to zero. Within this file it is only read, by
// main(), which returns its value as the process exit status; nothing visible
// here writes to it.
std::atomic<int64_t> sum(0);

/**
 * Benchmark body for folly::Bits<T>::get.
 *
 * Issues `n` calls to get() on the global `buffer`, each reading between 1
 * and `size` bits at a bit offset that advances by 2973 per iteration and
 * wraps at kBufferSize * 8. The results are folded into a running total that
 * is handed to doNotOptimizeAway so the reads cannot be elided.
 *
 * @param n  number of get() calls to perform
 * @param x  initial value of the running total; also selects the type T
 */
template <class T>
void benchmarkGet(size_t n, T x) {
  // Accumulate in the unsigned counterpart of T. Summing many values in a
  // signed 32- or 64-bit T can overflow, which is undefined behaviour;
  // unsigned arithmetic wraps instead and costs the same to execute.
  using Accumulator = typename std::make_unsigned<T>::type;

  const size_t size =
      folly::to_integral(sizeof(T) * 6.9); // use 6.9 bits/byte
  Accumulator total = static_cast<Accumulator>(x);
  for (size_t i = 0; i < n; ++i) {
    const size_t bit = (i * 2973) % (kBufferSize * 8);
    // Vary the width read on each iteration, from `size` bits down to 1.
    const size_t drop = i % size;
    total += static_cast<Accumulator>(folly::Bits<T>::get(
        reinterpret_cast<T*>(buffer.data()), bit, size - drop));
  }
  folly::doNotOptimizeAway(total);
}

// Register benchmarkGet once per integer type, passing 0 as the initial
// accumulator value. Each signed variant is registered as "relative", so the
// report shows it as a percentage of the unsigned benchmark of the same width
// registered just before it.
BENCHMARK_NAMED_PARAM(benchmarkGet, u16, uint16_t(0))
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkGet, i16, int16_t(0))
BENCHMARK_NAMED_PARAM(benchmarkGet, u32, uint32_t(0))
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkGet, i32, int32_t(0))
BENCHMARK_NAMED_PARAM(benchmarkGet, u64, uint64_t(0))
BENCHMARK_RELATIVE_NAMED_PARAM(benchmarkGet, i64, int64_t(0))

#if 0
============================================================================
folly/experimental/test/BitsBenchmark.cpp       relative  time/iter  iters/s
============================================================================
benchmarkSet(u16)                                            8.58ns  116.59M
benchmarkSet(i16)                                 88.42%     9.70ns  103.08M
benchmarkSet(u32)                                            8.37ns  119.45M
benchmarkSet(i32)                                 88.23%     9.49ns  105.39M
benchmarkSet(u64)                                            9.23ns  108.34M
benchmarkSet(i64)                                 82.77%    11.15ns   89.68M
----------------------------------------------------------------------------
benchmarkGet(u16)                                            6.32ns  158.13M
benchmarkGet(i16)                                 80.40%     7.87ns  127.14M
benchmarkGet(u32)                                            6.34ns  157.65M
benchmarkGet(i32)                                 84.61%     7.50ns  133.39M
benchmarkGet(u64)                                            7.32ns  136.58M
benchmarkGet(i64)                                 85.78%     8.53ns  117.16M
============================================================================
#endif

/**
 * Entry point: parses the command-line flags, runs every registered
 * benchmark, and exits with the current value of `sum` converted to int.
 */
int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /* remove_flags = */ true);
  folly::runBenchmarks();
  // `sum` is 64-bit; make the conversion to the int exit status explicit.
  const int exitStatus = static_cast<int>(sum.load());
  return exitStatus;
}