folly/folly/test/MemcpyBenchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <string.h>

#include <chrono>
#include <random>

#include <folly/Benchmark.h>
#include <folly/FollyMemcpy.h>
#include <folly/portability/Unistd.h>

/**
 * Benchmark body: performs `iters` copies with `memcpy_func`, taking each
 * copy's length and its source/destination addresses from pre-generated
 * pseudo-random tables.
 *
 * @param iters        Number of copies to perform.
 * @param memcpy_func  Copy routine under test (memcpy-compatible signature).
 * @param min          Smallest copy length in bytes (inclusive).
 * @param max          Largest copy length in bytes (inclusive).
 * @param align        Every generated source/destination offset is a
 *                     multiple of this value; 0 is treated as 1.
 * @param hot          true: the destination buffer is the same size as the
 *                     (page-rounded) source buffer; false: the destination
 *                     buffer is 1 GiB and offsets are spread across it.
 *
 * The buffers and tables are function-local statics, so their storage is
 * reused across calls; they are refilled on every call.
 */
void bench(
    uint32_t iters,
    void*(memcpy_func)(void*, const void*, size_t),
    size_t min,
    size_t max,
    size_t align,
    bool hot) {
  // The two table lengths are coprime, so when both tables are cycled in
  // full the same (size, offset) pairing recurs only every 4095 * 4096
  // iterations.
  constexpr size_t kNumSizes = 4095;
  constexpr size_t kNumOffsets = 4096;

  static std::string dst_buffer;
  static std::string src_buffer;
  static std::vector<size_t> sizes;
  static std::vector<size_t> dst_offsets;
  static std::vector<size_t> src_offsets;

  // Setup is wrapped in BENCHMARK_SUSPEND, folly's scope for work that
  // should not count towards the measured time.
  BENCHMARK_SUSPEND {
    // Guard against division by zero below.
    const size_t stride = align == 0 ? 1 : align;

    // sysconf() returns -1 on failure; fall back to a common page size.
    const long reported_page_size = sysconf(_SC_PAGE_SIZE);
    const size_t page_size = reported_page_size > 0
        ? static_cast<size_t>(reported_page_size)
        : size_t{4096};

    // Round (max + 2 * stride) up to a whole number of pages using integer
    // arithmetic only.
    const size_t src_buffer_size =
        (max + 2 * stride + page_size - 1) / page_size * page_size;
    const size_t dst_buffer_size =
        hot ? src_buffer_size : size_t{1024} * 1024 * 1024; // 1 GiB

    dst_buffer.assign(dst_buffer_size, 'd');
    src_buffer.assign(src_buffer_size, 's');

    // Default-constructed engine: the same tables are produced on every call,
    // so every memcpy implementation sees an identical workload.
    std::default_random_engine gen;

    sizes.resize(kNumSizes);
    std::uniform_int_distribution<size_t> size_dist(min, max);
    for (size_t& size : sizes) {
      size = size_dist(gen);
    }

    // The largest offset is at most (buffer_size - max), so a copy of up to
    // `max` bytes starting there stays inside the buffer.
    src_offsets.resize(kNumOffsets);
    dst_offsets.resize(kNumOffsets);
    std::uniform_int_distribution<size_t> src_offset_dist(
        0, (src_buffer_size - max) / stride);
    std::uniform_int_distribution<size_t> dst_offset_dist(
        0, (dst_buffer_size - max) / stride);
    for (size_t i = 0; i < kNumOffsets; ++i) {
      src_offsets[i] = stride * src_offset_dist(gen);
      dst_offsets[i] = stride * dst_offset_dist(gen);
    }
  }

  size_t size_idx = 0;
  size_t offset_idx = 0;
  for (uint32_t i = 0; i < iters; ++i) {
    void* dst = &dst_buffer[dst_offsets[offset_idx]];
    const void* src = &src_buffer[src_offsets[offset_idx]];
    memcpy_func(dst, src, sizes[size_idx]);

    // Wrap each index only after its last element has been used, so every
    // entry of both tables is visited and the coprime periods are preserved.
    if (++size_idx == sizes.size()) {
      size_idx = 0;
    }
    if (++offset_idx == src_offsets.size()) {
      offset_idx = 0;
    }
  }
}

// Registers two benchmarks that run bench() over the same workload: copy
// sizes in [LOWER, UPPER] bytes, align = 1, and the given IS_HOT setting.
// The first uses memcpy and is registered with BENCHMARK_NAMED_PARAM; the
// second uses folly::__folly_memcpy and is registered with
// BENCHMARK_RELATIVE_NAMED_PARAM. TAG is pasted into both benchmark names
// (<LOWER>_to_<UPPER>_<TAG>_glibc / <LOWER>_to_<UPPER>_<TAG>_folly).
#define BENCH_BOTH(LOWER, UPPER, IS_HOT, TAG)                     \
  BENCHMARK_NAMED_PARAM(                                          \
      bench,                                                      \
      LOWER##_to_##UPPER##_##TAG##_glibc,                         \
      /*memcpy_func=*/memcpy,                                     \
      /*min=*/LOWER, /*max=*/UPPER, /*align=*/1, /*hot=*/IS_HOT)  \
  BENCHMARK_RELATIVE_NAMED_PARAM(                                 \
      bench,                                                      \
      LOWER##_to_##UPPER##_##TAG##_folly,                         \
      /*memcpy_func=*/folly::__folly_memcpy,                      \
      /*min=*/LOWER, /*max=*/UPPER, /*align=*/1, /*hot=*/IS_HOT)

// "HOT" group: hot=true, so bench() makes the destination buffer the same
// size as the page-rounded source buffer and every copy lands in that small
// region. Each line registers a memcpy benchmark plus a folly::__folly_memcpy
// benchmark for copy sizes drawn from [first argument, second argument] bytes.
BENCH_BOTH(0, 7, true, HOT)
BENCH_BOTH(0, 16, true, HOT)
BENCH_BOTH(0, 32, true, HOT)
BENCH_BOTH(0, 64, true, HOT)
BENCH_BOTH(0, 128, true, HOT)
BENCH_BOTH(0, 256, true, HOT)
BENCH_BOTH(0, 512, true, HOT)
BENCH_BOTH(0, 1024, true, HOT)
BENCH_BOTH(0, 32768, true, HOT)
BENCH_BOTH(8, 16, true, HOT)
BENCH_BOTH(16, 32, true, HOT)
BENCH_BOTH(32, 256, true, HOT)
BENCH_BOTH(256, 1024, true, HOT)
BENCH_BOTH(1024, 8192, true, HOT)

// "COLD" group: the same size ranges with hot=false, so bench() uses a 1 GiB
// destination buffer and spreads destination offsets across all of it.
// The draw-line call corresponds to the dashed separator row in the results
// table recorded further down in this file.
BENCHMARK_DRAW_LINE();
BENCH_BOTH(0, 7, false, COLD)
BENCH_BOTH(0, 16, false, COLD)
BENCH_BOTH(0, 32, false, COLD)
BENCH_BOTH(0, 64, false, COLD)
BENCH_BOTH(0, 128, false, COLD)
BENCH_BOTH(0, 256, false, COLD)
BENCH_BOTH(0, 512, false, COLD)
BENCH_BOTH(0, 1024, false, COLD)
BENCH_BOTH(0, 32768, false, COLD)
BENCH_BOTH(8, 16, false, COLD)
BENCH_BOTH(16, 32, false, COLD)
BENCH_BOTH(32, 256, false, COLD)
BENCH_BOTH(256, 1024, false, COLD)
BENCH_BOTH(1024, 8192, false, COLD)

// Large-copy group: sizes from 65536 to 1048576 bytes (64 KiB to 1 MiB) with
// hot=false (1 GiB destination buffer). These are registered by hand rather
// than through BENCH_BOTH because BENCH_BOTH always passes align=1, whereas
// this group also needs an align=32 pair.
BENCHMARK_DRAW_LINE();
// Unaligned pair: offsets are arbitrary byte positions (align=1).
BENCHMARK_NAMED_PARAM(
    bench,
    64k_to_1024k_unaligned_cold_glibc,
    /*memcpy_func=*/memcpy,
    /*min=*/65536,
    /*max=*/1048576,
    /*align=*/1,
    /*hot=*/false)
BENCHMARK_RELATIVE_NAMED_PARAM(
    bench,
    64k_to_1024k_unaligned_cold_folly,
    /*memcpy_func=*/folly::__folly_memcpy,
    /*min=*/65536,
    /*max=*/1048576,
    /*align=*/1,
    /*hot=*/false)

// Aligned pair: source and destination offsets are multiples of 32 bytes.
BENCHMARK_NAMED_PARAM(
    bench,
    64k_to_1024k_aligned_cold_glibc,
    /*memcpy_func=*/memcpy,
    /*min=*/65536,
    /*max=*/1048576,
    /*align=*/32,
    /*hot=*/false)
BENCHMARK_RELATIVE_NAMED_PARAM(
    bench,
    64k_to_1024k_aligned_cold_folly,
    /*memcpy_func=*/folly::__folly_memcpy,
    /*min=*/65536,
    /*max=*/1048576,
    /*align=*/32,
    /*hot=*/false)

// Benchmark results (Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz, Linux x86_64)
// Buck build mode: @mode/opt-lto
// ============================================================================
// folly/test/MemcpyBenchmark.cpp                  relative  time/iter  iters/s
// ============================================================================
// bench(0_to_7_HOT_glibc)                                      9.51ns  105.19M
// bench(0_to_7_HOT_folly)                          142.33%     6.68ns  149.72M
// bench(0_to_16_HOT_glibc)                                     8.98ns  111.30M
// bench(0_to_16_HOT_folly)                         153.23%     5.86ns  170.55M
// bench(0_to_32_HOT_glibc)                                     9.08ns  110.08M
// bench(0_to_32_HOT_folly)                         166.79%     5.45ns  183.61M
// bench(0_to_64_HOT_glibc)                                     8.35ns  119.79M
// bench(0_to_64_HOT_folly)                         124.48%     6.71ns  149.11M
// bench(0_to_128_HOT_glibc)                                    8.20ns  122.00M
// bench(0_to_128_HOT_folly)                        121.55%     6.74ns  148.29M
// bench(0_to_256_HOT_glibc)                                    8.64ns  115.68M
// bench(0_to_256_HOT_folly)                         95.85%     9.02ns  110.88M
// bench(0_to_512_HOT_glibc)                                   13.05ns   76.61M
// bench(0_to_512_HOT_folly)                        110.04%    11.86ns   84.31M
// bench(0_to_1024_HOT_glibc)                                  16.00ns   62.50M
// bench(0_to_1024_HOT_folly)                       100.53%    15.91ns   62.83M
// bench(0_to_32768_HOT_glibc)                                658.76ns    1.52M
// bench(0_to_32768_HOT_folly)                      112.30%   586.62ns    1.70M
// bench(8_to_16_HOT_glibc)                                     5.18ns  193.08M
// bench(8_to_16_HOT_folly)                         162.18%     3.19ns  313.13M
// bench(16_to_32_HOT_glibc)                                    4.55ns  219.65M
// bench(16_to_32_HOT_folly)                        117.18%     3.89ns  257.39M
// bench(32_to_256_HOT_glibc)                                   8.70ns  114.98M
// bench(32_to_256_HOT_folly)                        95.64%     9.09ns  109.97M
// bench(256_to_1024_HOT_glibc)                                16.59ns   60.28M
// bench(256_to_1024_HOT_folly)                      96.15%    17.25ns   57.96M
// bench(1024_to_8192_HOT_glibc)                              111.93ns    8.93M
// bench(1024_to_8192_HOT_folly)                    135.92%    82.35ns   12.14M
// ----------------------------------------------------------------------------
// bench(0_to_7_COLD_glibc)                                   101.72ns    9.83M
// bench(0_to_7_COLD_folly)                         242.15%    42.01ns   23.81M
// bench(0_to_16_COLD_glibc)                                  105.14ns    9.51M
// bench(0_to_16_COLD_folly)                        244.61%    42.98ns   23.26M
// bench(0_to_32_COLD_glibc)                                  108.45ns    9.22M
// bench(0_to_32_COLD_folly)                        238.48%    45.48ns   21.99M
// bench(0_to_64_COLD_glibc)                                  102.38ns    9.77M
// bench(0_to_64_COLD_folly)                        192.08%    53.30ns   18.76M
// bench(0_to_128_COLD_glibc)                                 122.86ns    8.14M
// bench(0_to_128_COLD_folly)                       198.17%    62.00ns   16.13M
// bench(0_to_256_COLD_glibc)                                 125.43ns    7.97M
// bench(0_to_256_COLD_folly)                       154.93%    80.96ns   12.35M
// bench(0_to_512_COLD_glibc)                                 161.50ns    6.19M
// bench(0_to_512_COLD_folly)                       149.92%   107.72ns    9.28M
// bench(0_to_1024_COLD_glibc)                                229.68ns    4.35M
// bench(0_to_1024_COLD_folly)                      141.36%   162.48ns    6.15M
// bench(0_to_32768_COLD_glibc)                                 2.91us  343.90K
// bench(0_to_32768_COLD_folly)                     138.83%     2.09us  477.42K
// bench(8_to_16_COLD_glibc)                                  115.47ns    8.66M
// bench(8_to_16_COLD_folly)                        242.11%    47.69ns   20.97M
// bench(16_to_32_COLD_glibc)                                 103.71ns    9.64M
// bench(16_to_32_COLD_folly)                       207.16%    50.06ns   19.98M
// bench(32_to_256_COLD_glibc)                                141.85ns    7.05M
// bench(32_to_256_COLD_folly)                      179.79%    78.90ns   12.67M
// bench(256_to_1024_COLD_glibc)                              236.81ns    4.22M
// bench(256_to_1024_COLD_folly)                    110.72%   213.88ns    4.68M
// bench(1024_to_8192_COLD_glibc)                             911.56ns    1.10M
// bench(1024_to_8192_COLD_folly)                   120.27%   757.90ns    1.32M
// ----------------------------------------------------------------------------
// bench(64k_to_1024k_unaligned_cold_glibc)                    70.17us   14.25K
// bench(64k_to_1024k_unaligned_cold_folly)         129.15%    54.34us   18.40K
// bench(64k_to_1024k_aligned_cold_glibc)                      69.28us   14.43K
// bench(64k_to_1024k_aligned_cold_folly)           246.52%    28.10us   35.58K
// ============================================================================

// Entry point: parses command-line flags with gflags, then runs every
// benchmark registered in this translation unit.
int main(int argc, char** argv) {
  // Third argument of ParseCommandLineFlags (gflags' `remove_flags`).
  constexpr bool kRemoveFlags = true;
  gflags::ParseCommandLineFlags(&argc, &argv, kRemoveFlags);

  folly::runBenchmarks();

  return 0;
}