folly/folly/futures/test/Benchmark.cpp

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/Benchmark.h>

#include <vector>

#include <folly/executors/InlineExecutor.h>
#include <folly/futures/Future.h>
#include <folly/futures/Promise.h>
#include <folly/futures/test/TestExecutor.h>
#include <folly/portability/GFlags.h>
#include <folly/synchronization/Baton.h>
#include <folly/synchronization/NativeSemaphore.h>

using namespace folly;

namespace {

template <class T>
T incr(Try<T>&& t) {
  return t.value() + 1;
}

Future<int> thens(Future<int> f, size_t n, bool runInline = false) {
  for (size_t i = 0; i < n; i++) {
    if (runInline) {
      f = std::move(f).thenInline(incr<int>);
    } else {
      f = std::move(f).then(incr<int>);
    }
  }
  return f;
}

void someThens(size_t n) {
  auto f = makeFuture<int>(42);
  f = thens(std::move(f), n);
}

void someThensOnThread(size_t n, bool runInline = false) {
  auto executor = std::make_unique<TestExecutor>(1);
  auto f = makeFuture<int>(42).via(executor.get());
  f = thens(std::move(f), n / 2, runInline);
  f = thens(std::move(f), 1, false);
  f = thens(std::move(f), n / 2, runInline);
  f.wait();
}

} // namespace

BENCHMARK(constantFuture) {
  makeFuture(42);
}

BENCHMARK_RELATIVE(promiseAndFuture) {
  Promise<int> p;
  Future<int> f = p.getFuture();
  p.setValue(42);
  f.value();
}

BENCHMARK_RELATIVE(withThen) {
  Promise<int> p;
  Future<int> f = p.getFuture().then(incr<int>);
  p.setValue(42);
  f.value();
}

// thens
BENCHMARK_DRAW_LINE();

BENCHMARK(oneThen) {
  someThens(1);
}

// look for >= 50% relative
BENCHMARK_RELATIVE(twoThens) {
  someThens(2);
}

// look for >= 25% relative
BENCHMARK_RELATIVE(fourThens) {
  someThens(4);
}

// look for >= 1% relative
BENCHMARK_RELATIVE(hundredThens) {
  someThens(100);
}

BENCHMARK_DRAW_LINE();

// look for >= 25% relative
BENCHMARK_RELATIVE(fourThensOnThread) {
  someThensOnThread(4);
}

// look for >= 25% relative
BENCHMARK_RELATIVE(fourThensOnThreadInline) {
  someThensOnThread(4, true);
}

// look for >= 1% relative
BENCHMARK_RELATIVE(hundredThensOnThread) {
  someThensOnThread(100);
}

// look for >= 1% relative
BENCHMARK_RELATIVE(hundredThensOnThreadInline) {
  someThensOnThread(100, true);
}

// Lock contention. Although in practice fulfills tend to be temporally
// separate from then()s, still sometimes they will be concurrent. So the
// higher this number is, the better.
BENCHMARK_DRAW_LINE();

BENCHMARK(no_contention) {
  std::vector<Promise<int>> promises(10000);
  std::vector<Future<int>> futures;
  std::thread producer, consumer;

  BENCHMARK_SUSPEND {
    folly::Baton<> b1, b2;
    for (auto& p : promises) {
      futures.push_back(p.getFuture());
    }

    consumer = std::thread([&] {
      b1.post();
      for (auto& f : futures) {
        std::move(f).then(incr<int>);
      }
    });
    consumer.join();

    producer = std::thread([&] {
      b2.post();
      for (auto& p : promises) {
        p.setValue(42);
      }
    });

    b1.wait();
    b2.wait();
  }

  // The only thing we are measuring is how long fulfill + callbacks take
  producer.join();
}

BENCHMARK_RELATIVE(contention) {
  std::vector<Promise<int>> promises(10000);
  std::vector<Future<int>> futures;
  std::thread producer, consumer;
  folly::NativeSemaphore sem;

  BENCHMARK_SUSPEND {
    folly::Baton<> b1, b2;
    for (auto& p : promises) {
      futures.push_back(p.getFuture());
    }

    consumer = std::thread([&] {
      b1.post();
      for (auto& f : futures) {
        sem.wait();
        std::move(f).then(incr<int>);
      }
    });

    producer = std::thread([&] {
      b2.post();
      for (auto& p : promises) {
        sem.post();
        p.setValue(42);
      }
    });

    b1.wait();
    b2.wait();
  }

  // The astute reader will notice that we're not *precisely* comparing apples
  // to apples here. Well, maybe it's like comparing Granny Smith to
  // Braeburn or something. In the serial version, we waited for the futures
  // to be all set up, but here we are probably still doing that work
  // (although in parallel). But even though there is more work (on the order
  // of 2x), it is being done by two threads. Hopefully most of the difference
  // we see is due to lock contention and not false parallelism.
  //
  // Be warned that if the box is under heavy load, this will greatly skew
  // these results (scheduling overhead will begin to dwarf lock contention).
  // I'm not sure but I'd guess in Windtunnel this will mean large variance,
  // because I expect they load the boxes as much as they can?
  consumer.join();
  producer.join();
}

BENCHMARK_DRAW_LINE();

// The old way. Throw an exception, and rethrow to access it upstream.
void throwAndCatchImpl() {
  makeFuture()
      .then([](Try<Unit>&&) { throw std::runtime_error("oh no"); })
      .then([](Try<Unit>&& t) {
        try {
          t.value();
        } catch (const std::runtime_error&) {
          // ...
          return;
        }
        CHECK(false);
      });
}

// Not much better. Throw an exception, and access it via the wrapper upstream.
// Actually a little worse due to wrapper overhead. then() won't know that the
// exception is a runtime_error, so will have to store it as an exception_ptr
// anyways. withException will therefore have to rethrow. Note that if we threw
// std::exception instead, we would see some wins, as that's the type then()
// will try to wrap, so no exception_ptrs/rethrows are necessary.
void throwAndCatchWrappedImpl() {
  makeFuture()
      .then([](Try<Unit>&&) { throw std::runtime_error("oh no"); })
      .then([](Try<Unit>&& t) {
        auto caught = t.withException<std::runtime_error>(
            [](const std::runtime_error& /* e */) {
              // ...
            });
        CHECK(caught);
      });
}

// Better. Wrap an exception, and rethrow to access it upstream.
void throwWrappedAndCatchImpl() {
  makeFuture()
      .then([](Try<Unit>&&) {
        return makeFuture<Unit>(std::runtime_error("oh no"));
      })
      .then([](Try<Unit>&& t) {
        try {
          t.value();
        } catch (const std::runtime_error&) {
          // ...
          return;
        }
        CHECK(false);
      });
}

// The new way. Wrap an exception, and access it via the wrapper upstream
void throwWrappedAndCatchWrappedImpl() {
  makeFuture()
      .then([](Try<Unit>&&) {
        return makeFuture<Unit>(std::runtime_error("oh no"));
      })
      .then([](Try<Unit>&& t) {
        auto caught = t.withException<std::runtime_error>(
            [](const std::runtime_error& /* e */) {
              // ...
            });
        CHECK(caught);
      });
}

// Apparently OSX doesn't implement barriers since it is an optional part of
// POSIX realtime threads extension, therefore we test for its presence.
// Details:
// https://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap02.html
#if defined(_POSIX_BARRIERS) && (_POSIX_BARRIERS > 0)
// Simulate heavy contention on func
void contend(void (*func)()) {
  folly::BenchmarkSuspender s;
  const int N = 100;
  const int iters = 1000;
  pthread_barrier_t barrier;
  pthread_barrier_init(&barrier, nullptr, N + 1);
  std::vector<std::thread> threads;
  for (int i = 0; i < N; i++) {
    threads.push_back(std::thread([&]() {
      pthread_barrier_wait(&barrier);
      for (int j = 0; j < iters; j++) {
        func();
      }
    }));
  }
  pthread_barrier_wait(&barrier);
  s.dismiss();
  for (auto& t : threads) {
    t.join();
  }
  s.rehire();
  pthread_barrier_destroy(&barrier);
}
#endif

BENCHMARK(throwAndCatch) {
  throwAndCatchImpl();
}

BENCHMARK_RELATIVE(throwAndCatchWrapped) {
  throwAndCatchWrappedImpl();
}

BENCHMARK_RELATIVE(throwWrappedAndCatch) {
  throwWrappedAndCatchImpl();
}

BENCHMARK_RELATIVE(throwWrappedAndCatchWrapped) {
  throwWrappedAndCatchWrappedImpl();
}

BENCHMARK_DRAW_LINE();

#if defined(_POSIX_BARRIERS) && (_POSIX_BARRIERS > 0)
BENCHMARK(throwAndCatchContended) {
  contend(throwAndCatchImpl);
}

BENCHMARK_RELATIVE(throwAndCatchWrappedContended) {
  contend(throwAndCatchWrappedImpl);
}

BENCHMARK_RELATIVE(throwWrappedAndCatchContended) {
  contend(throwWrappedAndCatchImpl);
}

BENCHMARK_RELATIVE(throwWrappedAndCatchWrappedContended) {
  contend(throwWrappedAndCatchWrappedImpl);
}
BENCHMARK_DRAW_LINE();
#endif

namespace {
struct Bulky {
  explicit Bulky(std::string message) : message_(message) {}
  std::string message() & { return message_; }
  std::string&& message() && { return std::move(message_); }

 private:
  std::string message_;
  std::array<int, 1024> ints_;
};
} // anonymous namespace

BENCHMARK(lvalue_get) {
  BenchmarkSuspender suspender;
  Optional<Future<Bulky>> future;
  future = makeFuture(Bulky("Hello"));
  suspender.dismissing([&] {
    std::string message = std::move(future.value()).get().message();
    doNotOptimizeAway(message);
  });
}

BENCHMARK_RELATIVE(rvalue_get) {
  BenchmarkSuspender suspender;
  Optional<Future<Bulky>> future;
  future = makeFuture(Bulky("Hello"));
  suspender.dismissing([&] {
    std::string message = std::move(future.value()).get().message();
    doNotOptimizeAway(message);
  });
}

InlineExecutor exe;

template <class T>
Future<T> fGen() {
  Promise<T> p;
  auto f = p.getFuture()
               .thenValue([](T&& t) { return std::move(t); })
               .thenValue([](T&& t) { return makeFuture(std::move(t)); })
               .via(&exe)
               .thenValue([](T&& t) { return std::move(t); })
               .thenValue([](T&& t) { return makeFuture(std::move(t)); });
  p.setValue(T());
  return f;
}

template <class T>
std::vector<Future<T>> fsGen() {
  std::vector<Future<T>> fs;
  for (auto i = 0; i < 10; i++) {
    fs.push_back(fGen<T>());
  }
  return fs;
}

template <class T>
void complexBenchmark() {
  collect(fsGen<T>());
  collectAll(fsGen<T>());
  collectAny(fsGen<T>());
  futures::mapValue(fsGen<T>(), [](const T& t) { return t; });
  futures::mapValue(fsGen<T>(), [](const T& t) { return makeFuture(T(t)); });
}

BENCHMARK_DRAW_LINE();

template <size_t S>
struct Blob {
  char buf[S];
};

BENCHMARK(complexUnit) {
  complexBenchmark<Unit>();
}

BENCHMARK_RELATIVE(complexBlob4) {
  complexBenchmark<Blob<4>>();
}

BENCHMARK_RELATIVE(complexBlob8) {
  complexBenchmark<Blob<8>>();
}

BENCHMARK_RELATIVE(complexBlob64) {
  complexBenchmark<Blob<64>>();
}

BENCHMARK_RELATIVE(complexBlob128) {
  complexBenchmark<Blob<128>>();
}

BENCHMARK_RELATIVE(complexBlob256) {
  complexBenchmark<Blob<256>>();
}

BENCHMARK_RELATIVE(complexBlob512) {
  complexBenchmark<Blob<512>>();
}

BENCHMARK_RELATIVE(complexBlob1024) {
  complexBenchmark<Blob<1024>>();
}

BENCHMARK_RELATIVE(complexBlob2048) {
  complexBenchmark<Blob<2048>>();
}

BENCHMARK_RELATIVE(complexBlob4096) {
  complexBenchmark<Blob<4096>>();
}

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}