folly/folly/algorithm/simd/detail/SimdForEach.h

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <folly/CPortability.h>
#include <folly/Traits.h>
#include <folly/algorithm/simd/detail/UnrollUtils.h>

#include <array>
#include <cstdint>
#include <type_traits>

namespace folly {
namespace simd_detail {

// Based on
// https://github.com/jfalcou/eve/blob/5264e20c51aeca17675e67abf236ce1ead781c52/include/eve/module/algo/algo/for_each_iteration.hpp#L148
//
// Everything is ALWAYS_INLINE because we want to have one top level noinline
// function that does everything. Otherwise sometimes the compiler tends
// to mess that up.
//

/**
 * ignore(_none/_extrema)
 *
 * Tag types for handling the tails.
 * ignore_none indicates that the whole register is used.
 * ignore_extrema.first, .last show how many elements are out of the data.
 *
 * For example 3 elements, starting from the second for an 8 element register
 * will be ignore_extrema{.first = 1, .last = 4}
 */
struct ignore_extrema {};

struct ignore_none {};

/**
 * simdForEachAligning<unrolling>(cardinal, f, l, delegate);
 *
 * The main idea is that you can read from the memory within the page.
 * The beginning and end of the page are aligned to 4KB.
 * That means, the previous aligned address is always safe to read from
 * (requires asan disablement). This is how strlen works
 * https://stackoverflow.com/questions/25566302/vectorized-strlen-getting-away-with-reading-unallocated-memory
 *
 * The interface parameters are:
 * - unrolling: by how much do you want to unroll the main loop. 4 is a good
 * default for simple operations.
 * - cardinal: how big is your register in elements.
 * - f, l: [first, last) range
 * - delegate:
 *     conceptually a callback but has 2 different operations.
 *     - bool step(T*, ignore): to process one register.
 *       Is called for tails and gets ignore_none/ignore_extrema.
 *     - bool unrolledStep(std::array<T*, unrolling>) -
 *       to process the unrolled part. unrolledStep is not called
 *       if unrolling == 1.
 *   Both step and unrolledStep should return true if they would like to break.
 *   Delegate is passed by reference so that the caller can store state in it.
 */
template <int unrolling, typename T, typename Delegate>
FOLLY_ALWAYS_INLINE void simdForEachAligning(
    int cardinal, T* f, T* l, Delegate& delegate);

/**
 * previousAlignedAddress
 *
 * Given a pointer returns a closest pointer aligned to a given size.
 * (it just masks out some lower bits)
 */
template <typename T>
FOLLY_ALWAYS_INLINE T* previousAlignedAddress(T* ptr, int to) {}

/**
 * SimdForEachMainLoop
 *
 * Implementaiton detail of simdForEach
 *
 * Regardless of how you chose to handle tails, the middle will be the same.
 * The operator() returns true if the delegate returned to break.
 *
 * There are two variations:
 * - no unrolling (unroll<1>)
 * - unrolling > 1
 *
 * For explanation of parameters see simdForEachAligning
 */
struct SimdForEachMainLoop {};

// Comment is at the top of the file.
template <int unrolling, typename T, typename Delegate>
FOLLY_ALWAYS_INLINE void simdForEachAligning(
    int cardinal, T* f, T* l, Delegate& delegate) {}

} // namespace simd_detail
} // namespace folly