/* * Copyright (c) Meta Platforms, Inc. and affiliates. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include <folly/CPortability.h> #include <folly/Traits.h> #include <folly/algorithm/simd/detail/UnrollUtils.h> #include <array> #include <cstdint> #include <type_traits> namespace folly { namespace simd_detail { // Based on // https://github.com/jfalcou/eve/blob/5264e20c51aeca17675e67abf236ce1ead781c52/include/eve/module/algo/algo/for_each_iteration.hpp#L148 // // Everything is ALWAYS_INLINE because we want to have one top level noinline // function that does everything. Otherwise sometimes the compiler tends // to mess that up. // /** * ignore(_none/_extrema) * * Tag types for handling the tails. * ignore_none indicates that the whole register is used. * ignore_extrema.first, .last show how many elements are out of the data. * * For example 3 elements, starting from the second for an 8 element register * will be ignore_extrema{.first = 1, .last = 4} */ struct ignore_extrema { … }; struct ignore_none { … }; /** * simdForEachAligning<unrolling>(cardinal, f, l, delegate); * * The main idea is that you can read from the memory within the page. * The beginning and end of the page are aligned to 4KB. * That means, the previous aligned address is always safe to read from * (requires asan disablement). This is how strlen works * https://stackoverflow.com/questions/25566302/vectorized-strlen-getting-away-with-reading-unallocated-memory * * The interface parameters are: * - unrolling: by how much do you want to unroll the main loop. 4 is a good * default for simple operations. * - cardinal: how big is your register in elements. * - f, l: [first, last) range * - delegate: * conceptually a callback but has 2 different operations. * - bool step(T*, ignore): to process one register. * Is called for tails and gets ignore_none/ignore_extrema. * - bool unrolledStep(std::array<T*, unrolling>) - * to process the unrolled part. unrolledStep is not called * if unrolling == 1. * Both step and unrolledStep should return true if they would like to break. * Delegate is passed by reference so that the caller can store state in it. */ template <int unrolling, typename T, typename Delegate> FOLLY_ALWAYS_INLINE void simdForEachAligning( int cardinal, T* f, T* l, Delegate& delegate); /** * previousAlignedAddress * * Given a pointer returns a closest pointer aligned to a given size. * (it just masks out some lower bits) */ template <typename T> FOLLY_ALWAYS_INLINE T* previousAlignedAddress(T* ptr, int to) { … } /** * SimdForEachMainLoop * * Implementaiton detail of simdForEach * * Regardless of how you chose to handle tails, the middle will be the same. * The operator() returns true if the delegate returned to break. * * There are two variations: * - no unrolling (unroll<1>) * - unrolling > 1 * * For explanation of parameters see simdForEachAligning */ struct SimdForEachMainLoop { … }; // Comment is at the top of the file. template <int unrolling, typename T, typename Delegate> FOLLY_ALWAYS_INLINE void simdForEachAligning( int cardinal, T* f, T* l, Delegate& delegate) { … } } // namespace simd_detail } // namespace folly