// Copyright 2022 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/objects/simd.h" #include "src/base/cpu.h" #include "src/codegen/cpu-features.h" #include "src/objects/compressed-slots.h" #include "src/objects/fixed-array-inl.h" #include "src/objects/heap-number-inl.h" #include "src/objects/smi-inl.h" #ifdef _MSC_VER // MSVC doesn't define SSE3. However, it does define AVX, and AVX implies SSE3. #ifdef __AVX__ #ifndef __SSE3__ #define __SSE3__ #endif #endif #endif #ifdef __SSE3__ #include <immintrin.h> #endif #ifdef V8_HOST_ARCH_ARM64 // We use Neon only on 64-bit ARM (because on 32-bit, some instructions and some // types are not available). Note that ARM64 is guaranteed to have Neon. #define NEON64 #include <arm_neon.h> #endif namespace v8 { namespace internal { namespace { enum class SimdKinds { … }; inline SimdKinds get_vectorization_kind() { … } // Searches for |search_element| in |array| using a simple non-vectorized linear // search. This is used as a fall-back when SIMD are not available, and to // process the end of arrays than SIMD cannot process. template <typename T> inline uintptr_t slow_search(T* array, uintptr_t array_len, uintptr_t index, T search_element) { … } #ifdef NEON64 // extract_first_nonzero_index returns the first non-zero index in |v|. |v| is a // Neon vector that can be either 32x4 (the return is then 0, 1, 2 or 3) or 64x2 // (the return is then 0 or 1). This is more or less equivalent to doing a // movemask followed by a tzcnt on Intel. // // The input |v| should be a vector of -1 or 0 (for instance {0, 0}, // {0, -1, 0, -1}, {0, -1, 0, 0}), where -1 represents a match (and 0 a // non-match), that was obtained by doing a vceqq. This function extract the // index of the first non-zero item of the vector. 
// To do so, we "and" the vector
// with {4, 3, 2, 1} (each number is "4 - the index of the item it's in"), which
// produces a vector of "indices or 0". Then, we extract the maximum of this
// vector, which is the index of the 1st match. An example:
//
//   v = {-1, 0, 0, -1}
//   mask = {4, 3, 2, 1}
//   v & mask = {4, 0, 0, 1}
//   max(v & mask) = 4
//   index of the first match = 4-max = 4-4 = 0
//
// With MSVC, uint32x4_t and uint64x2_t typedef to a union, where first member
// is uint64_t[2], and not uint32_t[4].
// C++ standard dictates that a union can only be initialized through its first
// member, which forces us to have uint64_t[2] for definition.
// (PACK32x4 macro bodies elided in this chunk.)
#if defined(_MSC_VER) && !defined(__clang__)
#define PACK32x4 …
#else
#define PACK32x4 …
#endif  // MSVC workaround

// Returns the index (0..3) of the first -1 lane of a vceqq-produced 32x4 mask.
// See the explanation above: and with {4,3,2,1}, take the horizontal max,
// subtract from 4.
V8_ALLOW_UNUSED inline int extract_first_nonzero_index_uint32x4_t(
    uint32x4_t v) {
  uint32x4_t mask = PACK32x4(4, 3, 2, 1);
  mask = vandq_u32(mask, v);
  return 4 - vmaxvq_u32(mask);
}

// Same idea for a 64x2 mask: the 64-bit lanes are reinterpreted as four
// 32-bit lanes, so only one 32-bit half of each 64-bit lane needs a non-zero
// weight in the mask.
inline int extract_first_nonzero_index_uint64x2_t(uint64x2_t v) {
  uint32x4_t mask =
      PACK32x4(2, 0, 1, 0);  // Could also be {2,2,1,1} or {0,2,0,1}
  mask = vandq_u32(mask, vreinterpretq_u32_u64(v));
  return 2 - vmaxvq_u32(mask);
}

// Horizontal max of |v| viewed as four uint32 lanes (there is no vmaxvq_u64).
// NOTE(review): for the all-0/-1 vceqq masks the callers appear to use, this
// is non-zero iff some lane matched — confirm against the elided loop bodies.
inline int32_t reinterpret_vmaxvq_u64(uint64x2_t v) {
  return vmaxvq_u32(vreinterpretq_u32_u64(v));
}
#endif

// Vectorized inner-loop bodies shared by the search functions.
// (Macro bodies elided in this chunk.)
#define VECTORIZED_LOOP_Neon …

#define VECTORIZED_LOOP_x86 …

// Uses SIMD to vectorize the search loop. This function should only be called
// for large-ish arrays. Note that nothing will break if |array_len| is less
// than vectorization_threshold: things will just be slower than necessary.
// (Body elided in this chunk.)
template <typename T>
inline uintptr_t fast_search_noavx(T* array, uintptr_t array_len,
                                   uintptr_t index, T search_element) { … }

#if defined(_MSC_VER) && defined(__clang__)
// Generating AVX2 code with Clang on Windows without the /arch:AVX2 flag does
// not seem possible at the moment.
// (Macro body elided in this chunk.)
#define IS_CLANG_WIN …
#endif

// Since we don't compile with -mavx or -mavx2 (or /arch:AVX2 on MSVC), Clang
// and MSVC do not define __AVX__ nor __AVX2__. Thus, if __SSE3__ is defined, we
// generate the AVX2 code, and, at runtime, we'll decide to call it or not,
// depending on whether the CPU supports AVX2.
#if defined(__SSE3__) && !defined(_M_IX86) && !defined(IS_CLANG_WIN)
#ifdef _MSC_VER
#define TARGET_AVX2
#else
#define TARGET_AVX2 …
#endif
// AVX2 variant of the vectorized search. Only call after checking at runtime
// that the CPU supports AVX2. (Body elided in this chunk.)
template <typename T>
TARGET_AVX2 inline uintptr_t fast_search_avx(T* array, uintptr_t array_len,
                                             uintptr_t index,
                                             T search_element) { … }
#undef TARGET_AVX2
#elif defined(IS_CLANG_WIN)
// Clang-on-Windows cannot emit AVX2 here (see IS_CLANG_WIN above), so this
// variant simply delegates to the SSE implementation.
template <typename T>
inline uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index,
                                 T search_element) {
  // Falling back to SSE version
  return fast_search_noavx(array, array_len, index, search_element);
}
#else
// No SSE3 at all: the AVX path must never be selected on this build.
template <typename T>
uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index,
                          T search_element) {
  UNREACHABLE();
}
#endif  // ifdef __SSE3__

#undef IS_CLANG_WIN
#undef VECTORIZED_LOOP_Neon
#undef VECTORIZED_LOOP_x86

// Dispatches to the fastest available search implementation.
// (Body elided in this chunk — presumably keyed on get_vectorization_kind();
// confirm against the full file.)
template <typename T>
inline uintptr_t search(T* array, uintptr_t array_len, uintptr_t index,
                        T search_element) { … }

// (Enumerators elided in this chunk.)
enum class ArrayIndexOfIncludesKind { … };

// ArrayIndexOfIncludes only handles cases that can be efficiently
// vectorized:
//
//   * Searching for a Smi in a Smi array
//
//   * Searching for a Smi or Double in a Double array
//
//   * Searching for an object in an object array.
//
// Other cases should be dealt with either with the CSA builtin or with the
// inlined optimized code.
// Shared implementation behind the two exported entry points below; |kind|
// selects the element representation. (Body elided in this chunk.)
template <ArrayIndexOfIncludesKind kind>
Address ArrayIndexOfIncludes(Address array_start, uintptr_t array_len,
                             uintptr_t from_index, Address search_element) {
  …
}

}  // namespace

// Exported entry point for Smi/object arrays.
// (Body elided in this chunk — presumably forwards to
// ArrayIndexOfIncludes<...>; confirm against the full file.)
uintptr_t ArrayIndexOfIncludesSmiOrObject(Address array_start,
                                          uintptr_t array_len,
                                          uintptr_t from_index,
                                          Address search_element) { … }

// Exported entry point for double arrays.
// (Body elided in this chunk.)
uintptr_t ArrayIndexOfIncludesDouble(Address array_start, uintptr_t array_len,
                                     uintptr_t from_index,
                                     Address search_element) { … }

// Undefine the file-local feature macro so it cannot leak past this file.
#ifdef NEON64
#undef NEON64
#endif

}  // namespace internal
}  // namespace v8