//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup { … };

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const { … }

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) { … }

// Changes the scale of the vector type by halving the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) { … }

static constexpr int Concat[] = …;

// genShuffleBland - Creates a shuffle mask from two source vectors. This
// function only works on instructions whose lanes live inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to each element. The offset depends on two integers, 'LowOffset' and
// 'HighOffset', where 'LowOffset' applies to elements taken from the first
// vector and 'HighOffset' applies to elements taken from the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to mirror the load, we must consider the element order as
// shown above. In this function we combine two kinds of shuffles: the first
// is a vpshufd-style shuffle and the second is a "blend"-style shuffle. By
// computing the shuffle on one lane of 16 elements and adding the correct
// offset, we create a vpshufd + blend sequence between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) { … }
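
// The following standalone sketch is illustrative only and is not used by the
// lowering code in this file; the helper name 'sketchShuffleWithOffset' and
// the assumption of a single 16-element lane mask are invented for the
// example. It shows the core idea described above: the lane mask is emitted
// once per source vector, shifted by the corresponding offset.
[[maybe_unused]] static void sketchShuffleWithOffset(ArrayRef<int> Mask,
                                                     SmallVectorImpl<int> &Out,
                                                     int LowOffset,
                                                     int HighOffset) {
  // Indices selecting elements from the first source vector.
  for (int M : Mask)
    Out.push_back(M + LowOffset);
  // Indices selecting elements from the second source vector, expressed in
  // the concatenated (two-input shuffle) index space.
  for (int M : Mask)
    Out.push_back(M + HighOffset);
}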
// reorderSubVector returns the data to its original state; it is effectively
// the inverse of concatSubVector.
// For VecElems = 16
// Invec[0] - |0|          TransposedMatrix[0] - |0|
// Invec[1] - |1|    =>    TransposedMatrix[1] - |1|
// Invec[2] - |2|          TransposedMatrix[2] - |2|
// For VecElems = 32
// Invec[0] - |0|3|        TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4|   =>   TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|        TransposedMatrix[2] - |4|5|
// For VecElems = 64
// Invec[0] - |0|3|6|9 |        TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10|   =>   TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|        TransposedMatrix[2] - |8|9|10|11|
static void reorderSubVector(MVT VT,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) { … }

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) { … }

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) { … }

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF = 16 with register size 256
// (Lane = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) { … }
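
// The following standalone sketch is illustrative only and is not used by the
// lowering code in this file; the helper name 'sketchCreateShuffleStride' and
// its parameters are invented for the example. It builds the per-lane strided
// mask described above: for VF = 16, a 256-bit register (2 lanes) and
// Stride = 3 it produces {0,3,6,1,4,7,2,5, 8,11,14,9,12,15,10,13}.
[[maybe_unused]] static void
sketchCreateShuffleStride(unsigned VF, unsigned NumLanes, unsigned Stride,
                          SmallVectorImpl<int> &Mask) {
  unsigned LaneSize = VF / NumLanes;
  for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
    for (unsigned I = 0; I < LaneSize; ++I)
      // Walk each lane with the given stride, wrapping inside the lane, then
      // rebase the index into that lane's slice of the full vector.
      Mask.push_back(int((I * Stride) % LaneSize + Lane * LaneSize));
}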
// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
// inside a shuffle mask. A mask contains exactly 3 groups, where each group
// is a monotonically increasing sequence with stride 3.
// For example, the shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) { … }

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr operates within lanes,
// where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm argument sets the offset amount. The result of the function is
// stored in the ShuffleMask vector and is built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true,
                              bool Unary = false) { … }

// concatSubVector - Rebuilds the data into the correct expected order. The
// deinterleave sequence assumes a particular matrix shape so that it can use
// in-lane instructions such as 'vpalignr' or 'vpshufb'. This function ensures
// the data is laid out correctly for those lane instructions. Each lane
// inside the vector is 128 bits wide.
//
// The 'InVec' argument contains the data in increasing order: InVec[0] holds
// the first 128 bits of data. The number of different lanes inside a vector
// depends on 'VecElems'; in general, the formula is
// VecElems * ElementSize / 128. The size of the 'InVec' array is equal to
// 'VecElems'.
// For VecElems = 16
// Invec[0] - |0|          Vec[0] - |0|
// Invec[1] - |1|    =>    Vec[1] - |1|
// Invec[2] - |2|          Vec[2] - |2|
// For VecElems = 32
// Invec[0] - |0|1|        Vec[0] - |0|3|
// Invec[1] - |2|3|   =>   Vec[1] - |1|4|
// Invec[2] - |4|5|        Vec[2] - |2|5|
// For VecElems = 64
// Invec[0] - |0|1|2 |3 |        Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |   =>   Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|        Vec[2] - |2|5|8|11|
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) { … }

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) { … }

// group2Shuffle reorders the strided shuffle mask back into contiguous order.
// For example, for VF = 16 with
// Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) { … }

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) { … }

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) { … }

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { … }

// Lower interleaved load(s) into target-specific instructions/intrinsics.
// The lowering sequence varies depending on the vector types, factor, number
// of shuffles and ISA.
// Currently, lowering is supported for 4 x 64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const { … }

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const { … }
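
// The following standalone sketch is illustrative only and is not used by the
// lowering code in this file; the helper name 'sketchInterleaveMask' and its
// parameters are invented for the example. It builds the shuffle mask that a
// factor-F interleaved store conceptually needs: field F of element I lives
// at index F*VF + I in the concatenated input, and the interleaved output
// wants the F fields of each element adjacent. (VectorUtils.h, included
// above, provides createInterleaveMask for the same purpose.)
[[maybe_unused]] static void sketchInterleaveMask(unsigned VF, unsigned Factor,
                                                  SmallVectorImpl<int> &Mask) {
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned F = 0; F < Factor; ++F)
      // E.g. VF = 4, Factor = 4 => {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15}.
      Mask.push_back(int(F * VF + I));
}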