X86InterleavedAccess.cpp | Explore in Territory

//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

usingnamespacellvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup { … };

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const { … }

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) { … }

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) { … }

static constexpr int Concat[] = …;

// genShuffleBland - Creates shuffle according to two vectors.This function is
// only works on instructions with lane inside 256 registers. According to
// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
// Where the 'LowOffset' refers to the first vector and the highOffset refers to
// the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load.
// We must consider the elements order as above.
// In this function we are combining two types of shuffles.
// The first one is vpshufed and the second is a type of "blend" shuffle.
// By computing the shuffle on a sequence of 16 elements(one lane) and add the
// correct offset. We are creating a vpsuffed + blend sequence between two
// shuffles.
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) { … }

// reorderSubVector returns the data to is the original state. And de-facto is
// the opposite of  the function concatSubVector.

// For VecElems = 16
// Invec[0] -  |0|      TransposedMatrix[0] - |0|
// Invec[1] -  |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] -  |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] -  |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] -  |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] -  |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] -  |0|3|6|9 |     TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] -  |1|4|7|10| =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] -  |2|5|8|11|     TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) { … }

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) { … }

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) { … }

//  createShuffleStride returns shuffle mask of size N.
//  The shuffle pattern is as following :
//  {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
//  (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
//  Where Lane is the # of lanes in a register:
//  VectorSize = 128 => Lane = 1
//  VectorSize = 256 => Lane = 2
//  For example shuffle pattern for VF 16 register size 256 -> lanes = 2
//  {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) { … }

//  setGroupSize sets 'SizeInfo' to the size(number of elements) of group
//  inside mask a shuffleMask. A mask contains exactly 3 groups, where
//  each group is a monotonically increasing sequence with stride 3.
//  For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) { … }

//  DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
//  vpalign works according to lanes
//  Where Lane is the # of lanes in a register:
//  VectorWide = 128 => Lane = 1
//  VectorWide = 256 => Lane = 2
//  For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
//  For Lane = 2 shuffle pattern is:
//  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
//  Imm variable sets the offset amount. The result of the
//  function is stored inside ShuffleMask vector and it built as described in
//  the begin of the description. AlignDirection is a boolean that indicates the
//  direction of the alignment. (false - align to the "right" side while true -
//  align to the "left" side)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) { … }

// concatSubVector - The function rebuilds the data to a correct expected
// order. An assumption(The shape of the matrix) was taken for the
// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
// This function ensures that the data is built in correct way for the lane
// instructions. Each lane inside the vector is a 128-bit length.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0] You
// can find the first 128 bit data. The number of different lanes inside a
// vector depends on the 'VecElems'.In general, the formula is
// VecElems * type / 128. The size of the array 'InVec' depends and equal to
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) { … }

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) { … }

// group2Shuffle reorder the shuffle stride back into continuous order.
// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) { … }

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) { … }

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) { … }

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { … }

// Lower interleaved load(s) into target specific instructions/
// intrinsics. Lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const { … }

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const { … }
llvm/llvm/lib/Target/X86/X86InterleavedAccess.cpp