// chromium/third_party/eigen3/src/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <[email protected]>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
#endif

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

/** \class TensorReduction
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor reduction class.
 *
 */
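// Example (illustrative only; sizes and dimension indices are arbitrary): the
// reduction machinery in this file backs the high-level TensorBase reductions
// such as sum(), mean(), maximum(), minimum() and prod(), e.g.
//
//   Eigen::Tensor<float, 3> input(4, 5, 6);
//   input.setRandom();
//   Eigen::array<Eigen::Index, 2> dims{{0, 2}};
//   Eigen::Tensor<float, 1> reduced = input.sum(dims);  // rank 1, size 5
//
// Each such call lowers to a TensorReductionOp parameterized by the
// corresponding reducer functor (e.g. internal::SumReducer<float>).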

namespace internal {
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_>> : traits<XprType> {};

template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {};

template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
              typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>>::type> {};

template <typename OutputDims>
struct DimInitializer {};

template <>
struct DimInitializer<Sizes<> > {};

template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {};

template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {};
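// Intuition: for a ColMajor rank-4 tensor, reducing dims {0, 1} reduces the
// inner-most (contiguous) dimensions, so are_inner_most_dims is satisfied,
// while reducing dims {2, 3} leaves dimension 0 untouched, so
// preserve_inner_most_dims is satisfied. For RowMajor the roles are mirrored,
// with the last dimension being the inner-most one.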

template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {};
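// GenericDimReducer<DimIndex> walks the reduced dimensions recursively: each
// level loops over one reduced dimension and recurses into DimIndex - 1, the
// DimIndex == 0 specialization feeds individual coefficients to the reducer,
// and the DimIndex == -1 specialization covers the degenerate case of
// reducing a single coefficient.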

template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
                                   // GPU threads can quickly run out of stack space
                                   // for moderately sized inputs.
                                   !Self::RunningOnGPU)>
struct InnerMostDimReducer {};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {};

#if !defined(EIGEN_HIPCC)

// The following implements tree-based reduction, which improves the accuracy
// of sum and mean reductions, since each of the n inputs only participates in
// O(log n) additions.
template <typename T>
EIGEN_DEVICE_FUNC inline Index LeafSize() { return 1024; }
// Smaller leaves for the low-precision types, whose limited mantissa
// accumulates rounding error faster in the sequential leaf reduction.
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<half>() { return 200; }
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() { return 128; }
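
// Sketch of the tree reduction implemented by the specializations below
// (simplified; the real code also reduces small inputs sequentially and
// packetizes the leaves when possible):
//
//   reduce(first, n):
//     if n <= LeafSize<T>(): accumulate first .. first + n sequentially
//     else: split at n / 2, reduce both halves, combine the two results
//
// Splitting in half means each input takes part in only O(log n) additions,
// which is what makes sum and mean reductions more accurate than a plain
// left-to-right accumulation.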

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> {};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {};
#endif

template <int DimIndex, typename Self, typename Op,
          bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {};

// Default full reducer
template <typename Self, typename Op, typename Device,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {};

#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                        typename Self::Index numValuesToReduce, Op& reducer,
                                                        typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
                              TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize = num_coeffs / num_threads;
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
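    // Worked example of the sharding arithmetic (illustrative numbers): with
    // num_coeffs == 1000 and num_threads == 8, blocksize == 125 and
    // numblocks == 8, so the shards cover every coefficient and the tail
    // shard below starts from reducer.initialize(); with num_threads == 7,
    // blocksize == 142 and numblocks == 7, leaving 6 trailing coefficients
    // for the tail shard.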
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
                                  blocksize, reducer, &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
                                                                       num_coeffs - numblocks * blocksize, reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif

// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {};
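
// Roughly speaking, for a ColMajor matrix of shape (m, n), reducing dimension
// 0 is an "inner" reduction (each output coefficient reduces m contiguous
// inputs), while reducing dimension 1 is an "outer" reduction (each output
// coefficient reduces n inputs strided by m). Devices provide optimized
// specializations of these reducers where it pays off.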

#ifdef EIGEN_USE_SYCL
// Default Generic reducer
template <typename Self, typename Op, typename Device>
struct GenericReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#endif

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                 unsigned int*);

#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
                                                                          internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);

#endif

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

/**
 * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
 * This allows the reduction to have a different type for the accumulator than the input data type.
 * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input
 * into the accumulator, and the other for reducing two accumulators together.
 * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
 * some properties of the input.
 */
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {};
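
// Illustrative sketch of such a reducer (hypothetical, not shipped with
// Eigen): it accumulates float inputs into a bool recording whether any input
// was negative, so the accumulator type differs from the input type and both
// reduce overloads described above are needed.
//
//   struct AnyNegativeReducer {
//     EIGEN_DEVICE_FUNC bool initialize() const { return false; }
//     // Reduce an input element into the accumulator.
//     EIGEN_DEVICE_FUNC void reduce(const float t, bool* accum) const {
//       *accum = *accum || (t < 0.0f);
//     }
//     // Reduce two accumulators, e.g. when partial results are combined.
//     EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const {
//       *accum = *accum || t;
//     }
//     EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
//   };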

}  // end namespace internal

template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {};

template <typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;

// Eval as rvalue
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {};

template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {};

template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
                                          Eigen::SyclDevice> {};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H