#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
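// KERNEL_FRIEND is the keyword sequence used to befriend the GPU reduction
// kernels from the evaluator below. Clang's CUDA/HIP front end needs a
// different friend declaration syntax than the other compilers, hence the
// two branches.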
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND …
#else
#define KERNEL_FRIEND …
#endif
#endif
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
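// Standard expression-template boilerplate: traits, eval and nested
// specializations for TensorReductionOp.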
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_>> { … };
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> { … };
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
              typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>>::type> { … };
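// DimInitializer splits the input dimensions into the preserved (output)
// dimensions and the reduced dimensions. The Sizes<> specialization covers
// full reductions, where every dimension is reduced and the output is a
// single coefficient.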
template <typename OutputDims>
struct DimInitializer { … };
template <>
struct DimInitializer<Sizes<> > { … };
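// Compile-time tests on the set of reduced dimensions: are_inner_most_dims
// checks whether the reduced dimensions are exactly the innermost ones for
// the given layout, and preserve_inner_most_dims checks whether the innermost
// dimensions are left untouched. Both enable the contiguous-memory fast paths
// below.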
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims { … };
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims { … };
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> { … };
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> { … };
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> { … };
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> { … };
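// GenericDimReducer recursively walks the reduced dimensions (DimIndex counts
// down to -1) and feeds every coefficient to the reducer. This is the generic
// fallback used when no vectorized fast path applies.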
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer { … };
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> { … };
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> { … };
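// InnerMostDimReducer reduces numValuesToReduce contiguous coefficients
// starting at firstIndex. The two boolean flags select between the scalar,
// packet (vectorized) and tree-reduction variants.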
template <typename Self, typename Op,
bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
!Self::RunningOnGPU)>
struct InnerMostDimReducer { … };
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> { … };
#if !defined(EIGEN_HIPCC)
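// Tree reduction is used when the reducer is not exactly associative (e.g.
// floating-point sums): the input is split recursively, and subtrees smaller
// than LeafSize() are reduced with a plain loop. The smaller leaves for half
// and bfloat16 keep low-precision accumulation chains short.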
template <typename T>
EIGEN_DEVICE_FUNC inline Index LeafSize() { … }
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<half>() { … }
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() { … }
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> { … };
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> { … };
#endif
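// InnerMostDimPreserver handles the case where the innermost (contiguous)
// dimension is preserved rather than reduced: it accumulates whole packets
// along that dimension.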
template <int DimIndex, typename Self, typename Op,
bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver { … };
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> { … };
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> { … };
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> { … };
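// FullReducer collapses the whole tensor into a single value. The default
// implementation runs sequentially on the calling thread; device-specific
// specializations (thread pool below, GPU/SYCL in their own headers) provide
// parallel versions.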
template <typename Self, typename Op, typename Device,
bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer { … };
#ifdef EIGEN_USE_THREADS
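// Multithreaded full reduction: the input is split into equal-size blocks,
// each block is reduced on the thread pool through FullReducerShard, and the
// per-block results (plus the leftover tail, reduced on the calling thread)
// are combined once the barrier is released.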
template <typename Self, typename Op,
bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
}
};
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;
static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
typename Self::CoeffReturnType* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
if (num_coeffs == 0) {
*output = reducer.finalize(reducer.initialize());
return;
}
const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
if (num_threads == 1) {
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
const Index blocksize = num_coeffs / num_threads;
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
blocksize, reducer, &shards[i]);
}
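    // Reduce the tail that is not covered by the equal-size blocks on the
    // calling thread while the shards run on the pool.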
typename Self::CoeffReturnType finalShard;
if (numblocks * blocksize < num_coeffs) {
finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
num_coeffs - numblocks * blocksize, reducer);
} else {
finalShard = reducer.initialize();
}
barrier.Wait();
for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
}
};
#endif  // EIGEN_USE_THREADS
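// InnerReducer / OuterReducer reduce along the innermost / outermost
// dimensions of the flattened input and write one output value per preserved
// slot. The generic versions here are not optimized; device-specific headers
// specialize them.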
template <typename Self, typename Op, typename Device>
struct InnerReducer { … };
template <typename Self, typename Op, typename Device>
struct OuterReducer { … };
#ifdef EIGEN_USE_SYCL
template <typename Self, typename Op, typename Device>
struct GenericReducer {
static constexpr bool HasOptimizedImplementation = false;
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
typename Self::Index, typename Self::Index) {
eigen_assert(false && "Not implemented");
return true;
}
};
#endif  // EIGEN_USE_SYCL
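// Forward declarations of the GPU reduction kernels so that the evaluator can
// befriend them (see KERNEL_FRIEND above). Their definitions live in the
// GPU-specific reduction code.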
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
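// Deduces the scalar type produced by the reduction from the reducer and the
// coefficient type.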
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType { … };
}  // end namespace internal
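/** \class TensorReductionOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Expression of the reduction of a tensor's coefficients over a set of
 * dimensions.
 *
 * A sketch of typical usage: reductions are normally built through the
 * TensorBase helpers (sum(), maximum(), mean(), ...) rather than by
 * instantiating this class directly.
 *
 * \code
 * Eigen::Tensor<float, 3> t(2, 3, 4);
 * t.setRandom();
 * Eigen::array<Eigen::Index, 2> dims{{1, 2}};
 * // Reduce over dimensions 1 and 2, leaving a rank-1 tensor of size 2.
 * Eigen::Tensor<float, 1> sums = t.sum(dims);
 * \endcode
 */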
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> { … };
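// The evaluator. TensorReductionEvaluatorBase holds the shared implementation
// (bookkeeping of reduced vs. preserved dimensions and dispatch to the
// FullReducer / InnerReducer / OuterReducer fast paths when the device
// provides optimized versions); the TensorEvaluator specializations below
// forward to it.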
template <typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> { … };
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> { … };
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> { … };
}  // end namespace Eigen
#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H