#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType>> : public traits<XprType> { … };
template <typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> { … };
template <typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1, typename eval<TensorScanOp<Op, XprType>>::type> { … };
}  // namespace internal
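/** \class TensorScan
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor scan class.
 */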
template <typename Op, typename XprType>
class TensorScanOp : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { … };
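// Usage sketch (illustrative only, not part of this header): scan expressions are
// normally created through the TensorBase API rather than by instantiating
// TensorScanOp directly, e.g.
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   auto inclusive = t.cumsum(0);         // inclusive scan along axis 0 (SumReducer)
//   auto exclusive = t.cumprod(1, true);  // exclusive scan along axis 1 (ProdReducer)
//
// Both expressions evaluate to TensorScanOp nodes handled by the machinery below.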
namespace internal {
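// ReduceScalar / ReducePacket compute the scan along the scan axis starting at the
// given offset, one scalar (resp. one packet of adjacent scan lines) at a time,
// writing the running result into `data`.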
template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) { … }
template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) { … }
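// ReduceBlock computes one inner block of the scan (self.stride() scan lines). The
// primary template is the scalar, sequential fallback; the specializations below
// add vectorization and/or multi-threading.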
template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock { … };
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> { … };
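// Single-threaded CPU scan launcher. It walks the output in blocks of
// self.stride() * self.size() coefficients and hands each block to a ReduceBlock;
// device-specific specializations are provided further below.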
template <typename Self, typename Reducer, typename Device,
bool Vectorize = (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher { … };
#ifdef EIGEN_USE_THREADS
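// Adjust block_size to avoid false sharing of cachelines between multiple threads.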
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
EIGEN_CONSTEXPR Index kBlockAlignment = 128;
const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size);
return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
}
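// Vectorized, multi-threaded block reduction: the packet-aligned part of the block
// is scanned PacketSize lines at a time inside a parallelFor, and the scalar
// remainder is handled by a second parallelFor.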
template <typename Self>
struct ReduceBlock<Self, true, true> {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
using Packet = typename Self::PacketReturnType;
const int PacketSize = internal::unpacket_traits<Packet>::size;
Index num_scalars = self.stride();
Index num_packets = 0;
if (self.stride() >= PacketSize) {
num_packets = self.stride() / PacketSize;
self.device().parallelFor(
num_packets,
TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true,
PacketSize),
[=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); },
[&](Index first, Index last) {
for (Index packet = first; packet < last; ++packet) {
const Index idx2 = packet * PacketSize;
ReducePacket(self, idx1 + idx2, data);
}
});
num_scalars -= num_packets * PacketSize;
}
self.device().parallelFor(
num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
[=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
[&](Index first, Index last) {
for (Index scalar = first; scalar < last; ++scalar) {
const Index idx2 = num_packets * PacketSize + scalar;
ReduceScalar(self, idx1 + idx2, data);
}
});
}
};
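// Non-vectorized, multi-threaded block reduction: each of the self.stride() scan
// lines in the block is scanned sequentially inside a parallelFor.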
template <typename Self>
struct ReduceBlock<Self, false, true> {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
self.device().parallelFor(
self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
[=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
[&](Index first, Index last) {
for (Index idx2 = first; idx2 < last; ++idx2) {
ReduceScalar(self, idx1 + idx2, data);
}
});
}
};
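// Multi-threaded scan launcher. Small problems fall back to the single-threaded
// DefaultDevice launcher; otherwise the work is either parallelized across
// independent outer blocks, or within each block across scan lines (see the
// parallelize_by_outer_blocks heuristic below).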
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
void operator()(Self& self, typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
using Packet = typename Self::PacketReturnType;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const Index total_size = internal::array_prod(self.dimensions());
const Index inner_block_size = self.stride() * self.size();
bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
if ((parallelize_by_outer_blocks && total_size <= 4096) ||
(!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
launcher(self, data);
return;
}
if (parallelize_by_outer_blocks) {
const Index num_outer_blocks = total_size / inner_block_size;
self.device().parallelFor(
num_outer_blocks,
TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize),
[=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); },
[&](Index first, Index last) {
for (Index idx1 = first; idx1 < last; ++idx1) {
ReduceBlock<Self, Vectorize, false> block_reducer;
block_reducer(self, idx1 * inner_block_size, data);
}
});
} else {
ReduceBlock<Self, Vectorize, true> block_reducer;
for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
block_reducer(self, idx1, data);
}
}
}
};
#endif  // EIGEN_USE_THREADS
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
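// GPU implementation of scan: each GPU thread performs the full sequential scan for
// one scan line, with the starting offset derived from its global thread index.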
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size,
typename Self::CoeffReturnType* data) {
Index val = threadIdx.x + blockIdx.x * blockDim.x;
Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
if (offset + (self.size() - 1) * self.stride() < total_size) {
typename Self::CoeffReturnType accum = self.accumulator().initialize();
for (Index idx = 0; idx < self.size(); idx++) {
Index curr = offset + idx * self.stride();
if (self.exclusive()) {
data[curr] = self.accumulator().finalize(accum);
self.accumulator().reduce(self.inner().coeff(curr), &accum);
} else {
self.accumulator().reduce(self.inner().coeff(curr), &accum);
data[curr] = self.accumulator().finalize(accum);
}
}
}
__syncthreads();
}
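// GPU scan launcher: one thread per scan line, launched in blocks of 64 threads.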
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
void operator()(const Self& self, typename Self::CoeffReturnType* data) {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
#endif  // EIGEN_USE_GPU && EIGEN_GPUCC
}  // namespace internal
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { … };
}  // namespace Eigen
#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H