#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#include "./InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper { … };
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>> { … };
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> { … };
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>>::type> { … };
}  // end namespace internal
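/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 *
 * The convolution is "valid": every convolved dimension shrinks by
 * kernel_size - 1. Illustrative usage only (tensor sizes and dimension
 * indices below are arbitrary); the expression is normally built through
 * TensorBase::convolve():
 *
 * \code
 * Eigen::Tensor<float, 3> input(16, 32, 32);
 * Eigen::Tensor<float, 2> kernel(3, 3);
 * input.setRandom();
 * kernel.setRandom();
 * Eigen::array<ptrdiff_t, 2> dims;  // convolve along dimensions 1 and 2
 * dims[0] = 1;
 * dims[1] = 2;
 * Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
 * // output has dimensions (16, 30, 30)
 * \endcode
 */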
template <typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp
: public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> { … };
template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> { … };
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
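// Returns the kernel size as a compile-time constant when it is statically
// known (so the convolution loops below can be unrolled), or the runtime
// value when the size is Dynamic.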
template <int StaticKernelSize>
struct GetKernelSize {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Dynamic> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
};
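// 1D convolution: each block stages a tile of the input, i.e. the output
// range [first_x, last_x] plus a halo of kernelSize - 1 coefficients, in
// shared memory, then every thread accumulates the kernel dot product for its
// output positions. blockIdx.x walks the tiles along the convolved dimension;
// blockIdx.y / threadIdx.y walk the remaining "planes" of the tensor.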
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
const int num_x_output = last_x - first_x + 1;
const int first_plane = blockIdx.y * blockDim.y;
const int plane_stride = blockDim.y * gridDim.y;
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
const int kernel_offset = plane_kernel_offset + i;
float result = 0.0f;
#pragma unroll
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k];
}
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
buffer[tensor_index] = result;
}
__syncthreads();
}
}
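// 2D convolution: same tiling strategy as the 1D kernel, with a shared-memory
// tile of (maxX + kernelSizeX - 1) x (maxY + kernelSizeY - 1) input
// coefficients per plane handled by the block; planes are walked along the
// z block/thread dimensions.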
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,
const int kernelSizeX, const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
const int num_x_output = last_x - first_x + 1;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
const int num_y_output = last_y - first_y + 1;
const int first_plane = blockIdx.z * blockDim.z;
const int plane_stride = blockDim.z * gridDim.z;
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input;
#pragma unroll
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index =
plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
s[i + input_offset] = eval.coeff(tensor_index);
}
}
__syncthreads();
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
#pragma unroll
for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
const int kernel_offset = kernelSizeX * l;
const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
#pragma unroll
for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
result += s[k + input_offset] * kernel[k + kernel_offset];
}
}
const int tensor_index =
plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
buffer[tensor_index] = result;
}
}
__syncthreads();
}
}
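// 3D convolution: a single input tile (with halos along x, y and z) is staged
// in shared memory at a time; since all three thread dimensions are used for
// the spatial tile, the planes are processed sequentially within each block.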
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;
const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
i + first_x, j + first_y, k + first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
}
}
__syncthreads();
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
for (int n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
kernel[l + kernelSizeX * (m + kernelSizeY * n)];
}
}
}
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
i + first_x, j + first_y, k + first_z);
buffer[tensor_index] = result;
}
}
}
__syncthreads();
}
}
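// Evaluator for the convolution expression on GpuDevice: executeEval()
// materializes the full result into a buffer, and coeff()/packet() simply
// read it back.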
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
static constexpr int NumDims =
internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
static constexpr int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
enum {
IsAligned =
TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
PreferBlockAccess = false,
CoordAccess = false,
RawAccess = false
};
typedef internal::TensorBlockNotImplemented TensorBlock;
TensorEvaluator(const XprType& op, const GpuDevice& device)
: m_inputImpl(op.inputExpression(), device),
m_kernelImpl(op.kernelExpression(), device),
m_kernelArg(op.kernelExpression()),
m_indices(op.indices()),
m_buf(NULL),
m_kernel(NULL),
m_local_kernel(false),
m_device(device) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
m_dimensions = m_inputImpl.dimensions();
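// The convolution is "valid": each convolved dimension shrinks by
// kernel_dim - 1, all other dimensions keep the input size.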
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
}
}
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
executeEval(data);
return false;
} else {
m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
executeEval(m_buf);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_buf) {
m_device.deallocate(m_buf);
m_buf = NULL;
}
if (m_local_kernel) {
m_device.deallocate((void*)m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
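// Makes the kernel coefficients available through a raw device pointer. If
// the kernel expression already exposes contiguous data it is used directly;
// otherwise the kernel is evaluated into a temporary device buffer
// (m_local_kernel == true) that cleanup() releases.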
EIGEN_STRONG_INLINE void preloadKernel() {
const Scalar* in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
static unsigned int ceil(unsigned int num, unsigned int denom) {
const unsigned int rounded_toward_zero = num / denom;
if (num > rounded_toward_zero * denom) {
return rounded_toward_zero + 1;
}
return rounded_toward_zero;
}
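// Chooses a launch configuration so that one input tile (output tile plus the
// kernel halo) fits in shared memory, sizes the grid to cover the output
// while keeping all multiprocessors busy, and dispatches to dedicated
// instantiations for a couple of common static kernel sizes (4 and 7) so the
// inner loops can be unrolled.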
void executeEval(Scalar* data) const {
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = m_device.sharedMemPerBlock();
const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32;
switch (NumKernelDims) {
case 1: {
const int kernel_size = m_kernelImpl.dimensions().TotalSize();
const int numX = dimensions()[m_indices[0]];
const int numP = dimensions().TotalSize() / numX;
int maxX;
dim3 block_size;
const int single_stride_dim =
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
if (m_indices[0] == single_stride_dim) {
const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
maxX = numext::mini<int>(inner_dim, numX);
const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
block_size.x = numext::mini(maxThreadsPerBlock, maxX);
block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
} else {
const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
const int maxP = numext::mini<int>(inner_dim, numP);
maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
block_size.x = numext::mini(warpSize, maxX);
block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
}
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
const array<Index, 1> indices{m_indices[0]};
const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size) {
case 4: {
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
numX, maxX, 4, data);
break;
}
case 7: {
LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
numX, maxX, 7, data);
break;
}
default: {
LAUNCH_GPU_KERNEL(
(EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
kernel_size, data);
}
}
break;
}
case 2: {
const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numP = dimensions().TotalSize() / (numX * numY);
const float scaling_factor =
sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
const int maxX = numext::mini<int>(inner_dim, numX);
const int maxY =
numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
const int maxP = numext::mini<int>(
maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
dim3 block_size;
block_size.x = numext::mini(1024, maxX);
block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size_x) {
case 4: {
switch (kernel_size_y) {
case 7: {
LAUNCH_GPU_KERNEL(
(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
numY, maxY, 4, 7, data);
break;
}
default: {
LAUNCH_GPU_KERNEL(
(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
numY, maxY, 4, kernel_size_y, data);
break;
}
}
break;
}
case 7: {
switch (kernel_size_y) {
case 4: {
LAUNCH_GPU_KERNEL(
(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
numY, maxY, 7, 4, data);
break;
}
default: {
LAUNCH_GPU_KERNEL(
(EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
numY, maxY, 7, kernel_size_y, data);
break;
}
}
break;
}
default: {
LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
Dynamic, Dynamic>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break;
}
}
break;
}
case 3: {
const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numZ = dimensions()[m_indices[idxZ]];
const int numP = dimensions().TotalSize() / (numX * numY * numZ);
const int maxX = numext::mini<int>(
128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
numX));
const int maxY = numext::mini<int>(
128, numext::mini<int>(
maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
numY));
const int maxZ = numext::mini<int>(
128, numext::mini<int>(
maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
kernel_size_z + 1,
numZ));
dim3 block_size;
block_size.x = numext::mini(32, maxX);
block_size.y = numext::mini(32, maxY);
block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
const int shared_mem =
(maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);
const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
m_kernelImpl.dimensions()[idxZ]};
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}
default: {
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize());
return m_buf[index];
}
template <int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize());
return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost =
NumDims *
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
}
private:
TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
KernelArgType m_kernelArg;
Indices m_indices;
Dimensions m_dimensions;
Scalar* m_buf;
const Scalar* m_kernel;
bool m_local_kernel;
const GpuDevice& m_device;
};
#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
}  // end namespace Eigen
#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H