GeneralBlockPanelKernel.h | Explore in Territory

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

// IWYU pragma: private
#include "../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

enum GEBPPacketSizeType { … };

template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
          int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
class gebp_traits;

/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { … }

#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE …
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE …
#endif  // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE …
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE …
#endif  // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE …
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE …
#endif  // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)

#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = …);
const std::ptrdiff_t defaultL2CacheSize = …);
const std::ptrdiff_t defaultL3CacheSize = …);
#elif EIGEN_ARCH_PPC
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
#ifdef _ARCH_PWR10
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
#else
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
#endif
#else
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
#endif

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE

/** \internal */
struct CacheSizes { … };

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { … }

/* Helper for computeProductBlockingSizes.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depends on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */

template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) { … }

template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) { … }

/** \brief Computes the blocking parameters for a m x k times k x n matrix product
 *
 * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
 * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
 * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
 * dimension.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms.
 *
 * The blocking size parameters may be evaluated:
 *   - either by a heuristic based on cache sizes;
 *   - or using fixed prescribed values (for testing purposes).
 *
 * \sa setCpuCacheSizes */

template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { … }

template <typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { … }

template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper { … };

template <typename Packet>
struct QuadPacket { … };

template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { … };

packet_conditional<GEBPPacketFull, T1, T2, T3>;

packet_conditional<GEBPPacketHalf, T1, T2, T3>;

#define PACKET_DECL_COND_POSTFIX …

#define PACKET_DECL_COND …

#define PACKET_DECL_COND_SCALAR_POSTFIX …

#define PACKET_DECL_COND_SCALAR …

/* Vectorization logic
 *  real*real: unpack rhs to constant packets, ...
 *
 *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *          storing each res packet into two packets (2x2),
 *          at the end combine them: swap the second and addsub them
 *  cf*cf : same but with 2x4 blocks
 *  cplx*real : unpack rhs to constant packets, ...
 *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits { … };

gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_>;

template <typename Packet>
struct DoublePacket { … };

template <typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) { … }

// note that for DoublePacket<RealPacket> the "4" in "downto4"
// corresponds to the number of complexes, so it means "8"
// it terms of real coefficients.

template <typename Packet>
const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
                                               std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) { … }

template <typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
    const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) { … }

// same here, "quad" actually means "8" in terms of real coefficients
template <typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) { … }

template <typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) { … }

unpacket_traits<DoublePacket<Packet>>;
// template<typename Packet>
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
// {
//   DoublePacket<Packet> res;
//   res.first  = padd(a.first, b.first);
//   res.second = padd(a.second,b.second);
//   return res;
// }

gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_>;

gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_>;

/* optimized General packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
 *  |cplx |real | easy vectorization
 */
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel { … };

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs,
          int SwappedLhsProgress =
              gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
struct last_row_process_16_packets { … };

last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16>;

template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet { … };

template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
    : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
                             RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> { … };

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
                                   ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
                                                             const RhsScalar* blockB, Index rows, Index depth,
                                                             Index cols, ResScalar alpha, Index strideA, Index strideB,
                                                             Index offsetA, Index offsetB) { … }

// pack a block of the lhs
// The traversal is as follow (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 36 38 39 ...
gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>;

template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
          bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
                                     PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
                                                            Index rows, Index stride, Index offset) { … }

gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>;

template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
          bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
                                     PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
                                                            Index rows, Index stride, Index offset) { … }

// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follow: (nr==4):
//  0  1  2  3   12 13 14 15   24 27
//  4  5  6  7   16 17 18 19   25 28
//  8  9 10 11   20 21 22 23   26 29
//  .  .  .  .    .  .  .  .    .  .
gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>;

template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
    Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { … }

// this version is optimized for row major matrices
gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>;

}  // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
 * \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize() { … }

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
 * \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize() { … }

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\
rs.
* \sa setCpuCacheSize */
inline std::ptrdiff_t l3CacheSize() { … }

/** Set the cpu L1 and L2 cache sizes (in bytes).
 * These values are use to adjust the size of the blocks
 * for the algorithms working per blocks.
 *
 * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) { … }

}  // end namespace Eigen

#endif  // EIGEN_GENERAL_BLOCK_PANEL_H
chromium/third_party/eigen3/src/Eigen/src/Core/products/GeneralBlockPanelKernel.h