/* Copyright 2019 Google LLC. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ // Internal types and helpers for matrices. // "Mat" is the name we use to refer to our internal matrix classes; it can be // thought of as a shorter version of "InternalMatrix"` // // Ruy has four internal matrix classes, besides the // Matrix<T> class that we expose to the user-facing API. // // TODO(silvasean): Put parts of this architecture description somewhere more // prominent. // // The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where: // - "E" indicates a type-erased class, storing a void* pointer and a runtime // enum value to track the scalar type, as opposed to being templatized // on a Scalar type and storing a Scalar* pointer. // - "P" indicates a packed matrix class, the output of the packing code and // input of the kernel code. See comments in pack.h regarding packing. // // In other words: // // Plain matrices Packed matrices // +---------------------------------- // Templated | Mat, Matrix PMat // Type-erased | EMat PEMat // // Note that Matrix<T> is *not* implemented in terms of the internal types. It // is an independent, simple, and user-facing type. Matrix<T> is functionally // equivalent to Mat, but we keep it separate to insulate internals from // interface and to be able to make different compromises in internals // vs interface: in internals we prefer Mat to be a C-style struct with // raw data member access and to be similar to the other PMat/EMat/PEMat // classes for consistency. // // The use of type-erasure might seem surprising for a library like Ruy with a // heavily-templated entry point, but it is motivated by the desire for most of // Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3 // main parts: // - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry // point. // - "front-end" (frontend.*, validate.*, create_trmul_params.*, // prepare_packed_matrices.*) - the work to handle the entry-point call down to // the point where it can be handed off to the middle/back ends below. That // includes routines that select RunKernel and RunPack // implementations statically based on those template parameters. // - "back-end" (kernel_*.*, pack_*.*)- this consists of the implementations of // RunKernel and RunPack, often in assembly code, which are the building blocks // that Ruy calls to perform matrix multiplication. These are templated so that // only the requested types/Path's are actually emitted by the compiler. // - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the // calls to the "back-end" optimized building blocks. This layer has to deal // with issues like cache locality and low-overhead multi-threading. // // There is a desire for the "middle-end" to be non-templated in order to // simplify the implementation and reduce code-size. We type-erase when going // from the "front-end" to the "middle-end", and un-type-erase going from the // "middle-end" to the "back-end". The un-type-erasure is possible because the // "front-end" is responsible for instantiating the needed "back-end" templates, // and thus the static type information is still present. // // Each layer of Ruy uses matrix types: // - "entry-point": Matrix<T> // - "front-end": Mat // - "middle-end": EMat, PEMat // - "back-end": Mat, PMat // // The use of separate types for packed matrices is not essential, but makes it // obvious at a glance whether a matrix is a packed matrix or not. We would // reconsider this decision if there was significant duplication between packed // and unpacked matrices, but that doesn't seem to be the case at the moment. // // Another goal is to keep the user-facing Matrix<T> as simple and // understandable as possible. Ideally, a user should be able to read the struct // definition for Matrix<T> and see a very simple definition with no internal // details like sums and kernel block layout. #ifndef RUY_RUY_INTERNAL_MATRIX_H_ #define RUY_RUY_INTERNAL_MATRIX_H_ #include <cstddef> #include <cstdint> #include <type_traits> #include <utility> #include "ruy/check_macros.h" #include "ruy/matrix.h" #include "ruy/size_util.h" namespace ruy { // Internal counterpart of Layout, used by Mat. struct MatLayout final { … }; inline MatLayout ToInternal(const Layout& src) { … } // Internal counterpart of Matrix template <typename Scalar> struct Mat final { … }; template <typename Scalar> inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) { … } template <typename Scalar> inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) { … } // KernelLayout describes small-scale block structure in a packed matrix layout. // It's a runtime (as opposed to compile-time-constant) version of the // FixedKernelLayout struct used to declare kernel layouts. // // This is is sometimes known as "tiling" in other contexts. // // For example, consider a packed matrix in column-major format with a // column-major KernelLayout. The matrix logically has a shape of // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array // of shape `[cols / kcols, rows / krows, kcols, krows]`. // // Note that in the case of kcols=1, krows=1, this degenerates to // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block // structure. struct KernelLayout final { … }; // A packed matrix has a small-scale block structure that is not present in in // the input matrices. This block structure is necessary for the kernels to // process data efficiently. // // This struct is very similar to MatLayout, but has the extra KernelLayout // field. struct PMatLayout final { … }; inline bool operator==(const PMatLayout& a, const PMatLayout& b) { … } // Dynamic representation for a type. // // The most important field in this struct is the size, which Ruy uses to know // how much memory to allocate without having to be templated on a type. // Signed-ness and floating-point-ness are mainly present as debugging checks. // // Note: Ruy does not use this struct to to dynamically dispatch between // different typed implementations. As described in the comment at the top of // this file, Ruy's "front-end", which is templated, instantiates all the // necessary "back-end" routines with complete static knowledge of all the // types. struct Type final { … }; inline bool operator==(const Type& type1, const Type& type2) { … } // Type-erased matrix. struct EMat final { … }; // Type-erased packed matrix. struct PEMat final { … }; // Convenient typed helper for packed matrices. template <typename Scalar> struct PMat final { … }; template <typename T> EMat EraseType(const Mat<T>& matrix) { … } template <typename T> Mat<T> UneraseType(const EMat& matrix) { … } template <typename T> PMat<T> UneraseType(const PEMat& matrix) { … } // Helpers for MatLayout / PMatLayout. inline bool IsUnstrided(const MatLayout& layout) { … } inline bool IsRowMajor(const MatLayout& layout) { … } inline bool IsColMajor(const MatLayout& layout) { … } inline std::ptrdiff_t FlatSize(const MatLayout& layout) { … } inline bool IsUnstrided(const PMatLayout& layout) { … } inline bool IsRowMajor(const PMatLayout& layout) { … } inline bool IsColMajor(const PMatLayout& layout) { … } inline std::ptrdiff_t FlatSize(const PMatLayout& layout) { … } // TODO(b/130417400) add a unit test inline int Offset(const MatLayout& layout, int row, int col) { … } // TODO(b/130417400) add a unit test inline int Offset(const PMatLayout& layout, int row, int col) { … } // Helpers for Mat<T>. template <typename Scalar> const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) { … } template <typename Scalar> Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) { … } template <typename Scalar> Scalar Element(const Mat<Scalar>& mat, int row, int col) { … } // Helpers for PMat<T>. // Duplicated from Matrix<T>, but the duplication seems acceptable. template <typename Scalar> const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) { … } template <typename Scalar> Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) { … } template <typename Scalar> Scalar Element(const PMat<Scalar>& mat, int row, int col) { … } // Helpers for PEMat. inline std::ptrdiff_t DataBytes(const PEMat& packed) { … } inline std::ptrdiff_t SumsBytes(const PEMat& packed) { … } // Transpose helpers. inline Order Transpose(Order order) { … } inline MatLayout Transpose(const MatLayout& layout) { … } template <typename Scalar> Mat<Scalar> Transpose(const Mat<Scalar>& matrix) { … } // Compile-time version of KernelLayout, used to declare kernel layouts in a // way that can be consumed by compile-time logic. template <Order tOrder, int tRows, int tCols> struct FixedKernelLayout { … }; template <typename FixedKernelLayout> KernelLayout ToKernelLayout() { … } #if (__cplusplus < 201703L) // A static constexpr data member is automatically inline and should not require // redeclaration without an initializer. This is actually deprecated from C++17 // onwards. Clang with -O0 without this can fail to link. template <Order tOrder, int tRows, int tCols> constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols; template <Order tOrder, int tRows, int tCols> constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows; #endif } // namespace ruy #endif // RUY_RUY_INTERNAL_MATRIX_H_