block_map.cc | Explore in Territory

/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/block_map.h"

#include <algorithm>
#include <cstdint>
#include <limits>

#ifdef RUY_MAKEBLOCKMAP_DEBUG
#include <cstdio>
#include <cstdlib>
#include <string>
#endif

#include "ruy/check_macros.h"
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/size_util.h"
#include "ruy/trace.h"

namespace ruy {

namespace {

// Decoder for the "linear" traversal order. The function returns void; the
// non-const `local_pos` pointer is the only output visible in the signature.
// NOTE(review): presumably `size_log2` is the log2 of the side length of the
// square being traversed and `*local_pos` receives the per-side position
// decoded from `square_index` -- confirm against the implementation.
void DecodeTraversalLinear(int size_log2, std::uint32_t square_index,
                           SidePair<int>* local_pos) { … }

// Decoder for the "fractal Z" traversal order. Unlike the Linear and Hilbert
// decoders in this file, it takes no `size_log2` argument: `square_index` is
// its only input, and the non-const `local_pos` pointer is its only output.
// NOTE(review): the name suggests a Z-order (Morton) bit de-interleave of
// `square_index` -- confirm against the implementation.
void DecodeTraversalFractalZ(std::uint32_t square_index,
                             SidePair<int>* local_pos) { … }

// Decoder for the "fractal U" traversal order. Same signature shape as
// DecodeTraversalFractalZ: `square_index` in, position out through the
// non-const `local_pos` pointer, no `size_log2` argument.
// NOTE(review): presumably a variant of the Z-order decoding that changes the
// order in which each 2x2 cell is visited -- confirm against the
// implementation.
void DecodeTraversalFractalU(std::uint32_t square_index,
                             SidePair<int>* local_pos) { … }

// Code inspired by the sample code in
//   https://en.wikipedia.org/wiki/Hilbert_curve
// The main optimization is to avoid hard-to-predict conditional branches
// based on the bits of the square_index parameter.
//
// Takes `size_log2` in addition to `square_index` (as DecodeTraversalLinear
// does); the non-const `local_pos` pointer is the only output visible in the
// signature.
// NOTE(review): presumably `size_log2` is the number of Hilbert-curve levels
// to decode -- confirm against the implementation.
void DecodeTraversalFractalHilbert(int size_log2, std::uint32_t square_index,
                                   SidePair<int>* local_pos) { … }

}  // end anonymous namespace

// Defined outside the anonymous namespace, so it has external linkage within
// namespace ruy. Reads `block_map` (const reference) and `index`; the
// non-const `block` pointer is the only output visible in the signature.
// NOTE(review): presumably converts the linear block `index` into per-side
// block coordinates, dispatching on the traversal order stored in `block_map`
// to one of the DecodeTraversal* helpers above -- confirm against the
// implementation.
void GetBlockByIndex(const BlockMap& block_map, int index,
                     SidePair<int>* block) { … }

namespace {

// File-local helper returning a BlockMapTraversalOrder by value. Inputs are
// the row/column counts "after rectangularness division" (per the parameter
// names), the depth, the LHS/RHS scalar sizes and the CPU cache parameters.
// NOTE(review): presumably estimates the working-set size from the dimensions
// and scalar sizes and compares it against the cache sizes in
// `cpu_cache_params` to choose between linear and fractal orders; the unit of
// the scalar sizes (bytes?) is not visible here -- confirm.
BlockMapTraversalOrder GetTraversalOrder(
    int rows_after_rectangularness_division,
    int cols_after_rectangularness_division, int depth, int lhs_scalar_size,
    int rhs_scalar_size, const CpuCacheParams& cpu_cache_params) { … }

// File-local integer helper taking a numerator and a denominator.
// NOTE(review): the name suggests floor(log2(num / denom)); the result for
// num <= denom and for non-positive arguments is not visible here -- confirm
// before relying on it.
int floor_log2_quotient(int num, int denom) { … }

// Computes the rectangularness of the matrix shape (rows, cols). This is
// essentially just the log2 of the quotient (rows / cols). The kernel_rows and
// kernel_cols only get into the picture for clamping bounds but don't affect
// the generic computation.
//
// Returns void; the two int* parameters are the only outputs visible in the
// signature.
// NOTE(review): presumably `*rows_rectangularness_log2` is nonzero only when
// rows exceed cols and `*cols_rectangularness_log2` only in the opposite
// case -- confirm against the implementation.
void GetRectangularness(int rows, int cols, int kernel_rows, int kernel_cols,
                        int* rows_rectangularness_log2,
                        int* cols_rectangularness_log2) { … }

// Computes a 'multithreading score'. When multithreading, we need there to
// be at least as many tiles as there are threads, and hopefully
// substantially more than that, so we benefit from ruy's ability to
// dispatch fine-grained workloads to threads.
//
// Inputs are a candidate block size (as a log2), the matrix dimensions and
// the tentative thread count; the score is returned as an int.
// NOTE(review): the scale and sign convention of the score (presumably higher
// is better) are not visible here -- confirm against the implementation.
int GetMultithreadingScore(int block_size_log2, int rows, int cols,
                           int tentative_thread_count) { … }

// Computes a 'cache locality score'.
//
// Inputs are a candidate block size (as a log2), the matrix dimensions and
// depth, the kernel dimensions (as log2), the LHS/RHS scalar sizes and the CPU
// cache parameters; the score is returned as an int.
// NOTE(review): presumably rates how well the data touched by one block fits
// in the cache described by `cpu_cache_params`; the scale and sign convention
// of the score are not visible here -- confirm against the implementation.
int GetCacheLocalityScore(int block_size_log2, int rows, int cols, int depth,
                          int kernel_rows_log2, int kernel_cols_log2,
                          int lhs_scalar_size, int rhs_scalar_size,
                          const CpuCacheParams& cpu_cache_params) { … }

// Computes a 'kernel amortization score'. This is the notion that very small
// tiles result in more overhead outside of kernels, more complex memory
// access patterns and fewer benefits from ruy's fat kernels, so we reward
// larger blocks more than smaller ones.
//
// Inputs are a candidate block size (as a log2), the matrix dimensions and
// the kernel dimensions (as log2); the score is returned as an int.
// NOTE(review): the scale of the score is not visible here -- confirm against
// the implementation.
int GetKernelAmortizationScore(int block_size_log2, int rows, int cols,
                               int kernel_rows_log2, int kernel_cols_log2) { … }

}  // namespace

// Defined outside the anonymous namespace, so it has external linkage within
// namespace ruy. Takes the raw matrix dimensions plus the same depth,
// scalar-size and cache parameters that GetTraversalOrder takes, and returns
// a bool.
// NOTE(review): presumably returns true when the linear traversal order would
// be selected for this shape regardless of the remaining MakeBlockMap inputs;
// whether false implies a non-linear order is not visible here -- confirm.
bool IsObviouslyLinearTraversal(int rows, int cols, int depth,
                                int lhs_scalar_size, int rhs_scalar_size,
                                const CpuCacheParams& cpu_cache_params) { … }

// Defined outside the anonymous namespace, so it has external linkage within
// namespace ruy. Takes the matrix dimensions, kernel dimensions, scalar sizes,
// tentative thread count and CPU cache parameters; returns void, with the
// non-const `block_map` pointer as the only output visible in the signature.
// NOTE(review): presumably fills `*block_map` after picking a block size with
// the Get*Score helpers and a traversal order with GetTraversalOrder; any
// preconditions on rows/cols relative to the kernel dimensions are not visible
// here -- confirm against the implementation.
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                  int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
                  int tentative_thread_count,
                  const CpuCacheParams& cpu_cache_params, BlockMap* block_map) { … }

// Single-side overload: takes one `side`, a read-only `block_map` and an
// integer `block`, and reports through the `start` and `end` int pointers
// (the function returns void).
// NOTE(review): presumably `*start`/`*end` delimit the range of matrix
// rows or columns covered by that block along `side`; whether `*end` is
// exclusive is not visible here -- confirm against the implementation.
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
                          int* start, int* end) { … }

// Both-sides overload: takes a read-only `block_map` and a read-only
// SidePair<int> `block`, and reports through the `start` and `end`
// SidePair<int> pointers (the function returns void).
// NOTE(review): presumably applies the single-side overload above once per
// side -- confirm against the implementation.
void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair<int>& block,
                          SidePair<int>* start, SidePair<int>* end) { … }

}  // namespace ruy
chromium/third_party/ruy/src/ruy/block_map.cc