// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // multi_thread_gemm.h: Multi-threaded GEMM entry point. // Readers note: To understand this file, it is useful to first // read and understand the much simpler single_thread_gemm.h. #ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ #define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ #include <atomic> // NOLINT #include <chrono> // NOLINT #include <thread> // NOLINT #include <vector> #include "single_thread_gemm.h" namespace gemmlowp { // This value was empirically derived on an end-to-end application benchmark. // That this number of cycles means that we may be sleeping substantially longer // than a scheduler timeslice's duration is not necessarily surprising. The // idea is to pick up quickly new work after having finished the previous // workload. When it's new work within the same GEMM as the previous work, the // time interval that we might be busy-waiting is very small, so for that // purpose it would be more than enough to sleep for 1 million cycles. // That is all what we would observe on a GEMM benchmark. However, in a real // application, after having finished a GEMM, we might do unrelated work for // a little while, then start on a new GEMM. Think of a neural network // application performing inference, where many but not all layers are // implemented by a GEMM. In such cases, our worker threads might be idle for // longer periods of time before having work again. If we let them passively // wait, on a mobile device, the CPU scheduler might aggressively clock down // or even turn off the CPU cores that they were running on. That would result // in a long delay the next time these need to be turned back on for the next // GEMM. So we need to strike a balance that reflects typical time intervals // between consecutive GEMM invokations, not just intra-GEMM considerations. // Of course, we need to balance keeping CPUs spinning longer to resume work // faster, versus passively waiting to conserve power. const int kMaxBusyWaitNOPs = …; // On X86 and ARM platforms we may use NOP instructions to know how long we // are busy-waiting. #if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \ (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86)) #define GEMMLOWP_NOP … #define GEMMLOWP_STRING_CONCAT_4 … #define GEMMLOWP_NOP4 … #define GEMMLOWP_NOP16 … #define GEMMLOWP_NOP64 … inline int DoSomeNOPs() { … } #undef GEMMLOWP_STRING_CONCAT_4 #undef GEMMLOWP_NOP64 #undef GEMMLOWP_NOP16 #undef GEMMLOWP_NOP4 #undef GEMMLOWP_NOP #else // May not use asm NOP. // If we can't use NOPs, let's use a non-inline function call as a basic // thing that has some vaguely known, nonzero cost. GEMMLOWP_NOINLINE inline int DoSomeNOPs() { // Pretend that calling an empty function takes as long as 16 NOPs... return 16; } #endif // Waits until *var != initial_value. // // Returns the new value of *var. The guarantee here is that // the return value is different from initial_value, and that that // new value has been taken by *var at some point during the // execution of this function. There is no guarantee that this is // still the value of *var when this function returns, since *var is // not assumed to be guarded by any lock. // // First does some busy-waiting for a fixed number of no-op cycles, // then falls back to passive waiting for the given condvar, guarded // by the given mutex. // // The idea of doing some initial busy-waiting is to help get // better and more consistent multithreading benefits for small GEMM sizes. // Busy-waiting help ensuring that if we need to wake up soon after having // started waiting, then we can wake up quickly (as opposed to, say, // having to wait to be scheduled again by the OS). On the other hand, // we must still eventually revert to passive waiting for longer waits // (e.g. worker threads having finished a GEMM and waiting until the next GEMM) // so as to avoid permanently spinning. // template <typename T> T WaitForVariableChange(std::atomic<T>* var, T initial_value, pthread_cond_t* cond, pthread_mutex_t* mutex) { … } // A BlockingCounter lets one thread to wait for N events to occur. // This is how the master thread waits for all the worker threads // to have finished working. // The waiting is done using a naive spinlock waiting for the atomic // count_ to hit the value 0. This is acceptable because in our usage // pattern, BlockingCounter is used only to synchronize threads after // short-lived tasks (performing parts of the same GEMM). It is not used // for synchronizing longer waits (resuming work on the next GEMM). class BlockingCounter { … }; // A workload for a worker. struct Task { … }; // A worker thread. class Worker { … }; // A very simple pool of workers, that only allows the very // specific parallelization pattern that we use here: // a fixed number of workers can be given work, and one then // waits for all of them to finish. // // See MultiThreadGemmContextBase for how other WorkersPool implementations can // be used. class WorkersPool { … }; // The task we use to implement a multi-threaded Gemm: a block of the // RHS has been packed by the master thread; each worker thread // then has to pack a block of the LHS and accumulate the Gemm of these // packed LHS and RHS blocks. template <typename KernelFormat, typename InputScalar, typename OutputScalar, typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, typename OutputPipelineType, typename GemmContextType> struct GemmWithPackedRhsTask : Task { … }; // This base class for multi-threading allows subclasses to implement their own // workers_pool() method. See MultiThreadGemmContext below for an example; // any other implementation of workers_pool() must return an object with the // same public methods as WorkersPool. class MultiThreadGemmContextBase : public SingleThreadGemmContext { … }; class MultiThreadGemmContext : public MultiThreadGemmContextBase { … }; // Determines how many threads should be used for a given Gemm // operation. template <int KernelRows> inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) { … } // The main multi-threaded Gemm function. // To understand it, first read the code of SingleThreadGemm(). // The parallelization scheme used here is to have this master function // pack a block of RHS and then start worker threads to pack a block of LHS // each, and accumulate the corresponding products. template <typename KernelFormat, typename InputScalar, typename OutputScalar, typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, typename OutputPipelineType, typename GemmContextType> void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, const MatrixMap<const InputScalar, LhsOrder>& lhs, const MatrixMap<const InputScalar, RhsOrder>& rhs, MatrixMap<OutputScalar, ResultOrder>* result, const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, const OutputPipelineType& output_pipeline) { … } } // namespace gemmlowp #endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_