/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ // Implement the Philox algorithm to generate random numbers in parallel. // Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. // http://www.thesalmons.org/john/random123/papers/random123sc11.pdf #ifndef TENSORFLOW_TSL_LIB_RANDOM_PHILOX_RANDOM_H_ #define TENSORFLOW_TSL_LIB_RANDOM_PHILOX_RANDOM_H_ #include <stdlib.h> #include <cstdint> // Function qualifiers that need to work on both CPU and GPU. #if defined(__CUDACC__) || defined(__HIPCC__) // For nvcc and hipcc. #define PHILOX_DEVICE_FUNC … #define PHILOX_INLINE … #else // For all other compilers. #define PHILOX_DEVICE_FUNC #define PHILOX_INLINE … #endif #define PHILOX_DEVICE_INLINE … #include <math.h> namespace tsl { namespace random { // A class that represents an inline array. It can be used on both CPU and GPU, // and is also trivially copyable between CPU and GPU. // Arguments: // T: the array element type; // ElementCount: the fixed size of the array; template <typename T, int ElementCount> class Array { … }; // A class that encapsulates all the state for a random number generator using // the philox_4x32_10 algorithm. Each invocation returns 128 random bits // in the form of four uint32_t values. // There are multiple variants of this algorithm; we picked the 4x32_10 version // that is most suited for our applications. 
// Since this class is meant to be copied between CPU and GPU, it maintains // value semantics. // // For example, to use this class and populate an array of 1024 randoms on CPU // with two threads: // // void Fill(PhiloxRandom rnd, uint32_t* output, int start, int limit) { // assert(start % 4 == 0); // assert(limit % 4 == 0); // rnd.Skip(start / 4); // for (int i = start; i < limit; i += 4) { // auto sample = rnd(); // ... copy sample[0..3] to output[i..i+3] // } // } // // PhiloxRandom rng(seed); // PhiloxRandom rng_copy = rng; // rng.Skip(1024/4); // // ... schedule Fill(rng_copy, output, 0, 512) in thread 1; // ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; // ... wait for threads 1 & 2 to finish executing Fill(). // // NOTE: // 1. PhiloxRandom is trivially copyable. // 2. PhiloxRandom is compilable by gcc and nvcc. class PhiloxRandom { … }; } // namespace random } // namespace tsl #endif // TENSORFLOW_TSL_LIB_RANDOM_PHILOX_RANDOM_H_