/* * kmp_collapse.cpp -- loop collapse feature */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_error.h" #include "kmp_i18n.h" #include "kmp_itt.h" #include "kmp_stats.h" #include "kmp_str.h" #include "kmp_collapse.h" #if OMPT_SUPPORT #include "ompt-specific.h" #endif // OMPTODO: different style of comments (see kmp_sched) // OMPTODO: OMPT/OMPD // avoid inadevertently using a library based abs template <typename T> T __kmp_abs(const T val) { … } kmp_uint32 __kmp_abs(const kmp_uint32 val) { … } kmp_uint64 __kmp_abs(const kmp_uint64 val) { … } //---------------------------------------------------------------------------- // Common functions for working with rectangular and non-rectangular loops //---------------------------------------------------------------------------- template <typename T> int __kmp_sign(T val) { … } template <typename T> class CollapseAllocator { … }; //----------Loop canonicalization--------------------------------------------- // For loop nest (any shape): // convert != to < or >; // switch from using < or > to <= or >=. // "bounds" array has to be allocated per thread. // All other internal functions will work only with canonicalized loops. template <typename T> void kmp_canonicalize_one_loop_XX( ident_t *loc, /*in/out*/ bounds_infoXX_template<T> *bounds) { … } // Canonicalize loop nest. original_bounds_nest is an array of length n. void kmp_canonicalize_loop_nest(ident_t *loc, /*in/out*/ bounds_info_t *original_bounds_nest, kmp_index_t n) { … } //----------Calculating trip count on one level------------------------------- // Calculate trip count on this loop level. // We do this either for a rectangular loop nest, // or after an adjustment bringing the loops to a parallelepiped shape. // This number should not depend on the value of outer IV // even if the formular has lb1 and ub1. // Note: for non-rectangular loops don't use span for this, it's too big. template <typename T> kmp_loop_nest_iv_t kmp_calculate_trip_count_XX( /*in/out*/ bounds_infoXX_template<T> *bounds) { … } // Calculate trip count on this loop level. kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) { … } //----------Trim original iv according to its type---------------------------- // Trim original iv according to its type. // Return kmp_uint64 value which can be easily used in all internal calculations // And can be statically cast back to original type in user code. kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) { … } //----------Compare two IVs (remember they have a type)----------------------- bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1, kmp_uint64 original_iv2) { … } //----------Calculate original iv on one level-------------------------------- // Return true if the point fits into upper bounds on this level, // false otherwise template <typename T> bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds, const kmp_point_t original_ivs, kmp_index_t ind) { … } // Calculate one iv corresponding to iteration on the level ind. // Return true if it fits into lower-upper bounds on this level // (if not, we need to re-calculate) template <typename T> bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds, /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, kmp_index_t ind, bool start_with_lower_bound, bool checkBounds) { … } bool kmp_calc_one_iv(const bounds_info_t *bounds, /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, kmp_index_t ind, bool start_with_lower_bound, bool checkBounds) { … } //----------Calculate original iv on one level for rectangular loop nest------ // Calculate one iv corresponding to iteration on the level ind. // Return true if it fits into lower-upper bounds on this level // (if not, we need to re-calculate) template <typename T> void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds, /*in/out*/ kmp_uint64 *original_ivs, const kmp_iterations_t iterations, kmp_index_t ind) { … } void kmp_calc_one_iv_rectang(const bounds_info_t *bounds, /*in/out*/ kmp_uint64 *original_ivs, const kmp_iterations_t iterations, kmp_index_t ind) { … } //---------------------------------------------------------------------------- // Rectangular loop nest //---------------------------------------------------------------------------- //----------Canonicalize loop nest and calculate trip count------------------- // Canonicalize loop nest and calculate overall trip count. // "bounds_nest" has to be allocated per thread. // API will modify original bounds_nest array to bring it to a canonical form // (only <= and >=, no !=, <, >). If the original loop nest was already in a // canonical form there will be no changes to bounds in bounds_nest array // (only trip counts will be calculated). // Returns trip count of overall space. extern "C" kmp_loop_nest_iv_t __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, /*in/out*/ bounds_info_t *original_bounds_nest, kmp_index_t n) { … } //----------Calculate old induction variables--------------------------------- // Calculate old induction variables corresponding to overall new_iv. // Note: original IV will be returned as if it had kmp_uint64 type, // will have to be converted to original type in user code. // Note: trip counts should be already calculated by // __kmpc_process_loop_nest_rectang. // OMPTODO: special case 2, 3 nested loops: either do different // interface without array or possibly template this over n extern "C" void __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, const bounds_info_t *original_bounds_nest, /*out*/ kmp_uint64 *original_ivs, kmp_index_t n) { … } //---------------------------------------------------------------------------- // Non-rectangular loop nest //---------------------------------------------------------------------------- //----------Calculate maximum possible span of iv values on one level--------- // Calculate span for IV on this loop level for "<=" case. // Note: it's for <= on this loop nest level, so lower bound should be smallest // value, upper bound should be the biggest value. If the loop won't execute, // 'smallest' may be bigger than 'biggest', but we'd better not switch them // around. template <typename T> void kmp_calc_span_lessoreq_XX( /* in/out*/ bounds_info_internalXX_template<T> *bounds, /* in/out*/ bounds_info_internal_t *bounds_nest) { … } // Calculate span for IV on this loop level for ">=" case. template <typename T> void kmp_calc_span_greateroreq_XX( /* in/out*/ bounds_info_internalXX_template<T> *bounds, /* in/out*/ bounds_info_internal_t *bounds_nest) { … } // Calculate maximum possible span for IV on this loop level. template <typename T> void kmp_calc_span_XX( /* in/out*/ bounds_info_internalXX_template<T> *bounds, /* in/out*/ bounds_info_internal_t *bounds_nest) { … } //----------All initial processing of the loop nest--------------------------- // Calculate new bounds for this loop level. // To be able to work with the nest we need to get it to a parallelepiped shape. // We need to stay in the original range of values, so that there will be no // overflow, for that we'll adjust both upper and lower bounds as needed. template <typename T> void kmp_calc_new_bounds_XX( /* in/out*/ bounds_info_internalXX_template<T> *bounds, /* in/out*/ bounds_info_internal_t *bounds_nest) { … } // Do all processing for one canonicalized loop in the nest // (assuming that outer loops already were processed): template <typename T> kmp_loop_nest_iv_t kmp_process_one_loop_XX( /* in/out*/ bounds_info_internalXX_template<T> *bounds, /*in/out*/ bounds_info_internal_t *bounds_nest) { … } // Non-rectangular loop nest, canonicalized to use <= or >=. // Process loop nest to have a parallelepiped shape, // calculate biggest spans for IV's on all levels and calculate overall trip // count. "bounds_nest" has to be allocated per thread. // Returns overall trip count (for adjusted space). kmp_loop_nest_iv_t kmp_process_loop_nest( /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) { … } //----------Calculate iterations (in the original or updated space)----------- // Calculate number of iterations in original or updated space resulting in // original_ivs[ind] (only on this level, non-negative) // (not counting initial iteration) template <typename T> kmp_loop_nest_iv_t kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds, const kmp_point_t original_ivs, kmp_index_t ind) { … } // Calculate number of iterations in the original or updated space resulting in // original_ivs[ind] (only on this level, non-negative) kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds, const kmp_point_t original_ivs, kmp_index_t ind) { … } //----------Calculate new iv corresponding to original ivs-------------------- // We got a point in the original loop nest. // Take updated bounds and calculate what new_iv will correspond to this point. // When we are getting original IVs from new_iv, we have to adjust to fit into // original loops bounds. Getting new_iv for the adjusted original IVs will help // with making more chunks non-empty. kmp_loop_nest_iv_t kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest, const kmp_point_t original_ivs, kmp_index_t n) { … } //----------Calculate original ivs for provided iterations-------------------- // Calculate original IVs for provided iterations, assuming iterations are // calculated in the original space. // Loop nest is in canonical form (with <= / >=). bool kmp_calc_original_ivs_from_iterations( const bounds_info_t *original_bounds_nest, kmp_index_t n, /*in/out*/ kmp_point_t original_ivs, /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) { … } //----------Calculate original ivs for the beginning of the loop nest--------- // Calculate IVs for the beginning of the loop nest. // Note: lower bounds of all loops may not work - // if on some of the iterations of the outer loops inner loops are empty. // Loop nest is in canonical form (with <= / >=). bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, kmp_index_t n, /*out*/ kmp_point_t original_ivs) { … } //----------Calculate next point in the original loop space------------------- // From current set of original IVs calculate next point. // Return false if there is no next point in the loop bounds. bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, kmp_index_t n, const kmp_point_t original_ivs, /*out*/ kmp_point_t next_original_ivs) { … } //----------Calculate chunk end in the original loop space-------------------- // For one level calculate old induction variable corresponding to overall // new_iv for the chunk end. // Return true if it fits into upper bound on this level // (if not, we need to re-calculate) template <typename T> bool kmp_calc_one_iv_for_chunk_end_XX( const bounds_infoXX_template<T> *bounds, const bounds_infoXX_template<T> *updated_bounds, /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, const kmp_point_t original_ivs_start) { … } // For one level calculate old induction variable corresponding to overall // new_iv for the chunk end. bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds, const bounds_info_t *updated_bounds, /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, const kmp_point_t original_ivs_start) { … } // Calculate old induction variables corresponding to overall new_iv for the // chunk end. If due to space extension we are getting old IVs outside of the // boundaries, bring them into the boundaries. Need to do this in the runtime, // esp. on the lower bounds side. When getting result need to make sure that the // new chunk starts at next position to old chunk, not overlaps with it (this is // done elsewhere), and need to make sure end of the chunk is further than the // beginning of the chunk. We don't need an exact ending point here, just // something more-or-less close to the desired chunk length, bigger is fine // (smaller would be fine, but we risk going into infinite loop, so do smaller // only at the very end of the space). result: false if could not find the // ending point in the original loop space. In this case the caller can use // original upper bounds as the end of the chunk. Chunk won't be empty, because // it'll have at least the starting point, which is by construction in the // original space. bool kmp_calc_original_ivs_for_chunk_end( const bounds_info_t *original_bounds_nest, kmp_index_t n, const bounds_info_internal_t *updated_bounds_nest, const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv, /*out*/ kmp_point_t original_ivs) { … } //----------Calculate upper bounds for the last chunk------------------------- // Calculate one upper bound for the end. template <typename T> void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds, /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) { … } void kmp_calc_one_iv_end(const bounds_info_t *bounds, /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) { … } // Calculate upper bounds for the last loop iteration. Just use original upper // bounds (adjusted when canonicalized to use <= / >=). No need to check that // this point is in the original space (it's likely not) void kmp_calc_original_ivs_for_end( const bounds_info_t *const original_bounds_nest, kmp_index_t n, /*out*/ kmp_point_t original_ivs) { … } /************************************************************************** * Identify nested loop structure - loops come in the canonical form * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0} * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1} * Upper Triangle matrix * i = 0; i <= N; i++ {0,0}:{N,0} * j = 0+1*i; j <= N; j++ {0,1}:{N,0} * ************************************************************************/ nested_loop_type_t kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest, /*in*/ kmp_index_t n) { … } /************************************************************************** * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf * Start point is x so the result is always > sqrt(x) * The method has uniform convergence, PRECISION is set to 0.1 * ************************************************************************/ #define level_of_precision … double sqrt_newton_approx(/*in*/ kmp_uint64 x) { … } /************************************************************************** * Handle lower triangle matrix in the canonical form * i = 0; i <= N; i++ {0,0}:{N,0} * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1} * ************************************************************************/ void kmp_handle_lower_triangle_matrix( /*in*/ kmp_uint32 nth, /*in*/ kmp_uint32 tid, /*in */ kmp_index_t n, /*in/out*/ bounds_info_t *original_bounds_nest, /*out*/ bounds_info_t *chunk_bounds_nest) { … } /************************************************************************** * Handle upper triangle matrix in the canonical form * i = 0; i <= N; i++ {0,0}:{N,0} * j = 0+1*i; j <= N; j++ {0,1}:{N,0} * ************************************************************************/ void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth, /*in*/ kmp_uint32 tid, /*in */ kmp_index_t n, /*in/out*/ bounds_info_t *original_bounds_nest, /*out*/ bounds_info_t *chunk_bounds_nest) { … } //----------Init API for non-rectangular loops-------------------------------- // Init API for collapsed loops (static, no chunks defined). // "bounds_nest" has to be allocated per thread. // API will modify original bounds_nest array to bring it to a canonical form // (only <= and >=, no !=, <, >). If the original loop nest was already in a // canonical form there will be no changes to bounds in bounds_nest array // (only trip counts will be calculated). Internally API will expand the space // to parallelogram/parallelepiped, calculate total, calculate bounds for the // chunks in terms of the new IV, re-calc them in terms of old IVs (especially // important on the left side, to hit the lower bounds and not step over), and // pick the correct chunk for this thread (so it will calculate chunks up to the // needed one). It could be optimized to calculate just this chunk, potentially // a bit less well distributed among threads. It is designed to make sure that // threads will receive predictable chunks, deterministically (so that next nest // of loops with similar characteristics will get exactly same chunks on same // threads). // Current contract: chunk_bounds_nest has only lb0 and ub0, // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future). extern "C" kmp_int32 __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, /*in/out*/ bounds_info_t *original_bounds_nest, /*out*/ bounds_info_t *chunk_bounds_nest, kmp_index_t n, /*out*/ kmp_int32 *plastiter) { … }