llvm/openmp/runtime/src/kmp_tasking.cpp

/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if ENABLE_LIBOMPTARGET
static void (*tgt_target_nowait_query)(void **);

void __kmp_init_target_task() {}
#endif

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#if OMPX_TASKGRAPH
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif

#ifdef BUILD_TIED_TASK_STACK

//  __kmp_trace_task_stack: print the tied tasks from the task stack in order
//  from top to bottom
//
//  gtid: global thread identifier for thread containing stack
//  thread_data: thread data for task team thread containing stack
//  threshold: value above which the trace statement triggers
//  location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, &task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

//  __kmp_init_task_stack: initialize the task stack for the first time
//  after a thread_data structure is created.
//  It should not be necessary to do this again (assuming the stack works).
//
//  gtid: global thread identifier of calling thread
//  thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

//  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
//  gtid: global thread identifier for calling thread
//  thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_info_t *thread = __kmp_thread_from_gtid(gtid); // used by __kmp_thread_free
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

//  __kmp_push_task_stack: Push the tied task onto the task stack.
//     Grow the stack if necessary by allocating another block.
//
//  gtid: global thread identifier for calling thread
//  thread: thread info for thread containing stack
//  tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

//  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
//  the task, just check to make sure it matches the ending task passed in.
//
//  gtid: global thread identifier for the calling thread
//  thread: thread info structure containing stack
//  tied_task: the task popped off the stack
//  ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {}
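
// A minimal, disabled sketch of the growth step described above: double a
// power-of-two ring buffer and unwrap its contents so the oldest entry lands
// at index 0.  The simple_deque_t type and grow_deque() helper are
// illustrative stand-ins, not the runtime's kmp_thread_data_t layout.
#if 0
#include <cstdlib>

typedef struct simple_deque {
  void **buf; // storage; capacity is kept as a power of two
  int size;   // current capacity
  int head;   // index of the oldest element
  int ntasks; // number of elements currently stored
} simple_deque_t;

// The caller is assumed to hold the lock protecting the deque, as required
// by the comment above.
static void grow_deque(simple_deque_t *d) {
  int new_size = d->size * 2;
  void **new_buf = (void **)malloc(new_size * sizeof(void *));
  for (int i = 0; i < d->ntasks; ++i)
    new_buf[i] = d->buf[(d->head + i) & (d->size - 1)];
  free(d->buf);
  d->buf = new_buf;
  d->size = new_size;
  d->head = 0; // tail is implicitly head + ntasks
}
#endif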

static kmp_task_pri_t *__kmp_alloc_task_pri_list() {}

// The function finds the deque of priority tasks with the given priority, or
// allocates a new deque and puts it into the sorted (high -> low) list of
// deques.
// Deques of non-default priority tasks are shared between all threads in team,
// as opposed to per-thread deques of tasks with default priority.
// The function is called under the lock task_team->tt.tt_task_pri_lock.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {}
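
// A minimal, disabled sketch of the lookup-or-insert the comment above
// describes, keeping the list ordered from high to low priority.  pri_node_t
// and find_or_insert_pri() are hypothetical stand-ins for kmp_task_pri_t and
// the runtime routine; the caller is assumed to hold the list lock.
#if 0
#include <cstdlib>

typedef struct pri_node {
  int pri;
  struct pri_node *next;
} pri_node_t;

static pri_node_t *find_or_insert_pri(pri_node_t **head, int pri) {
  pri_node_t **link = head;
  while (*link && (*link)->pri > pri) // walk past higher priorities
    link = &(*link)->next;
  if (*link && (*link)->pri == pri) // existing deque for this priority
    return *link;
  pri_node_t *node = (pri_node_t *)calloc(1, sizeof(pri_node_t));
  node->pri = pri; // splice in, preserving high -> low order
  node->next = *link;
  *link = node;
  return node;
}
#endif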

//  __kmp_push_priority_task: Add a task to the team's priority task deque
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {}

//  __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
//   Initialize OMPT fields maintained by a task. This will only be called after
//   ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {}

// __ompt_task_start:
//   Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {}

// __ompt_task_finish:
//   Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
#ifdef __s390x__
// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
// In order for it to work correctly, the caller also needs to be compiled with
// backchain. If a caller is compiled without backchain,
// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
// crash.
__attribute__((target("backchain")))
#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {}

// Only need to keep track of child task counts if any of the following:
// 1. team parallel and tasking not serialized;
// 2. it is a proxy or detachable or hidden helper task
// 3. the children counter of its parent task is greater than 0.
// The reason for the 3rd one is a serialized team that encountered a detached
// or hidden helper task T. In this case, the execution of T is still deferred,
// and it is also possible that a regular task depends on T; if we then don't
// track the children, task synchronization will be broken.
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {}
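
// A minimal, disabled sketch of the three conditions enumerated above, using
// the kmp_taskdata_t flag and counter names; it is illustrative only, not the
// build's definition of __kmp_track_children_task.
#if 0
static bool track_children_sketch(const kmp_taskdata_t *td) {
  return
      // 1. team parallel and tasking not serialized
      (!td->td_flags.team_serial && !td->td_flags.tasking_ser) ||
      // 2. proxy, detachable or hidden helper task
      td->td_flags.proxy == TASK_PROXY ||
      td->td_flags.detachable == TASK_DETACHABLE ||
      td->td_flags.hidden_helper ||
      // 3. the parent still has incomplete children (the detached / hidden
      //    helper case for a serialized team described above)
      KMP_ATOMIC_LD_ACQ(&td->td_parent->td_incomplete_child_tasks) > 0;
}
#endif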

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing the compiler to optimize
// away all OMPT code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref:  reference to source location of parallel region
// this_thr:  thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {}

// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread:  thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when the corresponding regions are destroyed
//
// thread:  thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {} // __kmp_round_up_to_val
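
// A minimal, disabled sketch of the rounding operation described above; val
// is assumed to be a power of two, and round_up_pow2() is an illustrative
// name rather than the runtime entry point.
#if 0
#include <cstddef>

static size_t round_up_pow2(size_t size, size_t val) {
  // (val - 1) is a mask of the low-order bits that must be clear in the
  // rounded result, so add it first and then strip those bits.
  return (size + val - 1) & ~(val - 1);
}
// Example: round_up_pow2(13, 8) == 16 and round_up_pow2(16, 8) == 16.
#endif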

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {}

kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {}

//  __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
#ifdef __s390x__
__attribute__((target("backchain")))
#endif
static void
__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                  kmp_taskdata_t *current_task) {}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
                                kmp_task_t *new_task) {}

// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
// execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {}

// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {}

// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, void *codeptr_ra) {}

template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {}

#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {}

// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {}

// Task Reduction implementation
//
// Note: initial implementation didn't take into account the possibility
// to specify omp_orig for initializer of the UDR (user defined reduction).
// Corrected implementation takes into account the omp_orig object.
// Compiler is free to use old implementation if omp_orig is not specified.

/*!
@ingroup BASIC_TYPES
@{
*/

/*!
Flags for special info per task reduction item.
*/
kmp_taskred_flags_t;

/*!
Internal struct for reduction data item related info set up by compiler.
*/
kmp_task_red_input_t;

/*!
Internal struct for reduction data item related info saved by the library.
*/
kmp_taskred_data_t;

/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
kmp_taskred_input_t;
/*!
@}
*/

template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                             kmp_task_red_input_t &src) {}
template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                            kmp_taskred_input_t &src) {}

template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                           size_t offset) {}
template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                          size_t offset) {}
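
// A minimal, disabled sketch of the two initializer call shapes implied by
// the old and new interfaces described above: one argument (object only)
// versus two arguments (object plus omp_orig).  The function-pointer casts
// are illustrative assumptions about the compiler-generated routines.
#if 0
static void call_reduce_init_sketch(void *reduce_init, void *obj, void *orig,
                                    bool has_orig) {
  if (reduce_init == nullptr)
    return;
  if (has_orig) // new interface: initializer also receives omp_orig
    ((void (*)(void *, void *))reduce_init)(obj, orig);
  else // old interface: initializer receives only the object
    ((void (*)(void *))reduce_init)(obj);
}
#endif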

template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {}

/*!
@ingroup TASKING
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter: a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {}

/*!
@ingroup TASKING
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
omp_orig.
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {}

// Copy task reduction data (except for shared pointers).
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
                                    kmp_taskgroup_t *tg, void *reduce_data) {}

/*!
@ingroup TASKING
@param gtid    Global thread ID
@param tskgrp  The taskgroup ID (optional)
@param data    Shared location of the item
@return The pointer to per-thread data

Get thread-specific location of data item
*/
void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {}

// Finalize task reduction.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {}

// Cleanup task reduction data for parallel or worksharing,
// do not touch task private data other threads still working with.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {}

template <typename T>
void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                         int num, T *data) {}

/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter: a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
*/
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                          int num, void *data) {}

/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
omp_orig.
*/
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
                                   void *data) {}

/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {}

// __kmpc_taskgroup: Start a new taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid) {}

// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
//                       and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {}

static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
                                           kmp_task_team_t *task_team,
                                           kmp_int32 is_constrained) {}

// __kmp_remove_my_task: remove a task from my own deque
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {}

// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {}
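
// A minimal, disabled sketch of the control flow described above: keep taking
// work (own deque first, then stealing) and executing it until the flag
// condition is reached or no work is left; with no spin location, execute at
// most one task.  flag_done(), get_any_task() and run_task() are hypothetical
// helpers, not runtime entry points, and the return values only mirror the
// true/false convention stated in the comment.
#if 0
static int execute_tasks_sketch(void *spinner, bool (*flag_done)(void *),
                                void *(*get_any_task)(void),
                                void (*run_task)(void *)) {
  for (;;) {
    void *task = get_any_task(); // pop own deque, else try to steal
    if (task == nullptr)
      return 0; // no work left anywhere
    run_task(task);
    if (spinner == nullptr)
      return 0; // single-task mode: execute one task and return
    if (flag_done(spinner))
      return 1; // spin condition satisfied
  }
}
#endif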

template <bool C, bool S>
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {}

template <bool C, bool S>
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {}

template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {}

template int
__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                     kmp_flag_32<false, false> *, int,
                                     int *USE_ITT_BUILD_ARG(), kmp_int32);

template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<false, true> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(),
                                                 kmp_int32);

template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<true, false> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(),
                                                 kmp_int32);

template int __kmp_atomic_execute_tasks_64<false, true>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
    int *USE_ITT_BUILD_ARG(), kmp_int32);

template int __kmp_atomic_execute_tasks_64<true, false>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
    int *USE_ITT_BUILD_ARG(), kmp_int32);

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
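
// A minimal, disabled sketch of the reference-counting lifetime described
// above: the primary thread publishes the struct before the release phase,
// every thread drops one reference at the next barrier, and whichever thread
// drops the last reference frees it, so structs for consecutive barriers can
// be alive at the same time.  task_team_ref_t and its helpers are
// hypothetical, not the kmp_task_team_t machinery itself.
#if 0
#include <atomic>
#include <cstdlib>

typedef struct task_team_ref {
  std::atomic<int> refs;
} task_team_ref_t;

static task_team_ref_t *publish_task_team(int nthreads) {
  task_team_ref_t *tt = (task_team_ref_t *)malloc(sizeof(task_team_ref_t));
  tt->refs.store(nthreads, std::memory_order_relaxed);
  return tt; // made visible to workers before the barrier release phase
}

static void unref_task_team(task_team_ref_t *tt) {
  // The last thread to leave the release phase of the next barrier frees it.
  if (tt->refs.fetch_sub(1, std::memory_order_acq_rel) == 1)
    free(tt);
}
#endif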

// Free list for task_team data structures
static kmp_task_team_t *__kmp_free_task_teams = NULL;
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque.  This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {}

// __kmp_realloc_task_threads_data:
// Allocates a threads_data array for a task team, either by allocating an
// initial array or enlarging an existing array.  Only the first thread to get
// the lock allocates or enlarges the array and re-initializes the array
// elements.
// That thread returns "TRUE", the rest return "FALSE".
// Assumes that the new array size is given by task_team -> tt.tt_nproc.
// The current size is given by task_team -> tt.tt_max_threads.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {}
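
// A minimal, disabled sketch of the "first thread to get the lock grows the
// array" protocol described above: every thread may call ensure_capacity(),
// but only the caller that still sees the old size under the lock performs
// the reallocation and reports true.  The helper and its std::vector storage
// are illustrative, not the runtime's threads_data handling.
#if 0
#include <cstddef>
#include <mutex>
#include <vector>

static bool ensure_capacity(std::vector<int> &arr, std::mutex &lock,
                            std::size_t wanted) {
  std::lock_guard<std::mutex> guard(lock);
  if (arr.size() >= wanted)
    return false; // another thread already grew the array
  arr.resize(wanted); // existing elements are preserved
  return true; // this caller performed the (re)allocation
}
#endif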

// __kmp_free_task_threads_data:
// Deallocates a threads_data array for a task team, including any attached
// tasking deques.  Only occurs at library shutdown.
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {}

// __kmp_free_task_pri_list:
// Deallocates tasking deques used for priority tasks.
// Only occurs at library shutdown.
static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {}

static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
                                        kmp_team_t *team) {}

// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible.  Also initializes data
// structures.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {}

// __kmp_free_task_team:
// Frees the task team associated with a specific thread, and adds it
// to the global task team free list.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {}

// __kmp_reap_task_teams:
// Free all the task teams on the task team free list.
// Should only be done during library shutdown.
// Cannot do anything that needs a thread structure or gtid since they are
// already gone.
void __kmp_reap_task_teams(void) {}

// View the array of two task team pointers as a pair of pointers:
//  1) a single task_team pointer
//  2) next pointer for stack
// Serial teams can create a stack of task teams for nested serial teams.
void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {}

// Serial team pops a task team off the stack
void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {}

// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {}

// __kmp_task_team_setup:  Create a task_team for the current team, but use
// an already created, unused one if it already exists.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {}

// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier.  This may be
// called by any thread. This is not called for serial or root teams.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {}

// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
// barrier gather phase. Only called by the primary thread.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, primary
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {}

// __kmp_tasking_barrier:
// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {}

// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {}
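
// A minimal, disabled sketch of the two preconditions named above: the target
// deque must exist and must have room, otherwise the task is not handed over
// and the caller tries another thread.  give_deque_t and try_give() are
// illustrative only.
#if 0
typedef struct give_deque {
  void **buf; // remains null until the owning thread allocates its deque
  int size, head, ntasks;
} give_deque_t;

static bool try_give(give_deque_t *d, void *task) {
  if (d == nullptr || d->buf == nullptr)
    return false; // target thread never allocated its deque
  if (d->ntasks >= d->size)
    return false; // deque is full
  d->buf[(d->head + d->ntasks) % d->size] = task;
  d->ntasks++;
  return true;
}
#endif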

#define PROXY_TASK_FLAG
/* The finish of the proxy tasks is divided in two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_task counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_task of the proxy task to synchronize the top and bottom
   half. */
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {}

static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {}

static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {}
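
// A minimal, disabled sketch of the ordering constraints described above: the
// bottom half is queued before the parent's counter may drop, and a latch on
// the proxy (initialized to 2, one reference per half) keeps the bottom half
// from freeing the task while the second top half is still touching it.  All
// names are hypothetical; the runtime uses the proxy's own
// td_incomplete_child_tasks as this latch.
#if 0
#include <atomic>

struct proxy_sketch {
  std::atomic<int> latch{2};     // stands in for the proxy's own counter
  std::atomic<int> *parent_kids; // the parent's incomplete-children counter
};

static void free_proxy_sketch(proxy_sketch *p) { delete p; }

// Second top half (any thread): runs only after the bottom half has been
// queued, so the parent's counter may now be decremented.
static void second_top_half_sketch(proxy_sketch *p) {
  p->parent_kids->fetch_sub(1, std::memory_order_acq_rel);
  if (p->latch.fetch_sub(1, std::memory_order_acq_rel) == 1)
    free_proxy_sketch(p); // bottom half already finished
}

// Bottom half (a thread of the team): must not free the task while the
// second top half may still be using it, hence the shared latch.
static void bottom_half_sketch(proxy_sketch *p) {
  if (p->latch.fetch_sub(1, std::memory_order_acq_rel) == 1)
    free_proxy_sketch(p); // second top half already finished
}
#endif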

/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run the top and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {}

void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {}

/*!
@ingroup TASKING
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that could not belong to
the team.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {}

kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
                                                kmp_task_t *task) {}

void __kmp_fulfill_event(kmp_event_t *event) {}

// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// taskloop_recur: used only when dealing with taskgraph,
//      indicating whether we need to update task->td_task_id
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
#if OMPX_TASKGRAPH
                                 , int taskloop_recur
#endif
) {}

// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
p_task_dup_t;

KMP_BUILD_ASSERT();

// Class to encapsulate manipulating loop bounds in a taskloop task.
// This abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
class kmp_taskloop_bounds_t {};

// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iterations count
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_int64 last_chunk, kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {}
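
// A minimal, disabled sketch of how the chunk parameters above combine: the
// first `extras` tasks receive one extra iteration, and with a strict
// grainsize the final task instead absorbs the non-positive last_chunk
// adjustment, so the chunk sizes always sum to tc.  chunk_size_sketch() is
// illustrative only.
#if 0
#include <cstdint>

static uint64_t chunk_size_sketch(uint64_t i, uint64_t num_tasks,
                                  uint64_t grainsize, uint64_t extras,
                                  int64_t last_chunk) {
  if (last_chunk < 0 && i == num_tasks - 1)
    return (uint64_t)((int64_t)grainsize + last_chunk); // short final chunk
  return grainsize + (i < extras ? 1 : 0);
}
// Summing chunk_size_sketch() over i = 0..num_tasks-1 yields
// num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras) == tc.
#endif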

// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
__taskloop_params_t;

void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
                          kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);

// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {}

// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
                          kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {}

static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {}

/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {}

/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {}

/*!
@ingroup TASKING
@param gtid Global Thread ID of current thread
@return Returns a pointer to the thread's current task async handle. If no task
is present or gtid is invalid, returns NULL.

Acquires a pointer to the target async handle from the current task.
*/
void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {}

/*!
@ingroup TASKING
@param gtid Global Thread ID of current thread
@return Returns TRUE if the current task being executed by the given thread has
a task team allocated to it. Otherwise, returns FALSE.

Checks if the current thread has a task team.
*/
bool __kmpc_omp_has_task_team(kmp_int32 gtid) {}

#if OMPX_TASKGRAPH
// __kmp_find_tdg: identify a TDG through its ID
// gtid:   Global Thread ID
// tdg_id: ID of the TDG
// returns: a pointer to the TDG corresponding to this ID if one is found and
// is not in its initial state, otherwise nullptr
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
  kmp_tdg_info_t *res = nullptr;
  if (__kmp_max_tdgs == 0)
    return res;

  if (__kmp_global_tdgs == NULL)
    __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
        sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);

  if ((__kmp_global_tdgs[tdg_id]) &&
      (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
    res = __kmp_global_tdgs[tdg_id];
  return res;
}

// __kmp_print_tdg_dot: prints the TDG to a dot file
// tdg:    pointer to the TDG to print
void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) {
  kmp_int32 tdg_id = tdg->tdg_id;
  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n",
                __kmp_get_gtid(), tdg_id));

  char file_name[20];
  sprintf(file_name, "tdg_%d.dot", tdg_id);
  kmp_safe_raii_file_t tdg_file(file_name, "w");

  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
  fprintf(tdg_file,
          "digraph TDG {\n"
          "   compound=true\n"
          "   subgraph cluster {\n"
          "      label=TDG_%d\n",
          tdg_id);
  for (kmp_int32 i = 0; i < num_tasks; i++) {
    fprintf(tdg_file, "      %d[style=bold]\n", i);
  }
  fprintf(tdg_file, "   }\n");
  for (kmp_int32 i = 0; i < num_tasks; i++) {
    kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
    kmp_int32 *successors = tdg->record_map[i].successors;
    if (nsuccessors > 0) {
      for (kmp_int32 j = 0; j < nsuccessors; j++)
        fprintf(tdg_file, "   %d -> %d \n", i, successors[j]);
    }
  }
  fprintf(tdg_file, "}");
  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
}

// __kmp_exec_tdg: launch the execution of a previously
// recorded TDG
// gtid:   Global Thread ID
// tdg:    pointer to the TDG to execute
void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));
  kmp_node_info_t *this_record_map = tdg->record_map;
  kmp_int32 *this_root_tasks = tdg->root_tasks;
  kmp_int32 this_num_roots = tdg->num_roots;
  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *parent_task = thread->th.th_current_task;

  if (tdg->rec_taskred_data) {
    __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
  }

  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
    kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);

    td->td_parent = parent_task;
    this_record_map[j].parent_task = parent_task;

    kmp_taskgroup_t *parent_taskgroup =
        this_record_map[j].parent_task->td_taskgroup;

    KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
                      this_record_map[j].npredecessors);
    KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);

    if (parent_taskgroup) {
      KMP_ATOMIC_INC(&parent_taskgroup->count);
      // The taskgroup is different so we must update it
      td->td_taskgroup = parent_taskgroup;
    } else if (td->td_taskgroup != nullptr) {
      // If the parent doesn't have a taskgroup, remove it from the task
      td->td_taskgroup = nullptr;
    }
    if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
  }

  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
    __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
  }
  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));
}

// __kmp_start_record: set up a TDG structure and set the
// recording flag to true
// gtid:   Global Thread ID of the encountering thread
// flags:  Flags associated with the TDG
// tdg_id: ID of the TDG to record
static inline void __kmp_start_record(kmp_int32 gtid,
                                      kmp_taskgraph_flags_t *flags,
                                      kmp_int32 tdg_id) {
  kmp_tdg_info_t *tdg =
      (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
  // Initializing the TDG structure
  tdg->tdg_id = tdg_id;
  tdg->map_size = INIT_MAPSIZE;
  tdg->num_roots = -1;
  tdg->root_tasks = nullptr;
  tdg->tdg_status = KMP_TDG_RECORDING;
  tdg->rec_num_taskred = 0;
  tdg->rec_taskred_data = nullptr;
  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);

  // Initializing the list of nodes in this TDG
  kmp_node_info_t *this_record_map =
      (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
    kmp_int32 *successorsList =
        (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
    this_record_map[i].task = nullptr;
    this_record_map[i].successors = successorsList;
    this_record_map[i].nsuccessors = 0;
    this_record_map[i].npredecessors = 0;
    this_record_map[i].successors_size = __kmp_successors_size;
    KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
  }

  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
}

// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
// the beginning of the record process of a task region
// loc_ref:     Location of TDG, not used yet
// gtid:        Global Thread ID of the encountering thread
// input_flags: Flags associated with the TDG
// tdg_id:      ID of the TDG to record, for now, incremental integer
// returns:     1 if we record, otherwise, 0
kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
                                   kmp_int32 input_flags, kmp_int32 tdg_id) {

  kmp_int32 res;
  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
  KA_TRACE(10,
           ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
            gtid, loc_ref, input_flags, tdg_id));

  if (__kmp_max_tdgs == 0) {
    KA_TRACE(
        10,
        ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
         "__kmp_max_tdgs = 0\n",
         gtid, loc_ref, input_flags, tdg_id));
    return 1;
  }

  __kmpc_taskgroup(loc_ref, gtid);
  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
    // TODO: use re_record flag
    __kmp_exec_tdg(gtid, tdg);
    res = 0;
  } else {
    __kmp_curr_tdg_idx = tdg_id;
    KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
    __kmp_start_record(gtid, flags, tdg_id);
    __kmp_num_tdg++;
    res = 1;
  }
  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
                gtid, tdg_id, res ? "record" : "execute"));
  return res;
}

// __kmp_end_record: set up a TDG after recording it
// gtid:   Global thread ID
// tdg:    Pointer to the TDG
void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
  // Store roots
  kmp_node_info_t *this_record_map = tdg->record_map;
  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
  kmp_int32 *this_root_tasks =
      (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
  kmp_int32 this_map_size = tdg->map_size;
  kmp_int32 this_num_roots = 0;
  kmp_info_t *thread = __kmp_threads[gtid];

  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
    if (this_record_map[i].npredecessors == 0) {
      this_root_tasks[this_num_roots++] = i;
    }
  }

  // Update with roots info and mapsize
  tdg->map_size = this_map_size;
  tdg->num_roots = this_num_roots;
  tdg->root_tasks = this_root_tasks;
  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
  tdg->tdg_status = KMP_TDG_READY;

  if (thread->th.th_current_task->td_dephash) {
    __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
    thread->th.th_current_task->td_dephash = NULL;
  }

  // Reset predecessor counter
  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
    KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
                      this_record_map[i].npredecessors);
  }
  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);

  if (__kmp_tdg_dot)
    __kmp_print_tdg_dot(tdg);
}

// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
// the end of recording phase
//
// loc_ref:      Source location information
// gtid:         Global thread ID
// input_flags:  Flags attached to the graph
// tdg_id:       ID of the TDG just finished recording
void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
                            kmp_int32 input_flags, kmp_int32 tdg_id) {
  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);

  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
                " tdg=%d with flags=%d\n",
                gtid, loc_ref, tdg_id, input_flags));
  if (__kmp_max_tdgs) {
    // TODO: use input_flags->nowait
    __kmpc_end_taskgroup(loc_ref, gtid);
    if (__kmp_tdg_is_recording(tdg->tdg_status))
      __kmp_end_record(gtid, tdg);
  }
  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
                " tdg=%d, its status is now READY\n",
                gtid, loc_ref, tdg_id));
}
#endif