// llvm/openmp/runtime/test/ompt/callback.h

#ifndef _BSD_SOURCE
#define _BSD_SOURCE
#endif
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE
#endif
#include <stdio.h>
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <omp.h>
#include <omp-tools.h>
#include "ompt-signal.h"

// Used to detect architecture
#include "../../src/kmp_platform.h"

#ifndef _TOOL_PREFIX
#define _TOOL_PREFIX ""
// If no _TOOL_PREFIX is set, we assume that we run as part of an OMPT test
#define _OMPT_TESTS
#endif

// Printable names for thread types. Presumably indexed by the numeric
// ompt_thread_t value, with index 0 as the "undefined" placeholder -- the
// lookup itself is not visible in this part of the file; confirm at the caller.
static const char *ompt_thread_t_values[] = {
    "ompt_thread_UNDEFINED", "ompt_thread_initial", "ompt_thread_worker",
    "ompt_thread_other"};

// Printable names for task status values. The trailing comments give the
// array index of each entry; index 0 is the "undefined" placeholder.
// Presumably indexed by the numeric ompt_task_status_t value -- the lookup is
// not visible in this part of the file, so keep the order in sync with the
// enum.
static const char *ompt_task_status_t_values[] = {
    "ompt_task_UNDEFINED",
    "ompt_task_complete", // 1
    "ompt_task_yield", // 2
    "ompt_task_cancel", // 3
    "ompt_task_detach", // 4
    "ompt_task_early_fulfill", // 5
    "ompt_task_late_fulfill", // 6
    "ompt_task_switch", // 7
    "ompt_taskwait_complete" // 8
};
// Printable names for the cancel flag bits. on_ompt_callback_cancel() picks
// entries by fixed position: indices 0-3 are used for the
// parallel/sections/loop/taskgroup bits and indices 4-6 for the
// activated/detected/discarded_task bits, so the order must not change.
static const char* ompt_cancel_flag_t_values[] = {
  "ompt_cancel_parallel",
  "ompt_cancel_sections",
  "ompt_cancel_loop",
  "ompt_cancel_taskgroup",
  "ompt_cancel_activated",
  "ompt_cancel_detected",
  "ompt_cancel_discarded_task"
};

// Printable names for worksharing construct kinds; index 0 is the
// "undefined" placeholder. Presumably indexed by the numeric ompt_work_t
// value -- the lookup is not visible in this part of the file. The entries
// parallel those of ompt_work_events_t_values one-to-one.
static const char *ompt_work_t_values[] = {"undefined",
                                           "ompt_work_loop",
                                           "ompt_work_sections",
                                           "ompt_work_single_executor",
                                           "ompt_work_single_other",
                                           "ompt_work_workshare",
                                           "ompt_work_distribute",
                                           "ompt_work_taskloop",
                                           "ompt_work_scope",
                                           "ompt_work_workdistribute",
                                           "ompt_work_loop_static",
                                           "ompt_work_loop_dynamic",
                                           "ompt_work_loop_guided",
                                           "ompt_work_loop_other"};

// Event-name stems printed by on_ompt_callback_work(), which indexes this
// table with its ompt_work_t argument and appends "_begin" or "_end".
// Index 0 is the "undefined" placeholder; the order must match the numeric
// values of the ompt_work_t enumerators.
static const char *ompt_work_events_t_values[] = {"undefined",
                                                  "ompt_event_loop",
                                                  "ompt_event_sections",
                                                  "ompt_event_single_in_block",
                                                  "ompt_event_single_others",
                                                  "ompt_event_workshare",
                                                  "ompt_event_distribute",
                                                  "ompt_event_taskloop",
                                                  "ompt_event_scope",
                                                  "ompt_event_workdistribute",
                                                  "ompt_event_loop_static",
                                                  "ompt_event_loop_dynamic",
                                                  "ompt_event_loop_guided",
                                                  "ompt_event_loop_other"};

// Printable names for task dependence types. The table is sparse: indices
// 1-7 and 34-35 carry names, everything in between is an empty string so
// that the named entries land on the index given in the trailing comments.
// Presumably indexed by the numeric ompt_dependence_type_t value -- the
// lookup is not visible in this part of the file. The explicit size of 36
// must equal the number of initializers.
static const char *ompt_dependence_type_t_values[36] = {
    "ompt_dependence_type_UNDEFINED",
    "ompt_dependence_type_in", // 1
    "ompt_dependence_type_out", // 2
    "ompt_dependence_type_inout", // 3
    "ompt_dependence_type_mutexinoutset", // 4
    "ompt_dependence_type_source", // 5
    "ompt_dependence_type_sink", // 6
    "ompt_dependence_type_inoutset", // 7
    "", "", "", "", "", "", // 8-13
    "", "", "", "", "", "", "", "", "", "", // 14-23
    "", "", "", "", "", "", "", "", "", "", // 24-33
    "ompt_dependence_type_out_all_memory", // 34
    "ompt_dependence_type_inout_all_memory" // 35
};

// Name fragments for synchronization region kinds. The sync_region and
// sync_region_wait callbacks index this table with their ompt_sync_region_t
// argument and splice the result into the printed event name
// ("ompt_event_<name>_begin" etc.). Index 0 is the "undefined" placeholder;
// the order must match the numeric values of the enumerators.
static const char *ompt_sync_region_t_values[] = {"undefined",
                                                  "barrier",
                                                  "barrier_implicit",
                                                  "barrier_explicit",
                                                  "barrier_implementation",
                                                  "taskwait",
                                                  "taskgroup",
                                                  "reduction",
                                                  "barrier_implicit_workshare",
                                                  "barrier_implicit_parallel",
                                                  "barrier_teams"};

// Writes a textual rendering of the task-type bit mask `type` into `buffer`.
//
// The names of the set "kind" bits (initial, implicit, explicit, target,
// taskwait) are appended back to back without a separator; each set modifier
// bit (undeferred, untied, final, mergeable, merged) is appended with a
// leading '|'. The result is always NUL-terminated, and is the empty string
// when none of the tested bits is set.
//
// `buffer` must be large enough for every name that can be emitted; no bounds
// checking is done here (the callers in this file pass 2048 bytes).
static void format_task_type(int type, char *buffer) {
  char *progress = buffer;
  // Terminate up front: when no tested bit is set none of the sprintf calls
  // below runs, and the caller would otherwise print an uninitialized buffer.
  *progress = '\0';
  if (type & ompt_task_initial)
    progress += sprintf(progress, "ompt_task_initial");
  if (type & ompt_task_implicit)
    progress += sprintf(progress, "ompt_task_implicit");
  if (type & ompt_task_explicit)
    progress += sprintf(progress, "ompt_task_explicit");
  if (type & ompt_task_target)
    progress += sprintf(progress, "ompt_task_target");
  if (type & ompt_task_taskwait)
    progress += sprintf(progress, "ompt_task_taskwait");
  if (type & ompt_task_undeferred)
    progress += sprintf(progress, "|ompt_task_undeferred");
  if (type & ompt_task_untied)
    progress += sprintf(progress, "|ompt_task_untied");
  if (type & ompt_task_final)
    progress += sprintf(progress, "|ompt_task_final");
  if (type & ompt_task_mergeable)
    progress += sprintf(progress, "|ompt_task_mergeable");
  if (type & ompt_task_merged)
    progress += sprintf(progress, "|ompt_task_merged");
}

// File-scope function pointers for the OMPT runtime entry points. The
// helpers and callbacks in this file call through them (for example
// ompt_get_thread_data(), ompt_get_task_info(), ompt_get_unique_id() and
// ompt_set_callback()). They are not assigned anywhere in this part of the
// file -- presumably they are filled in during tool initialization; they must
// be set before any code that calls through them runs.
static ompt_set_callback_t ompt_set_callback;
static ompt_get_callback_t ompt_get_callback;
static ompt_get_state_t ompt_get_state;
static ompt_get_task_info_t ompt_get_task_info;
static ompt_get_task_memory_t ompt_get_task_memory;
static ompt_get_thread_data_t ompt_get_thread_data;
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_unique_id_t ompt_get_unique_id;
static ompt_finalize_tool_t ompt_finalize_tool;
static ompt_get_num_procs_t ompt_get_num_procs;
static ompt_get_num_places_t ompt_get_num_places;
static ompt_get_place_proc_ids_t ompt_get_place_proc_ids;
static ompt_get_place_num_t ompt_get_place_num;
static ompt_get_partition_place_nums_t ompt_get_partition_place_nums;
static ompt_get_proc_id_t ompt_get_proc_id;
static ompt_enumerate_states_t ompt_enumerate_states;
static ompt_enumerate_mutex_impls_t ompt_enumerate_mutex_impls;

// Prints one line describing the task at ancestor `level` of the calling
// thread: parallel id, task id, exit/enter frame pointers, task type (as text
// and as number) and thread number, all obtained from ompt_get_task_info().
//
// Nothing is printed when no frame pointer is available. When the runtime
// reports that no task exists at `level`, the ids are printed as 0.
static void print_ids(int level)
{
  // Give every out-parameter a defined value: they are read below even when
  // ompt_get_task_info() returns 0, and nothing visible here guarantees that
  // the runtime writes them in that case.
  int task_type = 0, thread_num = 0;
  ompt_frame_t *frame = NULL;
  ompt_data_t *task_parallel_data = NULL;
  ompt_data_t *task_data = NULL;
  int exists_task = ompt_get_task_info(level, &task_type, &task_data, &frame,
                                       &task_parallel_data, &thread_num);
  char buffer[2048];
  // Keep the buffer a valid string even if format_task_type() appends nothing.
  buffer[0] = '\0';
  format_task_type(task_type, buffer);
  if (frame)
    printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", exit_frame=%p, reenter_frame=%p, "
           "task_type=%s=%d, thread_num=%d\n",
           ompt_get_thread_data()->value, level,
           (exists_task && task_parallel_data) ? task_parallel_data->value : 0,
           (exists_task && task_data) ? task_data->value : 0,
           frame->exit_frame.ptr, frame->enter_frame.ptr, buffer, task_type,
           thread_num);
}

// Returns the frame address `level` frames up from the point of expansion,
// via the compiler builtin. `level` is forwarded unchanged to
// __builtin_frame_address.
#define get_frame_address(level) __builtin_frame_address(level)

// Prints "<thread id>: __builtin_frame_address(<level>)=<address>" for the
// calling thread. Expands to a printf call without a trailing semicolon.
#define print_frame(level)                                                     \
  printf("%" PRIu64 ": __builtin_frame_address(%d)=%p\n",                      \
         ompt_get_thread_data()->value, level, get_frame_address(level))

// clang (version 5.0 and above) adds an intermediate function call with debug flag (-g)
//
// print_frame_from_outlined_fn(level) is only provided when the including
// file defines TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN. It behaves like
// print_frame(level), except that with DEBUG defined and clang >= 5 it looks
// one frame further up to skip the intermediate call mentioned above.
// On clang >= 5 two #warning lines additionally remind the user to define
// DEBUG exactly when compiling with -g.
#if defined(TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN)
  #if defined(DEBUG) && defined(__clang__) && __clang_major__ >= 5
    #define print_frame_from_outlined_fn(level) print_frame(level+1)
  #else
    #define print_frame_from_outlined_fn(level) print_frame(level)
  #endif

  #if defined(__clang__) && __clang_major__ >= 5
    #warning "Clang 5.0 and later add an additional wrapper for outlined functions when compiling with debug information."
    #warning "Please define -DDEBUG iff you manually pass in -g to make the tests succeed!"
  #endif
#endif

// This macro helps to define a label at the current position that can be used
// to get the current address in the code.
//
// For print_current_address():
//   To reliably determine the offset between the address of the label and the
//   actual return address, we insert a NOP instruction as a jump target as the
//   compiler would otherwise insert an instruction that we can't control. The
//   instruction length is target dependent and is explained below.
//
// (The empty block between "#pragma omp ..." and the __asm__ statement is a
// workaround for a bug in the Intel Compiler.)
//
// The expansion is: an empty block, the NOP, then the label "ompt_label_<id>:".
// Because it ends in a label, a statement must follow the macro invocation,
// and each `id` may be used only once per function.
#define define_ompt_label(id) \
  {} \
  __asm__("nop"); \
ompt_label_##id:

// This macro helps to get the address of a label that is inserted by the above
// macro define_ompt_label(). The address is obtained with a GNU extension
// (&&label) that has been tested with gcc, clang and icc.
#define get_ompt_label_address(id) (&& ompt_label_##id)

// This macro prints the exact address that a previously called runtime function
// returns to.
// It places the label "ompt_label_<id>" at the point of expansion and hands
// the label's address to print_possible_return_addresses(), which is defined
// per target architecture and subtracts the architecture-specific offsets.
#define print_current_address(id) \
  define_ompt_label(id) \
  print_possible_return_addresses(get_ompt_label_address(id))

// print_possible_return_addresses(addr) is selected per target architecture.
// Every variant prints the calling thread's id followed by the candidate
// return addresses, each computed by subtracting a fixed byte offset from
// `addr` (the address of a label created by define_ompt_label()). Unsupported
// architectures fail the build with #error.
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// On X86 the NOP instruction is 1 byte long. In addition, the compiler inserts
// a MOV instruction for non-void runtime functions which is 3 bytes long.
#define print_possible_return_addresses(addr) \
  printf("%" PRIu64 ": current_address=%p or %p for non-void functions\n", \
         ompt_get_thread_data()->value, ((char *)addr) - 1, ((char *)addr) - 4)
#elif KMP_ARCH_PPC64
// On Power the NOP instruction is 4 bytes long. In addition, the compiler
// inserts a second NOP instruction (another 4 bytes). For non-void runtime
// functions Clang inserts a STW instruction (but only if compiling under
// -fno-PIC which will be the default with Clang 8.0, another 4 bytes).
#define print_possible_return_addresses(addr) \
  printf("%" PRIu64 ": current_address=%p or %p\n", ompt_get_thread_data()->value, \
         ((char *)addr) - 8, ((char *)addr) - 12)
#elif KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32
// On AArch64 the NOP instruction is 4 bytes long, can be followed by inserted
// store instruction (another 4 bytes long).
// FIXME: PR #65696 added a third possibility (12 byte offset) to make the
// tests pass on Darwin. Adding the same for other OSes. However, the proper
// fix for this is to remove the extra branch instruction being generated by
// the AArch64 backend. See issue #69627.
#define print_possible_return_addresses(addr)                                  \
  printf("%" PRIu64 ": current_address=%p or %p or %p\n",                      \
         ompt_get_thread_data()->value, ((char *)addr) - 4,                    \
         ((char *)addr) - 8, ((char *)addr) - 12)
#elif KMP_ARCH_RISCV64
#if __riscv_compressed
// On RV64GC the C.NOP instruction is 2 byte long. In addition, the compiler
// inserts a J instruction (targeting the successor basic block), which
// accounts for another 4 bytes. Finally, an additional J instruction may
// appear (adding 4 more bytes) when the C.NOP is referenced elsewhere (ie.
// another branch).
#define print_possible_return_addresses(addr) \
  printf("%" PRIu64 ": current_address=%p or %p\n", \
         ompt_get_thread_data()->value, ((char *)addr) - 6, ((char *)addr) - 10)
#else
// On RV64G the NOP instruction is 4 byte long. In addition, the compiler
// inserts a J instruction (targeting the successor basic block), which
// accounts for another 4 bytes. Finally, an additional J instruction may
// appear (adding 4 more bytes) when the NOP is referenced elsewhere (ie.
// another branch).
#define print_possible_return_addresses(addr) \
  printf("%" PRIu64 ": current_address=%p or %p\n", \
         ompt_get_thread_data()->value, ((char *)addr) - 8, ((char *)addr) - 12)
#endif
#elif KMP_ARCH_LOONGARCH64
// On LoongArch64 the NOP instruction is 4 bytes long, can be followed by
// inserted jump instruction (another 4 bytes long). And an additional jump
// instruction may appear (adding 4 more bytes) when the NOP is referenced
// elsewhere (ie. another branch).
#define print_possible_return_addresses(addr)                                  \
  printf("%" PRIu64 ": current_address=%p or %p or %p\n",                      \
         ompt_get_thread_data()->value, ((char *)addr) - 4,                    \
         ((char *)addr) - 8, ((char *)addr) - 12)
#elif KMP_ARCH_VE
// On VE the NOP instruction is 8 byte long. In addition, the compiler inserts
// a ??? instruction for non-void runtime functions which is ? bytes long.
// NOTE(review): the second candidate is still a placeholder -- both printed
// addresses use the same offset of 8 bytes. Fill in the real offset once the
// inserted instruction and its length are known.
#define print_possible_return_addresses(addr)                                  \
  printf("%" PRIu64 ": current_address=%p or %p\n",                            \
         ompt_get_thread_data()->value, ((char *)addr) - 8,                    \
         ((char *)addr) - 8)
#elif KMP_ARCH_S390X
// On s390x the NOP instruction is 2 bytes long. For non-void runtime
// functions Clang inserts a STY instruction (but only if compiling under
// -fno-PIC which will be the default with Clang 8.0, another 6 bytes).
//
// Another possibility is:
//
//                brasl %r14,__kmpc_end_master@plt
//   a7 f4 00 02  j 0f
//   47 00 00 00  0: nop
//   a7 f4 00 02  j addr
//                addr:
#define print_possible_return_addresses(addr)                                  \
  printf("%" PRIu64 ": current_address=%p or %p or %p\n",                      \
         ompt_get_thread_data()->value, ((char *)addr) - 2,                    \
         ((char *)addr) - 8, ((char *)addr) - 12)
#else
#error Unsupported target architecture, cannot determine address offset!
#endif


// This macro performs a somewhat similar job to print_current_address(), except
// that it discards a certain number of nibbles from the address and only prints
// the most significant bits / nibbles. This can be used for cases where the
// return address can only be approximated.
//
// To account for overflows (ie the most significant bits / nibbles have just
// changed as we are a few bytes above the relevant power of two) the addresses
// of the "current" and of the "previous block" are printed.
#define print_fuzzy_address(id) \
  define_ompt_label(id) \
  print_fuzzy_address_blocks(get_ompt_label_address(id))

// If you change this define you need to adapt all capture patterns in the tests
// to include or discard the new number of nibbles!
#define FUZZY_ADDRESS_DISCARD_NIBBLES 2
#define FUZZY_ADDRESS_DISCARD_BYTES (1 << ((FUZZY_ADDRESS_DISCARD_NIBBLES) * 4))

// Prints the thread id, then the block number of `addr` (the address divided
// by FUZZY_ADDRESS_DISCARD_BYTES) for the previous block, the current block
// and the next two blocks, and finally the full address.
//
// `addr` is parenthesized so that any pointer expression can be passed, and
// it is converted through uintptr_t before widening to uint64_t: casting a
// pointer directly to a wider integer is a size-mismatched cast on targets
// with pointers narrower than 64 bits (GCC documents it as sign-extending).
// `addr` is evaluated more than once, so it must have no side effects.
#define print_fuzzy_address_blocks(addr)                                       \
  printf("%" PRIu64 ": fuzzy_address=0x%" PRIx64 " or 0x%" PRIx64              \
         " or 0x%" PRIx64 " or 0x%" PRIx64 " (%p)\n",                          \
         ompt_get_thread_data()->value,                                        \
         ((uint64_t)(uintptr_t)(addr)) / FUZZY_ADDRESS_DISCARD_BYTES - 1,      \
         ((uint64_t)(uintptr_t)(addr)) / FUZZY_ADDRESS_DISCARD_BYTES,          \
         ((uint64_t)(uintptr_t)(addr)) / FUZZY_ADDRESS_DISCARD_BYTES + 1,      \
         ((uint64_t)(uintptr_t)(addr)) / FUZZY_ADDRESS_DISCARD_BYTES + 2,      \
         (addr))

// Registers the handler on_<name> for the OMPT event <name> through
// ompt_set_callback(). The handler is first stored in a local of the given
// callback `type`, so the compiler checks its signature before it is cast to
// the generic ompt_callback_t. If the runtime answers ompt_set_never, a
// "0: Could not register callback '<name>'" line is printed; any other result
// is accepted silently.
#define register_ompt_callback_t(name, type)                                   \
  do {                                                                         \
    type f_##name = &on_##name;                                                \
    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
      printf("0: Could not register callback '" #name "'\n");                  \
  } while (0)

// Convenience form for the common case where the callback type is named
// <name>_t.
#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)

#ifndef USE_PRIVATE_TOOL
// Handler for the mutex_acquire event: prints one "wait" line naming the kind
// of mutex the calling thread is about to wait for, together with the wait
// id, the hint, the implementation id and the return address. Mutex kinds
// not listed below are ignored without output.
static void
on_ompt_callback_mutex_acquire(
  ompt_mutex_t kind,
  unsigned int hint,
  unsigned int impl,
  ompt_wait_id_t wait_id,
  const void *codeptr_ra)
{
  // Only the event name differs between the mutex kinds, so select it here
  // and share a single printf instead of one copy per kind.
  const char *event_name;
  switch(kind)
  {
    case ompt_mutex_lock:
      event_name = "ompt_event_wait_lock";
      break;
    case ompt_mutex_test_lock:
      event_name = "ompt_event_wait_test_lock";
      break;
    case ompt_mutex_nest_lock:
      event_name = "ompt_event_wait_nest_lock";
      break;
    case ompt_mutex_test_nest_lock:
      event_name = "ompt_event_wait_test_nest_lock";
      break;
    case ompt_mutex_critical:
      event_name = "ompt_event_wait_critical";
      break;
    case ompt_mutex_atomic:
      event_name = "ompt_event_wait_atomic";
      break;
    case ompt_mutex_ordered:
      event_name = "ompt_event_wait_ordered";
      break;
    default:
      return;
  }
  // The blank before "\n" is part of the established output format.
  printf("%" PRIu64 ":" _TOOL_PREFIX " %s: wait_id=%" PRIu64 ", hint=%" PRIu32
         ", impl=%" PRIu32 ", codeptr_ra=%p \n",
         ompt_get_thread_data()->value, event_name, wait_id, hint, impl,
         codeptr_ra);
}

// Handler for the mutex_acquired event: prints one "acquired" line naming the
// kind of mutex, the wait id and the return address. For nest locks the event
// name carries the suffix "_first"; re-acquisitions are reported by
// on_ompt_callback_nest_lock(). Mutex kinds not listed below are ignored
// without output.
static void
on_ompt_callback_mutex_acquired(
  ompt_mutex_t kind,
  ompt_wait_id_t wait_id,
  const void *codeptr_ra)
{
  // Only the event name differs between the mutex kinds, so select it here
  // and share a single printf instead of one copy per kind.
  const char *event_name;
  switch(kind)
  {
    case ompt_mutex_lock:
      event_name = "ompt_event_acquired_lock";
      break;
    case ompt_mutex_test_lock:
      event_name = "ompt_event_acquired_test_lock";
      break;
    case ompt_mutex_nest_lock:
      event_name = "ompt_event_acquired_nest_lock_first";
      break;
    case ompt_mutex_test_nest_lock:
      event_name = "ompt_event_acquired_test_nest_lock_first";
      break;
    case ompt_mutex_critical:
      event_name = "ompt_event_acquired_critical";
      break;
    case ompt_mutex_atomic:
      event_name = "ompt_event_acquired_atomic";
      break;
    case ompt_mutex_ordered:
      event_name = "ompt_event_acquired_ordered";
      break;
    default:
      return;
  }
  // The blank before "\n" is part of the established output format.
  printf("%" PRIu64 ":" _TOOL_PREFIX " %s: wait_id=%" PRIu64
         ", codeptr_ra=%p \n",
         ompt_get_thread_data()->value, event_name, wait_id, codeptr_ra);
}

// Handler for the mutex_released event: prints one "release" line naming the
// kind of mutex, the wait id and the return address. For nest locks the event
// name carries the suffix "_last"; inner releases are reported by
// on_ompt_callback_nest_lock(). Mutex kinds not listed below are ignored
// without output.
static void
on_ompt_callback_mutex_released(
  ompt_mutex_t kind,
  ompt_wait_id_t wait_id,
  const void *codeptr_ra)
{
  // Only the event name differs between the mutex kinds, so select it here
  // and share a single printf instead of one copy per kind.
  const char *event_name;
  switch(kind)
  {
    case ompt_mutex_lock:
      event_name = "ompt_event_release_lock";
      break;
    case ompt_mutex_nest_lock:
      event_name = "ompt_event_release_nest_lock_last";
      break;
    case ompt_mutex_critical:
      event_name = "ompt_event_release_critical";
      break;
    case ompt_mutex_atomic:
      event_name = "ompt_event_release_atomic";
      break;
    case ompt_mutex_ordered:
      event_name = "ompt_event_release_ordered";
      break;
    default:
      return;
  }
  // The blank before "\n" is part of the established output format.
  printf("%" PRIu64 ":" _TOOL_PREFIX " %s: wait_id=%" PRIu64
         ", codeptr_ra=%p \n",
         ompt_get_thread_data()->value, event_name, wait_id, codeptr_ra);
}

// Handler for the nest_lock event. A begin endpoint is printed as
// "acquired_nest_lock_next", an end endpoint as "release_nest_lock_prev",
// each with the wait id and the return address. ompt_scope_beginend is not
// expected here: it is reported and the process exits with status -1.
static void
on_ompt_callback_nest_lock(
    ompt_scope_endpoint_t endpoint,
    ompt_wait_id_t wait_id,
    const void *codeptr_ra)
{
  if (endpoint == ompt_scope_begin) {
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64
           ", codeptr_ra=%p \n",
           ompt_get_thread_data()->value, wait_id, codeptr_ra);
  } else if (endpoint == ompt_scope_end) {
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_release_nest_lock_prev: wait_id=%" PRIu64
           ", codeptr_ra=%p \n",
           ompt_get_thread_data()->value, wait_id, codeptr_ra);
  } else if (endpoint == ompt_scope_beginend) {
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }
}

// Handler for the sync_region event: prints
// "ompt_event_<kind name>_<begin|end>" with the parallel id (0 when
// parallel_data is NULL), the task id and the return address. At the begin of
// any barrier kind it additionally prints the ids of the current task via
// print_ids(0).
//
// ompt_scope_beginend and ompt_sync_region_reduction are not expected here:
// each is reported and the process exits with status -1.
static void
on_ompt_callback_sync_region(
  ompt_sync_region_t kind,
  ompt_scope_endpoint_t endpoint,
  ompt_data_t *parallel_data,
  ompt_data_t *task_data,
  const void *codeptr_ra)
{
  if (endpoint == ompt_scope_beginend) {
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }
  if (kind == ompt_sync_region_reduction) {
    printf("ompt_sync_region_reduction should never be passed to %s\n",
           __func__);
    exit(-1);
  }
  // Guard the table lookup: a kind outside the table is printed with the
  // "undefined" entry instead of reading past the end of the array.
  const size_t num_kind_names =
      sizeof(ompt_sync_region_t_values) / sizeof(ompt_sync_region_t_values[0]);
  const char *kind_name = ((size_t)kind < num_kind_names)
                              ? ompt_sync_region_t_values[kind]
                              : ompt_sync_region_t_values[0];
  uint64_t parallel_data_value = parallel_data ? parallel_data->value : 0;
  const char *begin_or_end = (endpoint == ompt_scope_begin) ? "begin" : "end";
  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_%s_%s: parallel_id=%" PRIu64
         ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
         ompt_get_thread_data()->value, kind_name, begin_or_end,
         parallel_data_value, task_data->value, codeptr_ra);
  switch (kind) {
  case ompt_sync_region_barrier:
  case ompt_sync_region_barrier_implicit:
  case ompt_sync_region_barrier_implicit_workshare:
  case ompt_sync_region_barrier_implicit_parallel:
  case ompt_sync_region_barrier_teams:
  case ompt_sync_region_barrier_explicit:
  case ompt_sync_region_barrier_implementation:
    if (endpoint == ompt_scope_begin)
      print_ids(0);
    break;
  default:
    break;
  }
}

// Handler for the sync_region_wait event: prints
// "ompt_event_wait_<kind name>_<begin|end>" with the parallel id (0 when
// parallel_data is NULL), the task id and the return address.
//
// ompt_scope_beginend and ompt_sync_region_reduction are not expected here:
// each is reported and the process exits with status -1.
static void
on_ompt_callback_sync_region_wait(
  ompt_sync_region_t kind,
  ompt_scope_endpoint_t endpoint,
  ompt_data_t *parallel_data,
  ompt_data_t *task_data,
  const void *codeptr_ra)
{
  if (endpoint == ompt_scope_beginend) {
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }
  if (kind == ompt_sync_region_reduction) {
    printf("ompt_sync_region_reduction should never be passed to %s\n",
           __func__);
    exit(-1);
  }
  // Guard the table lookup: a kind outside the table is printed with the
  // "undefined" entry instead of reading past the end of the array.
  const size_t num_kind_names =
      sizeof(ompt_sync_region_t_values) / sizeof(ompt_sync_region_t_values[0]);
  const char *kind_name = ((size_t)kind < num_kind_names)
                              ? ompt_sync_region_t_values[kind]
                              : ompt_sync_region_t_values[0];
  uint64_t parallel_data_value = parallel_data ? parallel_data->value : 0;
  const char *begin_or_end = (endpoint == ompt_scope_begin) ? "begin" : "end";
  printf("%" PRIu64 ":" _TOOL_PREFIX
         " ompt_event_wait_%s_%s: parallel_id=%" PRIu64 ", task_id=%" PRIu64
         ", codeptr_ra=%p\n",
         ompt_get_thread_data()->value, kind_name, begin_or_end,
         parallel_data_value, task_data->value, codeptr_ra);
}

// Handler for the reduction event: prints "ompt_event_reduction_begin" or
// "ompt_event_reduction_end" with the parallel id (0 when parallel_data is
// NULL), the task id and the return address. `kind` is not used.
// ompt_scope_beginend is not expected here: it is reported and the process
// exits with status -1.
static void on_ompt_callback_reduction(ompt_sync_region_t kind,
                                       ompt_scope_endpoint_t endpoint,
                                       ompt_data_t *parallel_data,
                                       ompt_data_t *task_data,
                                       const void *codeptr_ra) {
  // Safe to compute up front: parallel_data is only dereferenced if non-NULL.
  const uint64_t parallel_id = (parallel_data) ? parallel_data->value : 0;

  if (endpoint == ompt_scope_begin) {
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_reduction_begin: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
           ompt_get_thread_data()->value, parallel_id, task_data->value,
           codeptr_ra);
  } else if (endpoint == ompt_scope_end) {
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_reduction_end: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
           ompt_get_thread_data()->value, parallel_id, task_data->value,
           codeptr_ra);
  } else if (endpoint == ompt_scope_beginend) {
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }
}

// Handler for the flush event: prints the id stored in the thread data that
// the runtime passes in (not the one from ompt_get_thread_data()) together
// with the return address.
static void
on_ompt_callback_flush(
    ompt_data_t *thread_data,
    const void *codeptr_ra)
{
  const uint64_t thread_id = thread_data->value;
  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_flush: codeptr_ra=%p\n",
         thread_id, codeptr_ra);
}

// Handler for the cancel event: prints the task id, the flags as
// "<first>|<second>=<numeric flags>" and the return address.
//
// <first> is the name of the first matching bit among
// parallel/sections/loop/taskgroup, <second> the first matching bit among
// activated/detected/discarded_task. If no bit of a group is set, that part
// is printed as "undefined".
static void
on_ompt_callback_cancel(
    ompt_data_t *task_data,
    int flags,
    const void *codeptr_ra)
{
  // Defaults keep the %s arguments valid when `flags` carries none of the
  // bits tested below; without them uninitialized pointers would be printed.
  const char *first_flag_value = "undefined";
  const char *second_flag_value = "undefined";

  // Indices refer to fixed positions in ompt_cancel_flag_t_values.
  if(flags & ompt_cancel_parallel)
    first_flag_value = ompt_cancel_flag_t_values[0];
  else if(flags & ompt_cancel_sections)
    first_flag_value = ompt_cancel_flag_t_values[1];
  else if(flags & ompt_cancel_loop)
    first_flag_value = ompt_cancel_flag_t_values[2];
  else if(flags & ompt_cancel_taskgroup)
    first_flag_value = ompt_cancel_flag_t_values[3];

  if(flags & ompt_cancel_activated)
    second_flag_value = ompt_cancel_flag_t_values[4];
  else if(flags & ompt_cancel_detected)
    second_flag_value = ompt_cancel_flag_t_values[5];
  else if(flags & ompt_cancel_discarded_task)
    second_flag_value = ompt_cancel_flag_t_values[6];

  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_cancel: task_data=%" PRIu64
         ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n",
         ompt_get_thread_data()->value, task_data->value, first_flag_value,
         second_flag_value, flags, codeptr_ra);
}

// Handler for the implicit_task event.
//
// On begin it assigns a fresh unique id to the task (complaining first if the
// task data was not null) and prints either an initial-task or an
// implicit-task begin line, depending on whether `flags` has
// ompt_task_initial set. For initial tasks it also assigns a fresh unique id
// to the parallel data.
//
// On end it prints the matching end line; a NULL parallel_data is printed as
// parallel id 0.
//
// ompt_scope_beginend is not expected here: it is reported and the process
// exits with status -1.
static void
on_ompt_callback_implicit_task(
    ompt_scope_endpoint_t endpoint,
    ompt_data_t *parallel_data,
    ompt_data_t *task_data,
    unsigned int team_size,
    unsigned int thread_num,
    int flags)
{
  switch(endpoint)
  {
    case ompt_scope_begin:
      if(task_data->ptr)
        printf("%s\n", "0: task_data initially not null");
      task_data->value = ompt_get_unique_id();

      //there is no parallel_begin callback for implicit parallel region
      //thus it is initialized in initial task
      if(flags & ompt_task_initial)
      {
        // The flags are printed numerically below, so no textual rendering
        // of the task type is needed here.

        // Only check initial task not created by teams construct
        if (team_size == 1 && thread_num == 1 && parallel_data->ptr)
          printf("%s\n", "0: parallel_data initially not null");
        parallel_data->value = ompt_get_unique_id();
        printf("%" PRIu64 ":" _TOOL_PREFIX
               " ompt_event_initial_task_begin: parallel_id=%" PRIu64
               ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32
               ", index=%" PRIu32 ", flags=%" PRIu32 "\n",
               ompt_get_thread_data()->value, parallel_data->value,
               task_data->value, team_size, thread_num, flags);
      } else {
        printf("%" PRIu64 ":" _TOOL_PREFIX
               " ompt_event_implicit_task_begin: parallel_id=%" PRIu64
               ", task_id=%" PRIu64 ", team_size=%" PRIu32
               ", thread_num=%" PRIu32 "\n",
               ompt_get_thread_data()->value, parallel_data->value,
               task_data->value, team_size, thread_num);
      }

      break;
    case ompt_scope_end:
      if(flags & ompt_task_initial){
        printf("%" PRIu64 ":" _TOOL_PREFIX
               " ompt_event_initial_task_end: parallel_id=%" PRIu64
               ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32
               ", index=%" PRIu32 "\n",
               ompt_get_thread_data()->value,
               (parallel_data) ? parallel_data->value : 0, task_data->value,
               team_size, thread_num);
      } else {
        printf("%" PRIu64 ":" _TOOL_PREFIX
               " ompt_event_implicit_task_end: parallel_id=%" PRIu64
               ", task_id=%" PRIu64 ", team_size=%" PRIu32
               ", thread_num=%" PRIu32 "\n",
               ompt_get_thread_data()->value,
               (parallel_data) ? parallel_data->value : 0, task_data->value,
               team_size, thread_num);
      }
      break;
    case ompt_scope_beginend:
      printf("ompt_scope_beginend should never be passed to %s\n", __func__);
      exit(-1);
  }
}

// Handler for the lock_init event: prints an "init_lock" or "init_nest_lock"
// line with the wait id, the hint, the implementation id and the return
// address. Other mutex kinds are ignored without output.
static void
on_ompt_callback_lock_init(
  ompt_mutex_t kind,
  unsigned int hint,
  unsigned int impl,
  ompt_wait_id_t wait_id,
  const void *codeptr_ra)
{
  // Only the event name differs between the two kinds, so select it here and
  // share a single printf.
  const char *event_name;
  switch(kind)
  {
    case ompt_mutex_lock:
      event_name = "ompt_event_init_lock";
      break;
    case ompt_mutex_nest_lock:
      event_name = "ompt_event_init_nest_lock";
      break;
    default:
      return;
  }
  // The blank before "\n" is part of the established output format.
  printf("%" PRIu64 ":" _TOOL_PREFIX " %s: wait_id=%" PRIu64 ", hint=%" PRIu32
         ", impl=%" PRIu32 ", codeptr_ra=%p \n",
         ompt_get_thread_data()->value, event_name, wait_id, hint, impl,
         codeptr_ra);
}

// Handler for the lock_destroy event: prints a "destroy_lock" or
// "destroy_nest_lock" line with the wait id and the return address. Other
// mutex kinds are ignored without output.
static void
on_ompt_callback_lock_destroy(
  ompt_mutex_t kind,
  ompt_wait_id_t wait_id,
  const void *codeptr_ra)
{
  // Only the event name differs between the two kinds, so select it here and
  // share a single printf.
  const char *event_name;
  switch(kind)
  {
    case ompt_mutex_lock:
      event_name = "ompt_event_destroy_lock";
      break;
    case ompt_mutex_nest_lock:
      event_name = "ompt_event_destroy_nest_lock";
      break;
    default:
      return;
  }
  // The blank before "\n" is part of the established output format.
  printf("%" PRIu64 ":" _TOOL_PREFIX " %s: wait_id=%" PRIu64
         ", codeptr_ra=%p \n",
         ompt_get_thread_data()->value, event_name, wait_id, codeptr_ra);
}

// Handler for the work event: prints "<event stem>_begin" or
// "<event stem>_end" with the parallel id, the task id, the return address
// and the count. The stem is looked up in ompt_work_events_t_values by
// `wstype`. ompt_scope_beginend is not expected here: it is reported and the
// process exits with status -1.
static void
on_ompt_callback_work(
  ompt_work_t wstype,
  ompt_scope_endpoint_t endpoint,
  ompt_data_t *parallel_data,
  ompt_data_t *task_data,
  uint64_t count,
  const void *codeptr_ra)
{
  // Guard the table lookup: a work type outside the table is printed with
  // the "undefined" entry instead of reading past the end of the array.
  const size_t num_event_names =
      sizeof(ompt_work_events_t_values) / sizeof(ompt_work_events_t_values[0]);
  const char *event_stem = ((size_t)wstype < num_event_names)
                               ? ompt_work_events_t_values[wstype]
                               : ompt_work_events_t_values[0];
  switch(endpoint)
  {
  case ompt_scope_begin:
    printf("%" PRIu64 ":" _TOOL_PREFIX " %s_begin: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n",
           ompt_get_thread_data()->value, event_stem, parallel_data->value,
           task_data->value, codeptr_ra, count);
    break;
  case ompt_scope_end:
    printf("%" PRIu64 ":" _TOOL_PREFIX " %s_end: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n",
           ompt_get_thread_data()->value, event_stem, parallel_data->value,
           task_data->value, codeptr_ra, count);
    break;
  case ompt_scope_beginend:
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }
}

// Callback for dispatch events. Prints the event name selected by `kind`
// together with the parallel id, task id, code pointer and chunk bounds.
//
// How `instance` is interpreted depends on `kind`:
//   - ompt_dispatch_section: instance.ptr is printed as codeptr_ra.
//   - ws_loop / taskloop / distribute chunk: instance.ptr is read as a
//     pointer to an ompt_dispatch_chunk_t whose start and iterations are
//     printed.
//   - any other kind: reported as "ompt_ws_loop_iteration_begin" with a NULL
//     code pointer and zero chunk bounds.
static void on_ompt_callback_dispatch(
    ompt_data_t *parallel_data,
    ompt_data_t *task_data,
    ompt_dispatch_t kind,
    ompt_data_t instance) {
  // Only ever points at string literals, so it must be const-qualified; this
  // also keeps the header well-formed when it is compiled as C++.
  const char *event_name = NULL;
  void *codeptr_ra = NULL;
  ompt_dispatch_chunk_t *dispatch_chunk = NULL;
  switch (kind) {
  case ompt_dispatch_section:
    event_name = "ompt_event_section_begin";
    codeptr_ra = instance.ptr;
    break;
  case ompt_dispatch_ws_loop_chunk:
    event_name = "ompt_event_ws_loop_chunk_begin";
    dispatch_chunk = (ompt_dispatch_chunk_t *)instance.ptr;
    break;
  case ompt_dispatch_taskloop_chunk:
    event_name = "ompt_event_taskloop_chunk_begin";
    dispatch_chunk = (ompt_dispatch_chunk_t *)instance.ptr;
    break;
  case ompt_dispatch_distribute_chunk:
    event_name = "ompt_event_distribute_chunk_begin";
    dispatch_chunk = (ompt_dispatch_chunk_t *)instance.ptr;
    break;
  default:
    event_name = "ompt_ws_loop_iteration_begin";
    break;
  }
  // Events without chunk information report zero for both chunk fields.
  printf("%" PRIu64 ":" _TOOL_PREFIX
         " %s: parallel_id=%" PRIu64 ", task_id=%" PRIu64
         ", codeptr_ra=%p, chunk_start=%" PRIu64 ", chunk_iterations=%" PRIu64
         "\n", ompt_get_thread_data()->value, event_name, parallel_data->value,
         task_data->value, codeptr_ra,
         dispatch_chunk ? dispatch_chunk->start : 0,
         dispatch_chunk ? dispatch_chunk->iterations : 0);
}

// Callback for masked regions. Prints a masked_begin or masked_end line with
// the parallel id, task id and code pointer. ompt_scope_beginend is treated
// as a fatal error: a message is printed and the process exits.
static void on_ompt_callback_masked(ompt_scope_endpoint_t endpoint,
                                    ompt_data_t *parallel_data,
                                    ompt_data_t *task_data,
                                    const void *codeptr_ra) {
  if (endpoint == ompt_scope_beginend) {
    printf("ompt_scope_beginend should never be passed to %s\n", __func__);
    exit(-1);
  }

  if (endpoint == ompt_scope_begin) {
    const uint64_t thread_id = ompt_get_thread_data()->value;
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_masked_begin: parallel_id=%" PRIu64
           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
           thread_id, parallel_data->value, task_data->value, codeptr_ra);
  } else if (endpoint == ompt_scope_end) {
    const uint64_t thread_id = ompt_get_thread_data()->value;
    printf("%" PRIu64 ":" _TOOL_PREFIX
           " ompt_event_masked_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64
           ", codeptr_ra=%p\n",
           thread_id, parallel_data->value, task_data->value, codeptr_ra);
  }
}

// Callback for the start of a parallel or teams region. Warns if
// parallel_data arrives non-null, stores a fresh unique id in it, and prints
// the begin event with the parent task id and frames, the new parallel id,
// the requested size, the code pointer and the invoker (low four bits of
// flag). The ompt_parallel_team bit of flag selects the "parallel" wording;
// otherwise the "teams" wording is used.
static void on_ompt_callback_parallel_begin(
    ompt_data_t *encountering_task_data,
    const ompt_frame_t *encountering_task_frame, ompt_data_t *parallel_data,
    uint32_t requested_team_size, int flag, const void *codeptr_ra) {
  const char *region_kind;
  const char *size_label;

  if (parallel_data->ptr)
    printf("0: parallel_data initially not null\n");
  parallel_data->value = ompt_get_unique_id();

  if (flag & ompt_parallel_team) {
    region_kind = "parallel";
    size_label = "team_size";
  } else {
    region_kind = "teams";
    size_label = "num_teams";
  }

  printf("%" PRIu64 ":" _TOOL_PREFIX
         " ompt_event_%s_begin: parent_task_id=%" PRIu64
         ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, "
         "parallel_id=%" PRIu64 ", requested_%s=%" PRIu32
         ", codeptr_ra=%p, invoker=%d\n",
         ompt_get_thread_data()->value, region_kind,
         encountering_task_data->value,
         encountering_task_frame->exit_frame.ptr,
         encountering_task_frame->enter_frame.ptr, parallel_data->value,
         size_label, requested_team_size, codeptr_ra, flag & 0xF);
}

// Callback for the end of a parallel or teams region. Prints the end event
// with the parallel id, the encountering task id, the invoker (low four bits
// of flag) and the code pointer. The ompt_parallel_team bit of flag selects
// between the "parallel" and "teams" wording.
static void on_ompt_callback_parallel_end(ompt_data_t *parallel_data,
                                          ompt_data_t *encountering_task_data,
                                          int flag, const void *codeptr_ra) {
  const char *region_kind;
  if (flag & ompt_parallel_team)
    region_kind = "parallel";
  else
    region_kind = "teams";

  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_%s_end: parallel_id=%" PRIu64
         ", task_id=%" PRIu64 ", invoker=%d, codeptr_ra=%p\n",
         ompt_get_thread_data()->value, region_kind, parallel_data->value,
         encountering_task_data->value, flag & 0xF, codeptr_ra);
}

// Callback for task creation. Warns if new_task_data arrives non-null, stores
// a fresh unique id in it, and prints the create event. The parent task id
// and the parent frame pointers are optional: a missing parent task prints
// id 0 and a missing frame prints null pointers. The task type is printed
// both as the text produced by format_task_type and as its raw integer.
static void
on_ompt_callback_task_create(
    ompt_data_t *encountering_task_data,
    const ompt_frame_t *encountering_task_frame,
    ompt_data_t* new_task_data,
    int type,
    int has_dependences,
    const void *codeptr_ra)
{
  char type_text[2048];
  uint64_t parent_id = 0;
  void *parent_exit = NULL;
  void *parent_reenter = NULL;

  if (new_task_data->ptr)
    printf("0: new_task_data initially not null\n");
  new_task_data->value = ompt_get_unique_id();

  format_task_type(type, type_text);

  if (encountering_task_data)
    parent_id = encountering_task_data->value;
  if (encountering_task_frame) {
    parent_exit = encountering_task_frame->exit_frame.ptr;
    parent_reenter = encountering_task_frame->enter_frame.ptr;
  }

  printf(
      "%" PRIu64 ":" _TOOL_PREFIX
      " ompt_event_task_create: parent_task_id=%" PRIu64
      ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, "
      "new_task_id=%" PRIu64
      ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n",
      ompt_get_thread_data()->value, parent_id, parent_exit, parent_reenter,
      new_task_data->value, codeptr_ra, type_text, type,
      has_dependences ? "yes" : "no");
}

// Callback for task scheduling points. Prints the schedule event with both
// task ids and the prior task status (by name and by number). A missing
// second task is printed as the all-ones 64-bit id. When the prior status is
// complete, late_fulfill or taskwait_complete, an additional task_end line is
// printed for the first task.
static void
on_ompt_callback_task_schedule(
    ompt_data_t *first_task_data,
    ompt_task_status_t prior_task_status,
    ompt_data_t *second_task_data)
{
  uint64_t second_id = (uint64_t)-1;
  if (second_task_data)
    second_id = second_task_data->value;

  printf("%" PRIu64 ":" _TOOL_PREFIX
         " ompt_event_task_schedule: first_task_id=%" PRIu64
         ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n",
         ompt_get_thread_data()->value, first_task_data->value, second_id,
         ompt_task_status_t_values[prior_task_status], prior_task_status);

  switch (prior_task_status) {
  case ompt_task_complete:
  case ompt_task_late_fulfill:
  case ompt_taskwait_complete:
    printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_task_end: task_id=%" PRIu64
           "\n", ompt_get_thread_data()->value, first_task_data->value);
    break;
  default:
    // Any other status does not end the first task.
    break;
  }
}

// Callback for task dependences. Formats the dependence list as
// "(variable, type), (variable, type), ..." and prints it with the task id
// and ndeps. Source and sink dependences print the variable as an integer;
// all other types print it as a pointer.
//
// The list is built in a fixed-size buffer. Formatting stops once roughly
// 2000 characters have been produced, and an entry that would not fit in the
// remaining space is dropped entirely, so long lists are truncated rather
// than overrunning the buffer.
static void
on_ompt_callback_dependences(
  ompt_data_t *task_data,
  const ompt_dependence_t *deps,
  int ndeps)
{
  char buffer[2048];
  char *progress = buffer;
  int i;

  // Start with an empty string so the final printf is well-defined even when
  // no entry gets formatted (e.g. ndeps == 0).
  buffer[0] = '\0';

  for (i = 0; i < ndeps && progress < buffer + 2000; i++) {
    size_t remaining = sizeof(buffer) - (size_t)(progress - buffer);
    int written;
    if (deps[i].dependence_type == ompt_dependence_type_source ||
        deps[i].dependence_type == ompt_dependence_type_sink)
      written = snprintf(
          progress, remaining, "(%" PRIu64 ", %s), ", deps[i].variable.value,
          ompt_dependence_type_t_values[deps[i].dependence_type]);
    else
      written = snprintf(
          progress, remaining, "(%p, %s), ", deps[i].variable.ptr,
          ompt_dependence_type_t_values[deps[i].dependence_type]);
    if (written < 0 || (size_t)written >= remaining) {
      // Formatting failed or the entry was cut short: discard the partial
      // entry and stop.
      *progress = '\0';
      break;
    }
    progress += written;
  }

  // Strip the trailing ", " left behind by the last entry, if there is one.
  if (progress >= buffer + 2)
    progress[-2] = 0;

  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_dependences: task_id=%" PRIu64
         ", deps=[%s], ndeps=%d\n",
         ompt_get_thread_data()->value, task_data->value, buffer, ndeps);
}

// Callback for a dependence between two tasks. Prints the ids of both tasks
// as a dependence pair.
static void
on_ompt_callback_task_dependence(
  ompt_data_t *first_task_data,
  ompt_data_t *second_task_data)
{
  const uint64_t thread_id = ompt_get_thread_data()->value;
  const uint64_t first_id = first_task_data->value;
  const uint64_t second_id = second_task_data->value;

  printf("%" PRIu64 ":" _TOOL_PREFIX
         " ompt_event_task_dependence_pair: first_task_id=%" PRIu64
         ", second_task_id=%" PRIu64 "\n",
         thread_id, first_id, second_id);
}

// Callback for thread start. Warns if thread_data arrives non-null, stores a
// fresh unique id in it, and prints the thread type (by name and by number)
// together with the new thread id.
static void
on_ompt_callback_thread_begin(
  ompt_thread_t thread_type,
  ompt_data_t *thread_data)
{
  const char *type_name;

  if (thread_data->ptr) {
    printf("%s\n", "0: thread_data initially not null");
  }
  thread_data->value = ompt_get_unique_id();

  type_name = ompt_thread_t_values[thread_type];
  printf("%" PRIu64 ":" _TOOL_PREFIX
         " ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n",
         ompt_get_thread_data()->value, type_name, thread_type,
         thread_data->value);
}

// Callback for thread end. Prints the id stored in thread_data, prefixed by
// the id reported by ompt_get_thread_data().
static void
on_ompt_callback_thread_end(
  ompt_data_t *thread_data)
{
  const uint64_t reporting_id = ompt_get_thread_data()->value;
  const uint64_t ending_id = thread_data->value;

  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_thread_end: thread_id=%" PRIu64
         "\n",
         reporting_id, ending_id);
}

static int
on_ompt_callback_control_tool(
  uint64_t command,
  uint64_t modifier,
  void *arg,
  const void *codeptr_ra)
{
  ompt_frame_t* omptTaskFrame;
  ompt_get_task_info(0, NULL, (ompt_data_t**) NULL, &omptTaskFrame, NULL, NULL);
  printf("%" PRIu64 ":" _TOOL_PREFIX " ompt_event_control_tool: command=%" PRIu64
         ", modifier=%" PRIu64
         ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, "
         "current_task_frame.reenter=%p \n",
         ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra,
         omptTaskFrame->exit_frame.ptr, omptTaskFrame->enter_frame.ptr);

  // the following would interfere with expected output for OMPT tests, so skip
#ifndef _OMPT_TESTS
  // print task data
  int task_level = 0;
  ompt_data_t *task_data;
  while (ompt_get_task_info(task_level, NULL, (ompt_data_t **)&task_data, NULL,
                            NULL, NULL)) {
    printf("%" PRIu64 ":" _TOOL_PREFIX " task level %d: task_id=%" PRIu64 "\n",
           ompt_get_thread_data()->value, task_level, task_data->value);
    task_level++;
  }

  // print parallel data
  int parallel_level = 0;
  ompt_data_t *parallel_data;
  while (ompt_get_parallel_info(parallel_level, (ompt_data_t **)&parallel_data,
                                NULL)) {
    printf("%" PRIu64 ":" _TOOL_PREFIX " parallel level %d: parallel_id=%" PRIu64
           "\n",
           ompt_get_thread_data()->value, parallel_level, parallel_data->value);
    parallel_level++;
  }
#endif
  return 0; //success
}

// Callback for runtime errors. Prints the severity, the message text, its
// length and the code pointer. Unlike the other callbacks in this file, the
// output line does not include _TOOL_PREFIX.
static void on_ompt_callback_error(ompt_severity_t severity,
                                   const char *message, size_t length,
                                   const void *codeptr_ra) {
  const uint64_t thread_id = ompt_get_thread_data()->value;
  const uint64_t message_length = (uint64_t)length;

  printf("%" PRIu64 ": ompt_event_runtime_error: severity=%" PRIu32
         ", message=%s, length=%" PRIu64 ", codeptr_ra=%p\n",
         thread_id, severity, message, message_length, codeptr_ra);
}

// Tool initializer; its address is handed to the runtime by ompt_start_tool.
//
// Resolves the OMPT entry points used in this header through `lookup`,
// registers the on_ompt_callback_* handlers defined above, and prints a
// NULL_POINTER line.
//
// lookup:             runtime-provided function that maps an entry-point
//                     name to its address.
// initial_device_num: unused here.
// tool_data:          unused here.
//
// Returns 1.
int ompt_initialize(
  ompt_function_lookup_t lookup,
  int initial_device_num,
  ompt_data_t *tool_data)
{
  // Entry points for querying runtime state and registering callbacks.
  ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
  ompt_get_callback = (ompt_get_callback_t) lookup("ompt_get_callback");
  ompt_get_state = (ompt_get_state_t) lookup("ompt_get_state");
  ompt_get_task_info = (ompt_get_task_info_t) lookup("ompt_get_task_info");
  ompt_get_task_memory = (ompt_get_task_memory_t)lookup("ompt_get_task_memory");
  ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data");
  ompt_get_parallel_info = (ompt_get_parallel_info_t) lookup("ompt_get_parallel_info");
  ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id");
  ompt_finalize_tool = (ompt_finalize_tool_t)lookup("ompt_finalize_tool");

  // The first unique id is requested and discarded.
  // NOTE(review): presumably this keeps the first id value from being handed
  // to a thread/task/parallel region - confirm.
  ompt_get_unique_id();

  // Entry points for processor/place queries and for enumerating states and
  // mutex implementations.
  ompt_get_num_procs = (ompt_get_num_procs_t) lookup("ompt_get_num_procs");
  ompt_get_num_places = (ompt_get_num_places_t) lookup("ompt_get_num_places");
  ompt_get_place_proc_ids = (ompt_get_place_proc_ids_t) lookup("ompt_get_place_proc_ids");
  ompt_get_place_num = (ompt_get_place_num_t) lookup("ompt_get_place_num");
  ompt_get_partition_place_nums = (ompt_get_partition_place_nums_t) lookup("ompt_get_partition_place_nums");
  ompt_get_proc_id = (ompt_get_proc_id_t) lookup("ompt_get_proc_id");
  ompt_enumerate_states = (ompt_enumerate_states_t) lookup("ompt_enumerate_states");
  ompt_enumerate_mutex_impls = (ompt_enumerate_mutex_impls_t) lookup("ompt_enumerate_mutex_impls");

  // Register the handlers. The register_ompt_callback_t form names the
  // callback type explicitly; it is used where that type name does not match
  // the callback name (e.g. mutex_acquired uses ompt_callback_mutex_t).
  register_ompt_callback(ompt_callback_mutex_acquire);
  register_ompt_callback_t(ompt_callback_mutex_acquired, ompt_callback_mutex_t);
  register_ompt_callback_t(ompt_callback_mutex_released, ompt_callback_mutex_t);
  register_ompt_callback(ompt_callback_nest_lock);
  register_ompt_callback(ompt_callback_sync_region);
  register_ompt_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
  register_ompt_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
  register_ompt_callback(ompt_callback_control_tool);
  register_ompt_callback(ompt_callback_flush);
  register_ompt_callback(ompt_callback_cancel);
  register_ompt_callback(ompt_callback_implicit_task);
  register_ompt_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t);
  register_ompt_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t);
  register_ompt_callback(ompt_callback_work);
  register_ompt_callback(ompt_callback_dispatch);
  register_ompt_callback(ompt_callback_masked);
  register_ompt_callback(ompt_callback_parallel_begin);
  register_ompt_callback(ompt_callback_parallel_end);
  register_ompt_callback(ompt_callback_task_create);
  register_ompt_callback(ompt_callback_task_schedule);
  register_ompt_callback(ompt_callback_dependences);
  register_ompt_callback(ompt_callback_task_dependence);
  register_ompt_callback(ompt_callback_thread_begin);
  register_ompt_callback(ompt_callback_thread_end);
  register_ompt_callback(ompt_callback_error);
  // Print how this platform's printf renders a null pointer with %p.
  // NOTE(review): presumably consumers of the output use this line to
  // recognise null pointers in later events - confirm.
  printf("0: NULL_POINTER=%p\n", (void*)NULL);
  return 1; //success
}

// Tool finalizer; its address is handed to the runtime by ompt_start_tool.
// Prints the runtime-shutdown event line. tool_data is not used.
void ompt_finalize(ompt_data_t *tool_data)
{
  (void)tool_data;
  fputs("0: ompt_event_runtime_shutdown\n", stdout);
}

#ifdef __cplusplus
extern "C" {
#endif
// Tool entry point with C linkage. Returns a pointer to a statically
// allocated result structure that names ompt_initialize and ompt_finalize as
// the tool's initializer and finalizer, with zeroed tool data. Both
// arguments are ignored.
ompt_start_tool_result_t* ompt_start_tool(
  unsigned int omp_version,
  const char *runtime_version)
{
  static ompt_start_tool_result_t tool_result = {
      ompt_initialize, ompt_finalize, 0};

  (void)omp_version;
  (void)runtime_version;
  return &tool_result;
}
#ifdef __cplusplus
}
#endif
#endif // ifndef USE_PRIVATE_TOOL
#ifdef _OMPT_TESTS
#undef _OMPT_TESTS
#endif