llvm/openmp/runtime/src/kmp_affinity.cpp

/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif
#include <ctype.h>

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {}

#if KMP_AFFINITY_SUPPORTED
// Helper class to see if place lists further restrict the fullMask
class kmp_full_mask_modifier_t {};

static inline const char *
__kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
                           bool for_binding = false) {}
#endif // KMP_AFFINITY_SUPPORTED

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {}

const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {}

#if KMP_AFFINITY_SUPPORTED
// If affinity is supported, check the affinity
// verbose and warning flags before printing a warning
#define KMP_AFF_WARNING(s, ...)
#else
#define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
#endif

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {}
#endif

void kmp_hw_thread_t::print() const {}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Add a layer to the topology based on the ids. Assumes the topology
// is perfectly nested (i.e., no object has more than one parent).
void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) {}

#if KMP_GROUP_AFFINITY
// Insert the Windows Processor Group structure into the topology
void kmp_topology_t::_insert_windows_proc_groups() {
  // Do not insert the processor group structure for a single group
  if (__kmp_num_proc_groups == 1)
    return;
  kmp_affin_mask_t *mask;
  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
  KMP_CPU_ALLOC(mask);
  for (int i = 0; i < num_hw_threads; ++i) {
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(hw_threads[i].os_id, mask);
    ids[i] = __kmp_get_proc_group(mask);
  }
  KMP_CPU_FREE(mask);
  insert_layer(KMP_HW_PROC_GROUP, ids);
  __kmp_free(ids);

  // sort topology after adding proc groups
  __kmp_topology->sort_ids();
}
#endif
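
// Illustrative sketch (not part of the runtime): the shape of the per-thread
// id array that insert_layer() consumes above. With one-CPU masks, a thread's
// group id can also be pictured as os_id / 64 on Win64, since a Windows
// processor group holds at most 64 logical processors. The names and the
// std::vector stand-ins are hypothetical.
#if 0
#include <vector>

std::vector<int> group_ids_for(const std::vector<int> &os_ids) {
  const int kBitsPerGroup = 64; // CHAR_BIT * sizeof(DWORD_PTR) on Win64
  std::vector<int> ids;
  ids.reserve(os_ids.size());
  for (int os_id : os_ids)
    ids.push_back(os_id / kBitsPerGroup); // stand-in for __kmp_get_proc_group()
  return ids;
}
#endif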

// Remove layers that don't add information to the topology.
// A removed layer takes on the id UNKNOWN_ID (-1).
void kmp_topology_t::_remove_radix1_layers() {}

void kmp_topology_t::_set_last_level_cache() {}

// Gather the count of each topology layer and the ratio of units between
// consecutive layers
void kmp_topology_t::_gather_enumeration_information() {}

int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
                                          int above_level,
                                          bool find_all) const {}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {}

// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {}

void kmp_topology_t::_set_globals() {}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {}

bool kmp_topology_t::check_ids() const {}

void kmp_topology_t::dump() const {}

void kmp_topology_t::print(const char *env_var) const {}

#if KMP_AFFINITY_SUPPORTED
void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {}
#endif

void kmp_topology_t::canonicalize() {}

// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {}

#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
                                 bool plural) {}

bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {}

// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {}

bool kmp_topology_t::is_close(int hwt1, int hwt2,
                              const kmp_affinity_t &stgs) const {}

////////////////////////////////////////////////////////////////////////////////

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) {}
void *KMPAffinity::Mask::operator new[](size_t n) {}
void KMPAffinity::Mask::operator delete(void *p) {}
void KMPAffinity::Mask::operator delete[](void *p) {}
void *KMPAffinity::operator new(size_t n) {}
void KMPAffinity::operator delete(void *p) {}

void KMPAffinity::pick_api() {}

void KMPAffinity::destroy_api() {}

#define KMP_ADVANCE_SCAN(scan)

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {}
#undef KMP_ADVANCE_SCAN
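
// Illustrative sketch (not part of the runtime): the range-list rendering the
// comment above describes, using std::vector<bool> in place of
// kmp_affin_mask_t. The helper name is hypothetical.
#if 0
#include <sstream>
#include <string>
#include <vector>

static std::string format_mask(const std::vector<bool> &mask) {
  std::ostringstream out;
  bool first = true;
  for (size_t i = 0; i < mask.size(); ++i) {
    if (!mask[i])
      continue;
    size_t start = i;
    while (i + 1 < mask.size() && mask[i + 1]) // extend a run of set bits
      ++i;
    out << (first ? "" : ",") << start;
    if (i > start)
      out << "-" << i; // print the run as start-end
    first = false;
  }
  return first ? "{<empty>}" : out.str();
}
#endif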

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {}

// Return (possibly empty) affinity mask representing the offline CPUs
// Caller must free the mask
kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {}

// Return the number of available procs
int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it.  Each routine
// returns a boolean indicating whether it succeeded.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
// The original mask is a subset of the full mask in a multiple processor
// group topology
kmp_affin_mask_t *__kmp_affin_origMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
#endif
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the
// topology tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and
// type is HWLOC_OBJ_PU, then this will return the number of PUs under the
// package object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // Store sub_id + 1 so that 0 is distinguishable from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}
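
// Illustrative note: the +1 bias makes the cache self-advancing -- a sibling
// that cached sub_id k stores k + 1 in userdata, which is exactly the sub_id
// of the object one position to its right, so the loop above can use the
// stored value directly instead of re-walking all earlier siblings.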

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack: try to infer the machine topology using only the data
    // available from hwloc on the current thread, plus __kmp_xproc.
    KMP_ASSERT(__kmp_affinity.type == affinity_none);
    // hwloc only guarantees existence of the PU object, so check PACKAGE
    // and CORE explicitly
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    if (__kmp_nThreadsPerCore == 0)
      __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

#if HWLOC_API_VERSION >= 0x00020400
  // Handle multiple types of cores if they exist on the system
  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);

  typedef struct kmp_hwloc_cpukinds_info_t {
    int efficiency;
    kmp_hw_core_type_t core_type;
    hwloc_bitmap_t mask;
  } kmp_hwloc_cpukinds_info_t;
  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;

  if (nr_cpu_kinds > 0) {
    unsigned nr_infos;
    struct hwloc_info_s *infos;
    cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
        sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
    for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
      cpukinds[idx].efficiency = -1;
      cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      cpukinds[idx].mask = hwloc_bitmap_alloc();
      if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
                                  &cpukinds[idx].efficiency, &nr_infos, &infos,
                                  0) == 0) {
        for (unsigned i = 0; i < nr_infos; ++i) {
          if (__kmp_str_match("CoreType", 8, infos[i].name)) {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
            if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
              cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
              break;
            } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
              cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
              break;
            }
#endif
          }
        }
      }
    }
  }
#endif

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  while (obj && obj != root) {
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
    obj = obj->parent;
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      hw_thread.original_idx = hw_thread_index;
      // If multiple core types, then set that attribute for the hardware thread
#if HWLOC_API_VERSION >= 0x00020400
      if (cpukinds) {
        int cpukind_index = -1;
        for (int i = 0; i < nr_cpu_kinds; ++i) {
          if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
            cpukind_index = i;
            break;
          }
        }
        if (cpukind_index >= 0) {
          hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
          hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
        }
      }
#endif
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore.  They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }

#if HWLOC_API_VERSION >= 0x00020400
  // Free the core types information
  if (cpukinds) {
    for (int idx = 0; idx < nr_cpu_kinds; ++idx)
      hwloc_bitmap_free(cpukinds[idx].mask);
    __kmp_free(cpukinds);
  }
#endif
  __kmp_topology->sort_ids();
  return true;
}
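
// Illustrative sketch (not part of the runtime): the bare hwloc walk the
// routine above elaborates on -- load a topology, then visit every PU and
// its ancestor chain. Error handling is omitted.
#if 0
#include <hwloc.h>
#include <stdio.h>

int main() {
  hwloc_topology_t tp;
  hwloc_topology_init(&tp);
  hwloc_topology_load(tp);
  hwloc_obj_t pu = NULL;
  // Visit each processing unit (PU) in logical order
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
    printf("PU os_index=%u:", pu->os_index);
    // Walk up the ancestor chain, as the map routine does per hw thread
    for (hwloc_obj_t o = pu->parent; o; o = o->parent)
      printf(" %s#%u", hwloc_obj_type_string(o->type), o->logical_index);
    printf("\n");
  }
  hwloc_topology_destroy(tp);
  return 0;
}
#endif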
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of OS thread ids <-> processor ids.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {}

#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity.flags.verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity.type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = __kmp_ncores / nPackages;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.original_idx = avail_ct;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
    avail_ct++;
  }
  return true;
}
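
// Worked example for the loop above: on Win64 (BITS_PER_GROUP == 64), os proc
// 70 gets ids[0] = 70 / 64 = 1 (its processor group) and
// ids[1] = ids[2] = 70 % 64 = 6 (its slot within that group).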
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {}

static int __kmp_cpuid_mask_width(int count) {}

class apicThreadInfo {};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {}

class cpuid_cache_info_t {};

// On IA-32 architecture and Intel(R) 64 architecture, we cycle through the
// available os threads, binding the current thread to each one in turn, and
// retrieve the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {}

// Hybrid cpu detection using CPUID.1A
// Thread should be pinned to processor already
static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
                                  unsigned *native_model_id) {}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on CPUID.B or CPUID.1F
/*
 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
    Bits            Bits            Bits           Bits
    31-16           15-8            7-5            4-0
---+-----------+--------------+-------------+-----------------+
EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
---+-----------|--------------+-------------+-----------------|
EBX| reserved  | Num logical processors at level (16 bits)    |
---+-----------|--------------+-------------------------------|
ECX| reserved  |   Level Type |      Level Number (8 bits)    |
---+-----------+--------------+-------------------------------|
EDX|                    X2APIC ID (32 bits)                   |
---+----------------------------------------------------------+
*/
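
// Illustrative sketch (not part of the runtime): reading the fields
// diagrammed above with the GCC/Clang <cpuid.h> intrinsic. Sub-leaves of
// leaf 0xB are walked until the level type in ECX[15:8] reads 0 (invalid).
#if 0
#include <cpuid.h>
#include <stdio.h>

int main() {
  unsigned eax, ebx, ecx, edx;
  for (unsigned subleaf = 0;; ++subleaf) {
    __cpuid_count(0x0B, subleaf, eax, ebx, ecx, edx);
    unsigned level_type = (ecx >> 8) & 0xFF; // 1 = SMT, 2 = core
    if (level_type == 0)
      break; // invalid level: enumeration is done
    printf("level %u: type=%u shift=%u nlogical=%u x2apic=0x%x\n", subleaf,
           level_type, eax & 0x1F, ebx & 0xFFFF, edx);
  }
  return 0;
}
#endif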

enum {};
KMP_BUILD_ASSERT();
#define KMP_LEAF_1F_KNOWN_LEVELS

static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {}

static int __kmp_topology_type_2_intel_type(kmp_hw_t type) {}

struct cpuid_level_info_t {};

class cpuid_topo_desc_t {};

struct cpuid_proc_info_t {};

// This function takes the topology leaf, an info pointer to store the levels
// detected, and writable descriptors for the total topology.
// Returns whether total types, depth, or description were modified.
static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info,
                                      kmp_hw_t total_types[KMP_HW_LAST],
                                      int *total_depth,
                                      cpuid_topo_desc_t *total_description) {}

static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#define osIdIndex 0
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
#define nodeIdIndex 4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;

static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
                                                  const void *b) {}

#if KMP_USE_HIER_SCHED
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
  // Set the maximum number of L1's to number of cores
  // Set the maximum number of L2's to either number of cores / 2 for
  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
  // or the number of cores for Intel(R) Xeon(R) processors
  // Set the maximum number of NUMA nodes and L3's to number of packages
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
#if KMP_ARCH_X86_64 &&                                                         \
    (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
     KMP_OS_WINDOWS) &&                                                        \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
    __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 &&                                                         \
    (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
     KMP_OS_WINDOWS) &&                                                        \
    KMP_MIC_SUPPORTED
  if (__kmp_mic_type >= mic3)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || ...) && KMP_MIC_SUPPORTED
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}

// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
// i.e., this thread's L1 or this thread's L2, etc.
int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
  int index = type + 1;
  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
  if (type == kmp_hier_layer_e::LAYER_THREAD)
    return tid;
  else if (type == kmp_hier_layer_e::LAYER_LOOP)
    return 0;
  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
  if (tid >= num_hw_threads)
    tid = tid % num_hw_threads;
  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
}
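
// Worked example: with 2 hw threads per core and 8 cores per package,
// __kmp_dispatch_get_index(5, LAYER_L1) == (5 / 2) % __kmp_ncores == 2,
// i.e., tid 5 shares an L1 (core) with tid 4.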

// Return the number of t1's per t2
int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
  int i1 = t1 + 1;
  int i2 = t2 + 1;
  KMP_DEBUG_ASSERT(i1 <= i2);
  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
  // (nthreads/t2) / (nthreads/t1) = t1 / t2
  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
}
#endif // KMP_USE_HIER_SCHED

static inline const char *__kmp_cpuinfo_get_filename() {}

static inline const char *__kmp_cpuinfo_get_envvar() {}

// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map. On AIX, the map is obtained through system SRAD (Scheduler
// Resource Allocation Domain).
static bool __kmp_affinity_create_cpuinfo_map(int *line,
                                              kmp_i18n_id_t *const msg_id) {}

// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
template <typename FindNextFunctionType>
static void __kmp_create_os_id_masks(unsigned *numUnique,
                                     kmp_affinity_t &affinity,
                                     FindNextFunctionType find_next) {}
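
// Illustrative sketch (not part of the runtime): the OR'ing step described
// above for granularity=core -- every os proc that reports the same core id
// lands in one shared mask. A uint64_t stands in for kmp_affin_mask_t (so
// the sketch is limited to 64 procs); names are hypothetical.
#if 0
#include <cstdint>
#include <map>
#include <vector>

// core_id_of_os[i] is the core id of os proc i; result maps core id -> mask.
std::map<int, uint64_t> masks_by_core(const std::vector<int> &core_id_of_os) {
  std::map<int, uint64_t> masks;
  for (size_t os = 0; os < core_id_of_os.size(); ++os)
    masks[core_id_of_os[os]] |= uint64_t(1) << os; // OR proc into core's mask
  return masks;
}
#endif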

// File-static state for the affinity proclist parsers.  It's easier to
// declare these vars file-static than to pass them through the calling
// sequence of the recursive-descent OMP_PLACES parser.
static kmp_affin_mask_t *newMasks;
static int numNewMasks;
static int nextNewMask;

#define ADD_MASK

#define ADD_MASK_OSID

// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {}

/*-----------------------------------------------------------------------------
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places.  Again, here is the grammar:

place_list := place
place_list := place , place_list
place := num
place := place : num
place := place : num : signed
place := { subplacelist }
place := ! place                  // (lowest priority)
subplace_list := subplace
subplace_list := subplace , subplace_list
subplace := num
subplace := num : num
subplace := num : num : signed
signed := num
signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affinity_t &affinity, int maxOsId,
                                        kmp_affin_mask_t *tempMask,
                                        int *setSize) {}

static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
                                int maxOsId, kmp_affin_mask_t *tempMask,
                                int *setSize) {}

void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {}
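
// Illustrative sketch (not part of the runtime): a recursive-descent parser
// for a small subset of the grammar above (bare nums, {subplace_list}, and
// the num : num subplace form). The real parsers also handle strides,
// negation, and error reporting.
#if 0
#include <cctype>
#include <cstdio>
#include <vector>

static int parse_num(const char **s) {
  int n = 0;
  while (isdigit((unsigned char)**s))
    n = n * 10 + (*(*s)++ - '0');
  return n;
}

// subplace := num | num : num   (starting os proc, optional length)
static void parse_subplace(const char **s, std::vector<int> *place) {
  int start = parse_num(s), len = 1;
  if (**s == ':') {
    ++*s;
    len = parse_num(s);
  }
  for (int i = 0; i < len; ++i)
    place->push_back(start + i);
}

// place := num | { subplace_list }
static std::vector<int> parse_place(const char **s) {
  std::vector<int> place;
  if (**s == '{') {
    ++*s;
    parse_subplace(s, &place);
    while (**s == ',') {
      ++*s;
      parse_subplace(s, &place);
    }
    if (**s == '}')
      ++*s;
  } else {
    place.push_back(parse_num(s));
  }
  return place;
}

int main() {
  const char *scan = "{0:4},{4:4},8"; // three places
  std::vector<std::vector<int>> places;
  places.push_back(parse_place(&scan));
  while (*scan == ',') {
    ++scan;
    places.push_back(parse_place(&scan));
  }
  for (const std::vector<int> &p : places) {
    printf("place:");
    for (int os : p)
      printf(" %d", os);
    printf("\n");
  }
  return 0;
}
#endif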

#undef ADD_MASK
#undef ADD_MASK_OSID

// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {}

// This function counts the number of clusters/cores at a given level.
static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
                                         int core_level) {}

// This function finds the cluster/core to which a given processing unit is
// bound.
static int __kmp_affinity_find_core(int proc, int bottom_level,
                                    int core_level) {}

// This function finds the maximal number of processing units bound to a
// cluster/core at a given level.
static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
                                            int core_level) {}

static int *procarr = NULL;
static int __kmp_aff_depth = 0;
static int *__kmp_osid_to_hwthread_map = NULL;

static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
                                                  kmp_affinity_ids_t &ids,
                                                  kmp_affinity_attrs_t &attrs) {}

static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {}

// Assign the topology information to each place in the place list.
// A thread can then grab not only its affinity mask, but also the topology
// information associated with that mask, e.g., which socket it is on.
static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {}

// Called when __kmp_topology is ready
static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {}

// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {}

static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {}

static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {}

static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {}

void __kmp_affinity_initialize(kmp_affinity_t &affinity) {}

void __kmp_affinity_uninitialize(void) {}

static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
                                      int *place, kmp_affin_mask_t **mask) {}

// This function initializes the per-thread data concerning affinity including
// the mask and topology information
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {}

void __kmp_affinity_bind_init_mask(int gtid) {}

void __kmp_affinity_bind_place(int gtid) {}

int __kmp_aux_set_affinity(void **mask) {}

int __kmp_aux_get_affinity(void **mask) {}

int __kmp_aux_get_affinity_max_proc() {}

int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {}

int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {}

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Returns first os proc id with ATOM core
int __kmp_get_first_osid_with_ecore(void) {}
#endif

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {}

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
    KMP_OS_AIX
// We don't need this entry for Windows because
// the GetProcessAffinityMask() API exists there.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind the thread
//   >0 (errno) if an error happened during binding
{}
#endif
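
// Illustrative sketch (not part of the runtime): the save/bind/restore
// sequence the steps above describe, on Linux (compile with -D_GNU_SOURCE).
// Assumes the program links against this runtime so
// kmp_set_thread_affinity_mask_initial() resolves.
#if 0
#include <sched.h>

extern "C" int kmp_set_thread_affinity_mask_initial();

void run_non_openmp_parallel_part() {
  cpu_set_t saved;
  CPU_ZERO(&saved);
  sched_getaffinity(0, sizeof(saved), &saved); // 1) save the current mask
  int rc = kmp_set_thread_affinity_mask_initial(); // 2) bind to initial mask
  if (rc == 0) { // 3) error check
    // 4) ... run the non-OpenMP parallelization here ...
  }
  sched_setaffinity(0, sizeof(saved), &saved); // 5) restore the saved mask
}
#endif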

#endif // KMP_AFFINITY_SUPPORTED