"Fossies" - the Fresh Open Source Software Archive

Member "mvapich2-2.3.2/src/mpid/ch3/channels/common/src/affinity/hwloc_bind.c" (8 Aug 2019, 106330 Bytes) of package /linux/misc/mvapich2-2.3.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "hwloc_bind.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.3.1_vs_2.3.2.

    1 /* Copyright (c) 2001-2019, The Ohio State University. All rights
    2  * reserved.
    3  *
    4  * This file is part of the MVAPICH2 software package developed by the
    5  * team members of The Ohio State University's Network-Based Computing
    6  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
    7  *
    8  * For detailed copyright and licensing information, please refer to the
    9  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
   10  *
   11  */
   12 #include "mpichconf.h"
   13 #include "mpidimpl.h"
   14 #include "mpidi_ch3_impl.h"
   15 #include <mpimem.h>
   16 #include <limits.h>
   17 #include <stdlib.h>
   18 #include <stdio.h>
   19 #include <ctype.h>
   20 #include <sys/types.h>
   21 #include <sys/stat.h>
   22 #include <fcntl.h>
   23 #include <unistd.h>
   24 #include <netdb.h>
   25 #include <sys/mman.h>
   26 #include <errno.h>
   27 #include <string.h>
   28 #include <assert.h>
   29 #include "upmi.h"
   30 #include "mpiutil.h"
   31 #include "hwloc_bind.h"
   32 #if defined(HAVE_LIBIBVERBS)
   33 #include <hwloc/openfabrics-verbs.h>
   34 #endif
   35 #if defined(CHANNEL_MRAIL)
   36 #include "smp_smpi.h"
   37 #include "rdma_impl.h"
   38 #endif /*defined(CHANNEL_MRAIL)*/
   39 #include "mv2_arch_hca_detect.h"
   40 #include "debug_utils.h"
   41 
   42 /* CPU Mapping related definitions */
   43 
   44 #define CONFIG_FILE "/proc/cpuinfo"
   45 #define MAX_LINE_LENGTH 512
   46 #define MAX_NAME_LENGTH 64
   47 #define HOSTNAME_LENGTH 255
   48 #define FILENAME_LENGTH 512
   49 
   50 /* Hybrid mapping related definitions */
   51 #define HYBRID_LINEAR  0
   52 #define HYBRID_COMPACT 1
   53 #define HYBRID_SPREAD  2
   54 #define HYBRID_BUNCH   3
   55 #define HYBRID_SCATTER 4
   56 #define HYBRID_NUMA    5
   57 
   58 const char *mv2_cpu_policy_names[] = {"Bunch", "Scatter", "Hybrid"};
   59 const char *mv2_hybrid_policy_names[] = {"Linear", "Compact", "Spread", "Bunch", "Scatter", "NUMA"};
   60 
   61 int mv2_hybrid_binding_policy = HYBRID_LINEAR; /* default as linear */
   62 int mv2_pivot_core_id = 0;     /* specify pivot core to start binding MPI ranks */
   63 int mv2_threads_per_proc = 1;  /* there is at least one thread, which is the MPI rank itself */
   64 int num_sockets = 1; /* default */
   65 int num_physical_cores = 0;
   66 int num_pu = 0;
   67 int hw_threads_per_core = 0;
   68 int *mv2_core_map; /* list of core ids obtained after scanning the hwloc tree */
   69 int *mv2_core_map_per_numa; /* list of core ids based on NUMA nodes */
   70 
   71 int mv2_my_cpu_id = -1;
   72 int mv2_my_sock_id = -1;
   73 int mv2_my_async_cpu_id = -1;
   74 int *local_core_ids = NULL;
   75 int mv2_user_defined_mapping = FALSE;
   76 
   77 #ifdef ENABLE_LLNL_SITE_SPECIFIC_OPTIONS
   78 unsigned int mv2_enable_affinity = 0;
   79 #else
   80 unsigned int mv2_enable_affinity = 1;
   81 #endif /*ENABLE_LLNL_SITE_SPECIFIC_OPTIONS*/
   82 unsigned int mv2_enable_leastload = 0;
   83 unsigned int mv2_hca_aware_process_mapping = 1;
   84 
   85 typedef enum {
   86     CPU_FAMILY_NONE = 0,
   87     CPU_FAMILY_INTEL,
   88     CPU_FAMILY_AMD,
   89 } cpu_type_t;
   90 
   91 int CLOVERTOWN_MODEL = 15;
   92 int HARPERTOWN_MODEL = 23;
   93 int NEHALEM_MODEL = 26;
   94 
   95 int ip = 0;
   96 unsigned long *core_mapping = NULL;
   97 int *obj_tree = NULL;
   98 
   99 policy_type_t mv2_binding_policy;
  100 level_type_t mv2_binding_level;
  101 hwloc_topology_t topology = NULL;
  102 
  103 static int INTEL_XEON_DUAL_MAPPING[] = { 0, 1, 0, 1 };
  104 
  105 /* ((0,1),(4,5))((2,3),(6,7)) */
  106 static int INTEL_CLOVERTOWN_MAPPING[] = { 0, 0, 1, 1, 0, 0, 1, 1 };
  107 
  108 /* legacy ((0,2),(4,6))((1,3),(5,7)) */
  109 static int INTEL_HARPERTOWN_LEG_MAPPING[] = { 0, 1, 0, 1, 0, 1, 0, 1 };
  110 
  111 /* common ((0,1),(2,3))((4,5),(6,7)) */
  112 static int INTEL_HARPERTOWN_COM_MAPPING[] = { 0, 0, 0, 0, 1, 1, 1, 1 };
  113 
  114 /* legacy (0,2,4,6)(1,3,5,7) with hyperthreading */
  115 static int INTEL_NEHALEM_LEG_MAPPING[] =
  116     { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
  117 
  118 /* common (0,1,2,3)(4,5,6,7) with hyperthreading */
  119 static int INTEL_NEHALEM_COM_MAPPING[] =
  120     { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 };
  121 
  122 static int AMD_OPTERON_DUAL_MAPPING[] = { 0, 0, 1, 1 };
  123 static int AMD_BARCELONA_MAPPING[] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 };
  124 
  125 extern int use_hwloc_cpu_binding;
  126 
  127 char *s_cpu_mapping = NULL;
  128 static char *custom_cpu_mapping = NULL;
  129 int s_cpu_mapping_line_max = _POSIX2_LINE_MAX;
  130 static int custom_cpu_mapping_line_max = _POSIX2_LINE_MAX;
  131 char *cpu_mapping = NULL;
  132 char *xmlpath = NULL;
  133 
  134 int ib_socket_bind = 0;
  135 
  136 #if defined(CHANNEL_MRAIL)
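      /* Return the OS index of the socket (package) to which the given
       * InfiniBand device is attached, or 0 if it cannot be determined. */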
  137 int get_ib_socket(struct ibv_device * ibdev)
  138 {
  139     hwloc_cpuset_t set = NULL;
  140     hwloc_obj_t osdev = NULL;
  141     char string[256];
  142     int retval = 0;
  143 
  144     if (!(set = hwloc_bitmap_alloc())) {
  145         goto fn_exit;
  146     }
  147 
  148     if (hwloc_ibv_get_device_cpuset(topology, ibdev, set)) {
  149         goto fn_exit;
  150     }
  151 
  152     osdev = hwloc_get_obj_inside_cpuset_by_type(topology, set,
  153             HWLOC_OBJ_SOCKET, 0);
  154 
  155     if (NULL == osdev) {
  156         goto fn_exit;
  157     }
  158 
  159     /*
  160      * "osdev" is the socket object containing the device's cpuset; its
  161      * OS index identifies which socket the HCA is attached to.
  162      */
  163     hwloc_obj_type_snprintf(string, sizeof(string), osdev, 1);
  164     retval = osdev->os_index;
  165 
  166 fn_exit:
  167     if (set) {
  168         hwloc_bitmap_free(set);
  169     }
  170     return retval;
  171 }
  172 #endif /* defined(CHANNEL_MRAIL) */
  173 
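      /* Parse the leading integer at *str and advance *str past its digits,
       * e.g. "12,14" yields 12 and leaves *str pointing at ",14". */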
  174 static int first_num_from_str(char **str)
  175 {
  176     int val = atoi(*str);
  177     while (isdigit(**str)) {
  178         (*str)++;
  179     }
  180     return val;
  181 }
  182 
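      /* Approximate floating-point equality: returns 1 when a and b differ by
       * less than 1e-5, used to compare accumulated load values. */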
  183 static inline int compare_float(const float a, const float b)
  184 {
  185     const float precision = 0.00001;
  186     if ((a - precision) < b && (a + precision) > b) {
  187         return 1;
  188     } else {
  189         return 0;
  190     }
  191 }
  192 
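      /* scandir(3) filter: accept only directory names made of digits,
       * i.e. the /proc/<pid> entries of running processes. */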
  193 static int pid_filter(const struct dirent *dir_obj)
  194 {
  195     int i;
  196     int length = strlen(dir_obj->d_name);
  197 
  198     for (i = 0; i < length; i++) {
  199         if (!isdigit(dir_obj->d_name[i])) {
  200             return 0;
  201         }
  202     }
  203     return 1;
  204 }
  205 
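      /* Walk up from obj and return (through *parent) its nearest ancestor of
       * the requested type; only CORE, SOCKET and NODE types are handled. */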
  206 static void find_parent(hwloc_obj_t obj, hwloc_obj_type_t type, hwloc_obj_t * parent)
  207 {
  208     if ((type == HWLOC_OBJ_CORE) || (type == HWLOC_OBJ_SOCKET)
  209         || (type == HWLOC_OBJ_NODE)) {
  210         if (obj->parent->type == type) {
  211             *parent = obj->parent;
  212             return;
  213         } else {
  214             find_parent(obj->parent, type, parent);
  215         }
  216     } else {
  217         return;
  218     }
  219 }
  220 
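      /*
       * The find_leastload_* helpers below share one flat "load tree": the entry
       * for an object of a given type is tree[depth(type) * nbobjs(type) +
       * logical_index], holding the hwloc object, the cpuset of processes charged
       * to it, and its accumulated load.  find_leastload_node() returns the
       * least-loaded NUMA node under the given socket (or under the machine).
       */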
  221 static void find_leastload_node(obj_attribute_type * tree, hwloc_obj_t original,
  222                                 hwloc_obj_t * result)
  223 {
  224     int i, j, k, per, ix, depth_nodes, num_nodes, depth_sockets, num_sockets;
  225     hwloc_obj_t obj, tmp;
  226 
  227     depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  228     num_nodes = hwloc_get_nbobjs_by_depth(topology, depth_nodes);
  229 
  230     /* One socket may include multiple NUMA nodes. */
  231     if ((original->type == HWLOC_OBJ_SOCKET)) {
  232         depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  233         num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
  234         per = num_nodes / num_sockets;
  235         ix = (original->logical_index) * per;
  236         if (per == 1) {
  237             *result = tree[depth_nodes * num_nodes + ix].obj;
  238         } else {
  239             i = depth_nodes * num_nodes + ix;
  240             for (k = 0; k < (per - 1); k++) {
  241                 j = i + k + 1;
  242                 i = (tree[i].load > tree[j].load) ? j : i;
  243             }
  244             *result = tree[i].obj;
  245         }
  246     } else if (original->type == HWLOC_OBJ_MACHINE) {
  247         tmp = NULL;
  248         for (k = 0; k < num_nodes; k++) {
  249             obj = hwloc_get_obj_by_depth(topology, depth_nodes, k);
  250             if (tmp == NULL) {
  251                 tmp = obj;
  252             } else {
  253                 i = depth_nodes * num_nodes + tmp->logical_index;
  254                 j = depth_nodes * num_nodes + obj->logical_index;
  255                 if (tree[i].load > tree[j].load)
  256                     tmp = obj;
  257             }
  258         }
  259         *result = tmp;
  260     } else {
  261         *result = NULL;
  262     }
  263     return;
  264 }
  265 
  266 static void find_leastload_socket(obj_attribute_type * tree, hwloc_obj_t original,
  267                                   hwloc_obj_t * result)
  268 {
  269     int i, j, k, per, ix, depth_sockets, num_sockets, depth_nodes, num_nodes;
  270     hwloc_obj_t obj, tmp;
  271 
  272     depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  273     num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
  274 
  275     /* One NUMA node may include multiple sockets. */
  276     if ((original->type == HWLOC_OBJ_NODE)) {
  277         depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  278         num_nodes = hwloc_get_nbobjs_by_depth(topology, depth_nodes);
  279         per = num_sockets / num_nodes;
  280         ix = (original->logical_index) * per;
  281         if (per == 1) {
  282             *result = tree[depth_sockets * num_sockets + ix].obj;
  283         } else {
  284             i = depth_sockets * num_sockets + ix;
  285             for (k = 0; k < (per - 1); k++) {
  286                 j = i + k + 1;
  287                 i = (tree[i].load > tree[j].load) ? j : i;
  288             }
  289             *result = tree[i].obj;
  290         }
  291     } else if (original->type == HWLOC_OBJ_MACHINE) {
  292         tmp = NULL;
  293         for (k = 0; k < num_sockets; k++) {
  294             obj = hwloc_get_obj_by_depth(topology, depth_sockets, k);
  295             if (tmp == NULL) {
  296                 tmp = obj;
  297             } else {
  298                 i = depth_sockets * num_sockets + tmp->logical_index;
  299                 j = depth_sockets * num_sockets + obj->logical_index;
  300                 if (tree[i].load > tree[j].load)
  301                     tmp = obj;
  302             }
  303         }
  304         *result = tmp;
  305     } else {
  306         *result = NULL;
  307     }
  308     return;
  309 }
  310 
  311 static void find_leastload_core(obj_attribute_type * tree, hwloc_obj_t original,
  312                                 hwloc_obj_t * result)
  313 {
  314     int i, j, k, per, ix;
  315     int depth_cores, num_cores, depth_sockets, num_sockets, depth_nodes, num_nodes;
  316 
  317     depth_cores = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  318     num_cores = hwloc_get_nbobjs_by_depth(topology, depth_cores);
  319 
  320     /* Core may have Socket or Numanode as direct parent. */
  321     if ((original->type == HWLOC_OBJ_NODE)) {
  322         depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  323         num_nodes = hwloc_get_nbobjs_by_depth(topology, depth_nodes);
  324         per = num_cores / num_nodes;
  325         ix = (original->logical_index) * per;
  326         if (per == 1) {
  327             *result = tree[depth_cores * num_cores + ix].obj;
  328         } else {
  329             i = depth_cores * num_cores + ix;
  330             for (k = 0; k < (per - 1); k++) {
  331                 j = i + k + 1;
  332                 i = (tree[i].load > tree[j].load) ? j : i;
  333             }
  334             *result = tree[i].obj;
  335         }
  336     } else if (original->type == HWLOC_OBJ_SOCKET) {
  337         depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  338         num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
  339         per = num_cores / num_sockets;
  340         ix = (original->logical_index) * per;
  341         if (per == 1) {
  342             *result = tree[depth_cores * num_cores + ix].obj;
  343         } else {
  344             i = depth_cores * num_cores + ix;
  345             for (k = 0; k < (per - 1); k++) {
  346                 j = i + k + 1;
  347                 i = (tree[i].load > tree[j].load) ? j : i;
  348             }
  349             *result = tree[i].obj;
  350         }
  351     } else {
  352         *result = NULL;
  353     }
  354     return;
  355 }
  356 
  357 static void find_leastload_pu(obj_attribute_type * tree, hwloc_obj_t original,
  358                               hwloc_obj_t * result)
  359 {
  360     int i, j, k, per, ix, depth_pus, num_pus, depth_cores, num_cores;
  361 
  362     depth_pus = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
  363     num_pus = hwloc_get_nbobjs_by_depth(topology, depth_pus);
  364 
  365     /* Assume: a PU always has a core as its direct parent. */
  366     if ((original->type == HWLOC_OBJ_CORE)) {
  367         depth_cores = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  368         num_cores = hwloc_get_nbobjs_by_depth(topology, depth_cores);
  369         per = num_pus / num_cores;
  370         ix = (original->logical_index) * per;
  371         if (per == 1) {
  372             *result = tree[depth_pus * num_pus + ix].obj;
  373         } else {
  374             i = depth_pus * num_pus + ix;
  375             for (k = 0; k < (per - 1); k++) {
  376                 j = i + k + 1;
  377                 i = (tree[i].load > tree[j].load) ? j : i;
  378             }
  379             *result = tree[i].obj;
  380         }
  381     } else {
  382         *result = NULL;
  383     }
  384     return;
  385 }
  386 
  387 
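      /* Record activity in tree slot ix: remember the hwloc object, mark CPU
       * "cpuset" in the slot's CPU set (when cpuset >= 0) and add "load". */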
  388 static void update_obj_attribute(obj_attribute_type * tree, int ix, hwloc_obj_t obj,
  389                                  int cpuset, float load)
  390 {
  391     tree[ix].obj = obj;
  392     if (!(cpuset < 0)) {
  393         CPU_SET(cpuset, &(tree[ix].cpuset));
  394     }
  395     tree[ix].load += load;
  396 }
  397 
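      /* Charge "load" for CPU "cpuset" to the PU itself and to each of its
       * CORE / SOCKET / NODE ancestors present in this topology. */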
  398 static void insert_load(obj_attribute_type * tree, hwloc_obj_t pu, int cpuset, float load)
  399 {
  400     int k, depth_pus, num_pus = 0;
  401     int depth_cores, depth_sockets, depth_nodes, num_cores = 0, num_sockets =
  402         0, num_nodes = 0;
  403     hwloc_obj_t parent;
  404 
  405     depth_pus = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_PU);
  406     num_pus = hwloc_get_nbobjs_by_depth(topology, depth_pus);
  407 
  408     depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  409     if (depth_nodes != HWLOC_TYPE_DEPTH_UNKNOWN) {
  410         num_nodes = hwloc_get_nbobjs_by_depth(topology, depth_nodes);
  411     }
  412     depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  413     if (depth_sockets != HWLOC_TYPE_DEPTH_UNKNOWN) {
  414         num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
  415     }
  416     depth_cores = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  417     if (depth_cores != HWLOC_TYPE_DEPTH_UNKNOWN) {
  418         num_cores = hwloc_get_nbobjs_by_depth(topology, depth_cores);
  419     }
  420 
  421     /* Add obj, cpuset and load for HWLOC_OBJ_PU */
  422     k = depth_pus * num_pus + pu->logical_index;
  423     update_obj_attribute(tree, k, pu, cpuset, load);
  424     /* Add cpuset and load for HWLOC_OBJ_CORE */
  425     if (depth_cores != HWLOC_TYPE_DEPTH_UNKNOWN) {
  426         find_parent(pu, HWLOC_OBJ_CORE, &parent);
  427         k = depth_cores * num_cores + parent->logical_index;
  428         update_obj_attribute(tree, k, parent, cpuset, load);
  429     }
  430     /* Add cpuset and load for HWLOC_OBJ_SOCKET */
  431     if (depth_sockets != HWLOC_TYPE_DEPTH_UNKNOWN) {
  432         find_parent(pu, HWLOC_OBJ_SOCKET, &parent);
  433         k = depth_sockets * num_sockets + parent->logical_index;
  434         update_obj_attribute(tree, k, parent, cpuset, load);
  435     }
  436     /* Add cpuset and load for HWLOC_OBJ_NODE */
  437     if (depth_nodes != HWLOC_TYPE_DEPTH_UNKNOWN) {
  438         find_parent(pu, HWLOC_OBJ_NODE, &parent);
  439         k = depth_nodes * num_nodes + parent->logical_index;
  440         update_obj_attribute(tree, k, parent, cpuset, load);
  441     }
  442     return;
  443 }
  444 
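      /* Account one running process in the load tree: every PU set in its
       * affinity mask receives an equal share of the process's load. */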
  445 static void cac_load(obj_attribute_type * tree, cpu_set_t cpuset)
  446 {
  447     int i, j, depth_pus, num_pus;
  448     float proc_load;
  449     int num_processes = 0;
  450     hwloc_obj_t obj;
  451 
  452     depth_pus = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_PU);
  453     num_pus = hwloc_get_nbobjs_by_depth(topology, depth_pus);
  454 
  455     for (i = 0; i < num_pus; i++) {
  456         if (CPU_ISSET(i, &cpuset)) {
  457             num_processes++;
  458         }
  459     }
  460 
  461     /* The process may run on num_processes PUs; each of them is charged a load of proc_load. */
  462     proc_load = 1.0 / num_processes;
  463 
  464     /*
  465      * num_pus is the number of HWLOC_OBJ_PU objects (system CPUs); it also
  466      * equals the number of HWLOC_OBJ_CORE objects when HT is disabled or absent.
  467      */
  468 
  469     for (i = 0; i < num_pus; i++) {
  470         if (CPU_ISSET(i, &cpuset)) {
  471             for (j = 0; j < num_pus; j++) {
  472                 obj = hwloc_get_obj_by_depth(topology, depth_pus, j);
  473                 if (obj->os_index == i) {
  474                     insert_load(tree, obj, i, proc_load);
  475                 }
  476             }
  477         }
  478     }
  479     return;
  480 }
  481 
  482 static void insert_core_mapping(int ix, hwloc_obj_t pu, obj_attribute_type * tree)
  483 {
  484     core_mapping[ix] = pu->os_index;
  485     /* This process will be bound to one PU/core.  The load for that
  486      * PU/core is 1; the cpuset is not updated (cpuset argument is -1).
  487      */
  488     insert_load(tree, pu, -1, 1);
  489     return;
  490 }
  491 
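      /*
       * Least-load "scatter" mapping: for every slot, descend from the machine to
       * the least-loaded NUMA node / socket, then to its least-loaded core and PU,
       * and record that PU in core_mapping.
       */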
  492 void map_scatter_load(obj_attribute_type * tree)
  493 {
  494     int k;
  495     int depth_cores, depth_sockets, depth_nodes, num_cores = 0;
  496     hwloc_obj_t root, node, sockets, core_parent, core, result;
  497 
  498     root = hwloc_get_root_obj(topology);
  499 
  500     depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  501 
  502     depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  503 
  504     depth_cores = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  505     if (depth_cores != HWLOC_TYPE_DEPTH_UNKNOWN) {
  506         num_cores = hwloc_get_nbobjs_by_depth(topology, depth_cores);
  507     }
  508 
  509     k = 0;
  510     /* Assume a SOCKET level always exists, but a NUMANODE level may not (e.g. Clovertown). */
  511     while (k < num_cores) {
  512         if (depth_nodes == HWLOC_TYPE_DEPTH_UNKNOWN) {
  513             find_leastload_socket(tree, root, &result);
  514         } else {
  515             if ((depth_nodes) < (depth_sockets)) {
  516                 find_leastload_node(tree, root, &result);
  517                 node = result;
  518                 find_leastload_socket(tree, node, &result);
  519             } else {
  520                 find_leastload_socket(tree, root, &result);
  521                 sockets = result;
  522                 find_leastload_node(tree, sockets, &result);
  523             }
  524         }
  525         core_parent = result;
  526         find_leastload_core(tree, core_parent, &result);
  527         core = result;
  528         find_leastload_pu(tree, core, &result);
  529         insert_core_mapping(k, result, tree);
  530         k++;
  531     }
  532 }
  533 
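      /*
       * Least-load "bunch" mapping: keep assigning PUs from the currently
       * least-loaded socket (or NUMA node) while their loads remain equal, so
       * consecutive slots stay packed close together.
       */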
  534 void map_bunch_load(obj_attribute_type * tree)
  535 {
  536     int i, j, k, per = 0;
  537     int per_socket_node, depth_pus, num_pus = 0;
  538     float current_socketornode_load = 0, current_core_load = 0;
  539     int depth_cores, depth_sockets, depth_nodes, num_cores = 0, num_sockets =
  540         0, num_nodes = 0;
  541     hwloc_obj_t root, node, sockets, core_parent, core, pu, result;
  542 
  543     root = hwloc_get_root_obj(topology);
  544 
  545     depth_nodes = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  546     if (depth_nodes != HWLOC_TYPE_DEPTH_UNKNOWN) {
  547         num_nodes = hwloc_get_nbobjs_by_depth(topology, depth_nodes);
  548     }
  549 
  550     depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
  551     if (depth_sockets != HWLOC_TYPE_DEPTH_UNKNOWN) {
  552         num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
  553     }
  554 
  555     depth_cores = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  556     if (depth_cores != HWLOC_TYPE_DEPTH_UNKNOWN) {
  557         num_cores = hwloc_get_nbobjs_by_depth(topology, depth_cores);
  558     }
  559 
  560     depth_pus = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
  561     if (depth_pus != HWLOC_TYPE_DEPTH_UNKNOWN) {
  562         num_pus = hwloc_get_nbobjs_by_depth(topology, depth_pus);
  563     }
  564 
  565     k = 0;
  566     /* Assume a SOCKET level always exists, but a NUMANODE level may not (e.g. Clovertown). */
  567     while (k < num_cores) {
  568         if (depth_nodes == HWLOC_TYPE_DEPTH_UNKNOWN) {
  569             find_leastload_socket(tree, root, &result);
  570             core_parent = result;
  571             per = num_cores / num_sockets;
  572             for (i = 0; (i < per) && (k < num_cores); i++) {
  573                 find_leastload_core(tree, core_parent, &result);
  574                 core = result;
  575                 find_leastload_pu(tree, core, &result);
  576                 pu = result;
  577                 if (i == 0) {
  578                     current_core_load =
  579                         tree[depth_pus * num_pus + pu->logical_index].load;
  580                     insert_core_mapping(k, pu, tree);
  581                     k++;
  582                 } else {
  583                     if (compare_float
  584                         (tree[depth_pus * num_pus + pu->logical_index].load,
  585                          current_core_load)) {
  586                         insert_core_mapping(k, pu, tree);
  587                         k++;
  588                     }
  589                 }
  590             }
  591         } else {
  592             if ((depth_nodes) < (depth_sockets)) {
  593                 find_leastload_node(tree, root, &result);
  594                 node = result;
  595                 per_socket_node = num_sockets / num_nodes;
  596                 for (j = 0; (j < per_socket_node) && (k < num_cores); j++) {
  597                     find_leastload_socket(tree, node, &result);
  598                     sockets = result;
  599                     if (j == 0) {
  600                         current_socketornode_load =
  601                             tree[depth_sockets * num_sockets +
  602                                  sockets->logical_index].load;
  603                         per = num_cores / num_sockets;
  604                         for (i = 0; (i < per) && (k < num_cores); i++) {
  605                             find_leastload_core(tree, sockets, &result);
  606                             core = result;
  607                             find_leastload_pu(tree, core, &result);
  608                             pu = result;
  609                             if (i == 0) {
  610                                 current_core_load =
  611                                     tree[depth_pus * num_pus + pu->logical_index].load;
  612                                 insert_core_mapping(k, pu, tree);
  613                                 k++;
  614                             } else {
  615                                 if (compare_float
  616                                     (tree[depth_pus * num_pus + pu->logical_index].load,
  617                                      current_core_load)) {
  618                                     insert_core_mapping(k, pu, tree);
  619                                     k++;
  620                                 }
  621                             }
  622                         }
  623                     } else {
  624                         if (compare_float
  625                             (tree
  626                              [depth_sockets * num_sockets + sockets->logical_index].load,
  627                              current_socketornode_load)) {
  628                             for (i = 0; (i < per) && (k < num_cores); i++) {
  629                                 find_leastload_core(tree, sockets, &result);
  630                                 core = result;
  631                                 find_leastload_pu(tree, core, &result);
  632                                 pu = result;
  633                                 if (i == 0) {
  634                                     current_core_load =
  635                                         tree[depth_pus * num_pus +
  636                                              pu->logical_index].load;
  637                                     insert_core_mapping(k, pu, tree);
  638                                     k++;
  639                                 } else {
  640                                     if (compare_float
  641                                         (tree
  642                                          [depth_pus * num_pus + pu->logical_index].load,
  643                                          current_core_load)) {
  644                                         insert_core_mapping(k, pu, tree);
  645                                         k++;
  646                                     }
  647                                 }
  648                             }
  649 
  650                         }
  651                     }
  652                 }
  653             } else {    // depth_nodes > depth_sockets
  654                 find_leastload_socket(tree, root, &result);
  655                 sockets = result;
  656                 per_socket_node = num_nodes / num_sockets;
  657                 for (j = 0; (j < per_socket_node) && (k < num_cores); j++) {
  658                     find_leastload_node(tree, sockets, &result);
  659                     node = result;
  660                     if (j == 0) {
  661                         current_socketornode_load =
  662                             tree[depth_nodes * num_nodes + node->logical_index].load;
  663                         per = num_cores / num_sockets;
  664                         for (i = 0; (i < per) && (k < num_cores); i++) {
  665                             find_leastload_core(tree, node, &result);
  666                             core = result;
  667                             find_leastload_pu(tree, core, &result);
  668                             pu = result;
  669                             if (i == 0) {
  670                                 current_core_load =
  671                                     tree[depth_pus * num_pus + pu->logical_index].load;
  672                                 insert_core_mapping(k, pu, tree);
  673                                 k++;
  674                             } else {
  675                                 if (compare_float
  676                                     (tree[depth_pus * num_pus + pu->logical_index].load,
  677                                      current_core_load)) {
  678                                     insert_core_mapping(k, pu, tree);
  679                                     k++;
  680                                 }
  681                             }
  682                         }
  683                     } else {
  684                         if (compare_float
  685                             (tree[depth_nodes * num_nodes + node->logical_index].load,
  686                              current_socketornode_load)) {
  687                             for (i = 0; (i < per) && (k < num_cores); i++) {
  688                                 find_leastload_core(tree, node, &result);
  689                                 core = result;
  690                                 find_leastload_pu(tree, core, &result);
  691                                 pu = result;
  692                                 if (i == 0) {
  693                                     current_core_load =
  694                                         tree[depth_pus * num_pus +
  695                                              pu->logical_index].load;
  696                                     insert_core_mapping(k, pu, tree);
  697                                     k++;
  698                                 } else {
  699                                     if (compare_float
  700                                         (tree
  701                                          [depth_pus * num_pus + pu->logical_index].load,
  702                                          current_core_load)) {
  703                                         insert_core_mapping(k, pu, tree);
  704                                         k++;
  705                                     }
  706                                 }
  707                             }
  708                         }
  709                     }
  710                 }
  711             }   /* depth_nodes > depth_sockets */
  712         }
  713     }   /* while */
  714 }
  715 
  716 /*
  717  * Compare two hwloc_obj_t of type HWLOC_OBJ_PU according to sibling_rank, used with qsort
  718  */
  719 static int cmpproc_smt(const void *a, const void *b)
  720 {
  721     hwloc_obj_t pa = *(hwloc_obj_t *) a;
  722     hwloc_obj_t pb = *(hwloc_obj_t *) b;
  723     return (pa->sibling_rank ==
  724             pb->sibling_rank) ? pa->os_index - pb->os_index : pa->sibling_rank -
  725         pb->sibling_rank;
  726 }
  727 
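      /* qsort comparator: order ancestor_type records by decreasing depth of the
       * common ancestor (deepest, i.e. closest, ancestors first). */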
  728 static int cmpdepth_smt(const void *a, const void *b)
  729 {
  730     ancestor_type pa = *(ancestor_type *) a;
  731     ancestor_type pb = *(ancestor_type *) b;
  732     if ((pa.ancestor)->depth > (pb.ancestor)->depth) {
  733         return -1;
  734     } else if ((pa.ancestor)->depth < (pb.ancestor)->depth) {
  735         return 1;
  736     } else {
  737         return 0;
  738     }
  739 }
  740 
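      /* qsort comparator: order ancestor_type records by decreasing arity of the
       * common ancestor (widest ancestors first). */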
  741 static int cmparity_smt(const void *a, const void *b)
  742 {
  743     ancestor_type pa = *(ancestor_type *) a;
  744     ancestor_type pb = *(ancestor_type *) b;
  745     if ((pa.ancestor)->arity > (pb.ancestor)->arity) {
  746         return -1;
  747     } else if ((pa.ancestor)->arity < (pb.ancestor)->arity) {
  748         return 1;
  749     } else {
  750         return 0;
  751     }
  752 }
  753 
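      /*
       * Choose the starting PU for "bunch" mapping: among all PU pairs, pick the
       * pair whose common ancestor is deepest (breaking ties by largest arity)
       * and return the first member of that pair.
       */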
  754 static void get_first_obj_bunch(hwloc_obj_t * result)
  755 {
  756     hwloc_obj_t *objs;
  757     ancestor_type *array;
  758     int i, j, k, num_objs, num_ancestors;
  759 
  760     if ((num_objs = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU)) <= 0) {
  761         return;
  762     }
  763 
  764     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_objs * sizeof(hwloc_obj_t))) == NULL) {
  765         return;
  766     }
  767 
  768     for (i = 0; i < num_objs; i++) {
  769         objs[i] = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i);
  770     }
  771 
  772     num_ancestors = num_objs * (num_objs - 1) / 2;
  773 
  774     if ((array =
  775          (ancestor_type *) MPIU_Malloc(num_ancestors * sizeof(ancestor_type))) == NULL) {
  776         return;
  777     }
  778 
  779     k = 0;
  780     for (i = 0; i < (num_objs - 1); i++) {
  781         for (j = i + 1; j < num_objs; j++) {
  782             array[k].obja = objs[i];
  783             array[k].objb = objs[j];
  784             array[k].ancestor = hwloc_get_common_ancestor_obj(topology, objs[i], objs[j]);
  785             k++;
  786         }
  787     }
  788 
  789     qsort(array, num_ancestors, sizeof(ancestor_type), cmpdepth_smt);
  790 
  791     for (i = 0; i < (num_ancestors - 1); i++) {
  792         if ((array[i + 1].ancestor)->depth < (array[i].ancestor)->depth) {
  793             break;
  794         }
  795     }
  796 
  797     qsort(array, (i + 1), sizeof(ancestor_type), cmparity_smt);
  798 
  799     *result = array[0].obja;
  800 
  801     MPIU_Free(objs);
  802     MPIU_Free(array);
  803     return;
  804 }
  805 
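      /* Same selection as get_first_obj_bunch(), but over HWLOC_OBJ_SOCKET or
       * HWLOC_OBJ_NODE objects (binding_level) instead of PUs. */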
  806 static void get_first_socket_bunch(hwloc_obj_t * result, hwloc_obj_type_t binding_level)
  807 {
  808     hwloc_obj_t *objs;
  809     ancestor_type *array;
  810     int i, j, k, num_objs, num_ancestors;
  811 
  812     if ((num_objs = hwloc_get_nbobjs_by_type(topology, binding_level)) <= 0) {
  813         return;
  814     }
  815 
  816     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_objs * sizeof(hwloc_obj_t))) == NULL) {
  817         return;
  818     }
  819 
  820     for (i = 0; i < num_objs; i++) {
  821         objs[i] = hwloc_get_obj_by_type(topology, binding_level, i);
  822     }
  823 
  824     num_ancestors = num_objs * (num_objs - 1) / 2;
  825 
  826     if ((array =
  827          (ancestor_type *) MPIU_Malloc(num_ancestors * sizeof(ancestor_type))) == NULL) {
  828         return;
  829     }
  830 
  831     k = 0;
  832     for (i = 0; i < (num_objs - 1); i++) {
  833         for (j = i + 1; j < num_objs; j++) {
  834             array[k].obja = objs[i];
  835             array[k].objb = objs[j];
  836             array[k].ancestor = hwloc_get_common_ancestor_obj(topology, objs[i], objs[j]);
  837             k++;
  838         }
  839     }
  840 
  841     qsort(array, num_ancestors, sizeof(ancestor_type), cmpdepth_smt);
  842 
  843     for (i = 0; i < (num_ancestors - 1); i++) {
  844         if ((array[i + 1].ancestor)->depth < (array[i].ancestor)->depth) {
  845             break;
  846         }
  847     }
  848 
  849     if (i < num_ancestors - 1)
  850         qsort(array, (i + 1), sizeof(ancestor_type), cmparity_smt);
  851 
  852     *result = array[0].obja;
  853 
  854     MPIU_Free(objs);
  855     MPIU_Free(array);
  856     return;
  857 }
  858 
  859 /*
  860  * Yields "scatter" affinity scenario in core_mapping.
  861  */
  862 void map_scatter_core(int num_cpus)
  863 {
  864     hwloc_obj_t *objs, obj, a;
  865     unsigned *pdist, maxd;
  866     int i, j, ix, jp, d, s;
  867 
  868     /* Init and load HWLOC_OBJ_PU objects */
  869     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_cpus * sizeof(hwloc_obj_t *))) == NULL)
  870         return;
  871 
  872     obj = NULL;
  873     i = 0;
  874     while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
  875         objs[i++] = obj;
  876     if (i != num_cpus) {
  877         MPIU_Free(objs);
  878         return;
  879     }
  880 
  881     /* Sort HWLOC_OBJ_PU objects according to sibling_rank */
  882     qsort(objs, num_cpus, sizeof(hwloc_obj_t *), cmpproc_smt);
  883 
  884     /* Init cumulative distances */
  885     if ((pdist = (unsigned *) MPIU_Malloc(num_cpus * sizeof(unsigned))) == NULL) {
  886         MPIU_Free(objs);
  887         return;
  888     }
  889 
  890     /* Loop over objects, ix is index in objs where sorted objects start */
  891     ix = num_cpus;
  892     s = -1;
  893     while (ix > 0) {
  894         /* If new group of SMT processors starts, zero distances */
  895         if (s != objs[0]->sibling_rank) {
  896             s = objs[0]->sibling_rank;
  897             for (j = 0; j < ix; j++)
  898                 pdist[j] = 0;
  899         }
  900         /*
  901          * Determine object that has max. distance to all already stored objects.
  902          * Consider only groups of SMT processors with same sibling_rank.
  903          */
  904         maxd = 0;
  905         jp = 0;
  906         for (j = 0; j < ix; j++) {
  907             if ((j) && (objs[j - 1]->sibling_rank != objs[j]->sibling_rank))
  908                 break;
  909             if (pdist[j] > maxd) {
  910                 maxd = pdist[j];
  911                 jp = j;
  912             }
  913         }
  914 
  915         /* Rotate found object to the end of the list, map out found object from distances */
  916         obj = objs[jp];
  917         for (j = jp; j < num_cpus - 1; j++) {
  918             objs[j] = objs[j + 1];
  919             pdist[j] = pdist[j + 1];
  920         }
  921         objs[j] = obj;
  922         ix--;
  923 
  924         /*
  925          * Update cumulative distances of all remaining objects with new stored one.
  926          * If two HWLOC_OBJ_PU objects don't share a common ancestor, the topology is broken.
  927          * Our scheme cannot be used in this case.
  928          */
  929         for (j = 0; j < ix; j++) {
  930             if ((a = hwloc_get_common_ancestor_obj(topology, obj, objs[j])) == NULL) {
  931                 MPIU_Free(pdist);
  932                 MPIU_Free(objs);
  933                 return;
  934             }
  935             d = objs[j]->depth + obj->depth - 2 * a->depth;
  936             pdist[j] += d * d;
  937         }
  938     }
  939 
  940     /* Collect os_indexes into core_mapping */
  941     for (i = 0; i < num_cpus; i++) {
  942         core_mapping[i] = objs[i]->os_index;
  943     }
  944 
  945     MPIU_Free(pdist);
  946     MPIU_Free(objs);
  947     return;
  948 }
  949 
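      /*
       * "Scatter" mapping at socket / NUMA-node granularity: order the objects by
       * maximum topological distance and assign slots to them round-robin, storing
       * each object's cpuset mask (rather than a single os_index) in core_mapping.
       */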
  950 void map_scatter_socket(int num_sockets, hwloc_obj_type_t binding_level)
  951 {
  952     hwloc_obj_t *objs, obj, a;
  953     unsigned *pdist, maxd;
  954     int i, j, ix, jp, d, s, num_cores;
  955 
  956     /* Init and load HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects */
  957     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_sockets * sizeof(hwloc_obj_t *))) == NULL)
  958         return;
  959 
  960     if ((num_cores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE)) <= 0) {
  961         MPIU_Free(objs); return;    /* avoid leaking objs */
  962     }
  963 
  964     obj = NULL;
  965     i = 0;
  966     while ((obj = hwloc_get_next_obj_by_type(topology, binding_level, obj)) != NULL)
  967         objs[i++] = obj;
  968     if (i != num_sockets) {
  969         MPIU_Free(objs);
  970         return;
  971     }
  972 
  973     /* Sort HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects according to sibling_rank */
  974     qsort(objs, num_sockets, sizeof(hwloc_obj_t *), cmpproc_smt);
  975 
  976     /* Init cumulative distances */
  977     if ((pdist = (unsigned *) MPIU_Malloc(num_sockets * sizeof(unsigned))) == NULL) {
  978         MPIU_Free(objs);
  979         return;
  980     }
  981 
  982     /* Loop over objects, ix is index in objs where sorted objects start */
  983     ix = num_sockets;
  984     s = -1;
  985     while (ix > 0) {
  986         /* If new group of SMT processors starts, zero distances */
  987         if (s != objs[0]->sibling_rank) {
  988             s = objs[0]->sibling_rank;
  989             for (j = 0; j < ix; j++)
  990                 pdist[j] = 0;
  991         }
  992         /*
  993          * Determine object that has max. distance to all already stored objects.
  994          * Consider only groups of SMT processors with same sibling_rank.
  995          */
  996         maxd = 0;
  997         jp = 0;
  998         for (j = 0; j < ix; j++) {
  999             if ((j) && (objs[j - 1]->sibling_rank != objs[j]->sibling_rank))
 1000                 break;
 1001             if (pdist[j] > maxd) {
 1002                 maxd = pdist[j];
 1003                 jp = j;
 1004             }
 1005         }
 1006 
 1007         /* Rotate found object to the end of the list, map out found object from distances */
 1008         obj = objs[jp];
 1009         for (j = jp; j < num_sockets - 1; j++) {
 1010             objs[j] = objs[j + 1];
 1011             pdist[j] = pdist[j + 1];
 1012         }
 1013         objs[j] = obj;
 1014         ix--;
 1015 
 1016         /*
 1017          * Update cumulative distances of all remaining objects with new stored one.
 1018          * If two HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects don't share a common ancestor, the topology is broken.
 1019          * Our scheme cannot be used in this case.
 1020          */
 1021         for (j = 0; j < ix; j++) {
 1022             if ((a = hwloc_get_common_ancestor_obj(topology, obj, objs[j])) == NULL) {
 1023                 MPIU_Free(pdist);
 1024                 MPIU_Free(objs);
 1025                 return;
 1026             }
 1027             d = objs[j]->depth + obj->depth - 2 * a->depth;
 1028             pdist[j] += d * d;
 1029         }
 1030     }
 1031 
 1032     /* Collect each socket's/NUMA node's cpuset mask into core_mapping */
 1033     for (i = 0, j = 0; i < num_cores; i++, j++) {
 1034         if (j == num_sockets) {
 1035             j = 0;
 1036         }
 1037         core_mapping[i] = hwloc_bitmap_to_ulong((hwloc_const_bitmap_t) (objs[j]->cpuset));
 1038     }
 1039 
 1040     MPIU_Free(pdist);
 1041     MPIU_Free(objs);
 1042     return;
 1043 }
 1044 
 1045  /*
 1046   * Yields "bunch" affinity scenario in core_mapping.
 1047   */
 1048 void map_bunch_core(int num_cpus)
 1049 {
 1050     hwloc_obj_t *objs, obj, a;
 1051     unsigned *pdist, mind;
 1052     int i, j, ix, jp, d, s, num_cores, num_pus;
 1053 
 1054     /* Init and load HWLOC_OBJ_PU objects */
 1055     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_cpus * sizeof(hwloc_obj_t *))) == NULL)
 1056         return;
 1057 
 1058     obj = NULL;
 1059     i = 0;
 1060 
 1061     if ((num_cores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE)) <= 0) {
 1062         MPIU_Free(objs);
 1063         return;
 1064     }
 1065 
 1066     if ((num_pus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU)) <= 0) {
 1067         MPIU_Free(objs);
 1068         return;
 1069     }
 1070 
 1071     /* SMT Disabled */
 1072     if (num_cores == num_pus) {
 1073 
 1074         get_first_obj_bunch(&obj);
 1075 
 1076         if (obj == NULL) {
 1077             MPIU_Free(objs);
 1078             return;
 1079         }
 1080 
 1081         objs[i] = obj;
 1082         i++;
 1083 
 1084         while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL) {
 1085             objs[i] = obj;
 1086             i++;
 1087         }
 1088 
 1089         obj = NULL;
 1090         while (i != num_cpus) {
 1091             obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj);
 1092             objs[i++] = obj;
 1093         }
 1094 
 1095         if (i != num_cpus) {
 1096             MPIU_Free(objs);
 1097             return;
 1098         }
 1099 
 1100     } else {    /* SMT Enabled */
 1101 
 1102         while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL)
 1103             objs[i++] = obj;
 1104 
 1105         if (i != num_cpus) {
 1106             MPIU_Free(objs);
 1107             return;
 1108         }
 1109 
 1110         /* Sort HWLOC_OBJ_PU objects according to sibling_rank */
 1111         qsort(objs, num_cpus, sizeof(hwloc_obj_t *), cmpproc_smt);
 1112     }
 1113 
 1114     /* Init cumulative distances */
 1115     if ((pdist = (unsigned *) MPIU_Malloc(num_cpus * sizeof(unsigned))) == NULL) {
 1116         MPIU_Free(objs);
 1117         return;
 1118     }
 1119 
 1120     /* Loop over objects, ix is index in objs where sorted objects start */
 1121     ix = num_cpus;
 1122     s = -1;
 1123     while (ix > 0) {
 1124         /* If new group of SMT processors starts, zero distances */
 1125         if (s != objs[0]->sibling_rank) {
 1126             s = objs[0]->sibling_rank;
 1127             for (j = 0; j < ix; j++)
 1128                 pdist[j] = UINT_MAX;
 1129         }
 1130         /*
 1131          * Determine object that has min. distance to all already stored objects.
 1132          * Consider only groups of SMT processors with same sibling_rank.
 1133          */
 1134         mind = UINT_MAX;
 1135         jp = 0;
 1136         for (j = 0; j < ix; j++) {
 1137             if ((j) && (objs[j - 1]->sibling_rank != objs[j]->sibling_rank))
 1138                 break;
 1139             if (pdist[j] < mind) {
 1140                 mind = pdist[j];
 1141                 jp = j;
 1142             }
 1143         }
 1144 
 1145         /* Rotate found object to the end of the list, map out found object from distances */
 1146         obj = objs[jp];
 1147         for (j = jp; j < num_cpus - 1; j++) {
 1148             objs[j] = objs[j + 1];
 1149             pdist[j] = pdist[j + 1];
 1150         }
 1151         objs[j] = obj;
 1152         ix--;
 1153 
 1154         /*
 1155          * Update cumulative distances of all remaining objects with new stored one.
 1156          * If two HWLOC_OBJ_PU objects don't share a common ancestor, the topology is broken.
 1157          * Our scheme cannot be used in this case.
 1158          */
 1159         for (j = 0; j < ix; j++) {
 1160             if ((a = hwloc_get_common_ancestor_obj(topology, obj, objs[j])) == NULL) {
 1161                 MPIU_Free(pdist);
 1162                 MPIU_Free(objs);
 1163                 return;
 1164             }
 1165             d = objs[j]->depth + obj->depth - 2 * a->depth;
 1166             pdist[j] += d * d;
 1167         }
 1168     }
 1169 
 1170     /* Collect os_indexes into core_mapping */
 1171     for (i = 0; i < num_cpus; i++) {
 1172         core_mapping[i] = objs[i]->os_index;
 1173     }
 1174 
 1175     MPIU_Free(pdist);
 1176     MPIU_Free(objs);
 1177     return;
 1178 }
 1179 
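      /* Count how many of the first num_cores bits of obj->cpuset are set, i.e.
       * how many cores this socket / NUMA node contributes to the mapping. */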
 1180 int check_num_child(hwloc_obj_t obj)
 1181 {
 1182     int i = 0, k, num_cores;
 1183 
 1184     if ((num_cores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE)) <= 0) {
 1185         return 0;
 1186     }
 1187 
 1188     for (k = 0; k < num_cores; k++) {
 1189         if (hwloc_bitmap_isset((hwloc_const_bitmap_t) (obj->cpuset), k)) {
 1190             i++;
 1191         }
 1192     }
 1193 
 1194     return i;
 1195 }
 1196 
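      /*
       * "Bunch" mapping at socket / NUMA-node granularity: order the objects by
       * minimum topological distance and keep assigning consecutive slots to the
       * same object until its cores are exhausted, storing its cpuset mask.
       */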
 1197 void map_bunch_socket(int num_sockets, hwloc_obj_type_t binding_level)
 1198 {
 1199     hwloc_obj_t *objs, obj, a;
 1200     unsigned *pdist, mind;
 1201     int i, j, ix, jp, d, s, num_cores, num_pus;
 1202 
 1203     /* Init and load HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects */
 1204     if ((objs = (hwloc_obj_t *) MPIU_Malloc(num_sockets * sizeof(hwloc_obj_t *))) == NULL)
 1205         return;
 1206 
 1207     obj = NULL;
 1208     i = 0;
 1209 
 1210     if ((num_cores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE)) <= 0) {
 1211         MPIU_Free(objs);
 1212         return;
 1213     }
 1214 
 1215     if ((num_pus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU)) <= 0) {
 1216         MPIU_Free(objs);
 1217         return;
 1218     }
 1219 
 1220     /* SMT Disabled */
 1221     if (num_cores == num_pus) {
 1222 
 1223         get_first_socket_bunch(&obj, binding_level);
 1224 
 1225         if (obj == NULL) {
 1226             MPIU_Free(objs);
 1227             return;
 1228         }
 1229 
 1230         objs[i] = obj;
 1231         i++;
 1232 
 1233         while ((obj = hwloc_get_next_obj_by_type(topology, binding_level, obj)) != NULL) {
 1234             objs[i] = obj;
 1235             i++;
 1236         }
 1237 
 1238         obj = NULL;
 1239         while (i != num_sockets) {
 1240             obj = hwloc_get_next_obj_by_type(topology, binding_level, obj);
 1241             objs[i++] = obj;
 1242         }
 1243 
 1244         if (i != num_sockets) {
 1245             MPIU_Free(objs);
 1246             return;
 1247         }
 1248 
 1249     } else {    /* SMT Enabled */
 1250 
 1251         while ((obj = hwloc_get_next_obj_by_type(topology, binding_level, obj)) != NULL)
 1252             objs[i++] = obj;
 1253 
 1254         if (i != num_sockets) {
 1255             MPIU_Free(objs);
 1256             return;
 1257         }
 1258 
 1259         /* Sort HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects according to sibling_rank */
 1260         qsort(objs, num_sockets, sizeof(hwloc_obj_t *), cmpproc_smt);
 1261 
 1262     }
 1263 
 1264     /* Init cumulative distances */
 1265     if ((pdist = (unsigned *) MPIU_Malloc(num_sockets * sizeof(unsigned))) == NULL) {
 1266         MPIU_Free(objs);
 1267         return;
 1268     }
 1269 
 1270     /* Loop over objects, ix is index in objs where sorted objects start */
 1271     ix = num_sockets;
 1272     s = -1;
 1273     while (ix > 0) {
 1274         /* If new group of SMT processors starts, zero distances */
 1275         if (s != objs[0]->sibling_rank) {
 1276             s = objs[0]->sibling_rank;
 1277             for (j = 0; j < ix; j++)
 1278                 pdist[j] = UINT_MAX;
 1279         }
 1280         /*
 1281          * Determine object that has min. distance to all already stored objects.
 1282          * Consider only groups of SMT processors with same sibling_rank.
 1283          */
 1284         mind = UINT_MAX;
 1285         jp = 0;
 1286         for (j = 0; j < ix; j++) {
 1287             if ((j) && (objs[j - 1]->sibling_rank != objs[j]->sibling_rank))
 1288                 break;
 1289             if (pdist[j] < mind) {
 1290                 mind = pdist[j];
 1291                 jp = j;
 1292             }
 1293         }
 1294 
 1295         /* Rotate found object to the end of the list, map out found object from distances */
 1296         obj = objs[jp];
 1297         for (j = jp; j < num_sockets - 1; j++) {
 1298             objs[j] = objs[j + 1];
 1299             pdist[j] = pdist[j + 1];
 1300         }
 1301         objs[j] = obj;
 1302         ix--;
 1303 
 1304         /*
 1305          * Update cumulative distances of all remaining objects with new stored one.
 1306          * If two HWLOC_OBJ_SOCKET or HWLOC_OBJ_NODE objects don't share a common ancestor, the topology is broken.
 1307          * Our scheme cannot be used in this case.
 1308          */
 1309         for (j = 0; j < ix; j++) {
 1310             if ((a = hwloc_get_common_ancestor_obj(topology, obj, objs[j])) == NULL) {
 1311                 MPIU_Free(pdist);
 1312                 MPIU_Free(objs);
 1313                 return;
 1314             }
 1315             d = objs[j]->depth + obj->depth - 2 * a->depth;
 1316             pdist[j] += d * d;
 1317         }
 1318     }
 1319 
 1320     /* Collect each socket's/NUMA node's cpuset mask into core_mapping */
 1321     int num_child_in_socket[num_sockets];
 1322 
 1323     for (i = 0; i < num_sockets; i++) {
 1324         num_child_in_socket[i] = check_num_child(objs[i]);
 1325     }
 1326 
 1327     for (i = 1; i < num_sockets; i++)
 1328         num_child_in_socket[i] += num_child_in_socket[i - 1];
 1329 
 1330     for (i = 0, j = 0; i < num_cores; i++) {
 1331         if (i == num_child_in_socket[j]) {
 1332             j++;
 1333         }
 1334         core_mapping[i] = hwloc_bitmap_to_ulong((hwloc_const_bitmap_t) (objs[j]->cpuset));
 1335     }
 1336 
 1337     MPIU_Free(pdist);
 1338     MPIU_Free(objs);
 1339     return;
 1340 }
 1341 
 1342 static int num_digits(unsigned long numcpus)
 1343 {
 1344     int n_digits = 0;
 1345     while (numcpus > 0) {
 1346         n_digits++;
 1347         numcpus /= 10;
 1348     }
 1349     return n_digits;
 1350 }
 1351 
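      /*
       * Build custom_cpu_mapping, an ordered list of target CPU ids (or cpuset
       * masks) derived from the hwloc topology according to mv2_binding_policy and
       * mv2_binding_level.  With MV2_ENABLE_LEASTLOAD=1 the current system load
       * (gathered from /proc) is taken into account; otherwise the static bunch /
       * scatter mappings are used.  A user-supplied mapping string (s_cpu_mapping)
       * bypasses this computation.
       */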
 1352 int get_cpu_mapping_hwloc(long N_CPUs_online, hwloc_topology_t tp)
 1353 {
 1354     unsigned topodepth = -1, depth = -1;
 1355     int num_processes = 0, rc = 0, i;
 1356     int num_sockets = 0;
 1357     int num_numanodes = 0;
 1358     int num_cpus = 0;
 1359     char *s;
 1360     struct dirent **namelist;
 1361     pid_t pid;
 1362     obj_attribute_type *tree = NULL;
 1363     char *value;
 1364 
 1365     /* Determine topology depth */
 1366     topodepth = hwloc_topology_get_depth(tp);
 1367     if (topodepth == HWLOC_TYPE_DEPTH_UNKNOWN) {
 1368         fprintf(stderr, "Warning: %s: Failed to determine topology depth.\n", __func__);
 1369         return (topodepth);
 1370     }
 1371 
 1372     /* Count number of (logical) processors */
 1373     depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PU);
 1374 
 1375     if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
 1376         fprintf(stderr, "Warning: %s: Failed to determine number of processors.\n",
 1377                 __func__);
 1378         return (depth);
 1379     }
 1380     if ((num_cpus = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PU)) <= 0) {
 1381         fprintf(stderr, "Warning: %s: Failed to determine number of processors.\n",
 1382                 __func__);
 1383         return -1;
 1384     }
 1385 
 1386     /* Count number of sockets */
 1387     depth = hwloc_get_type_depth(tp, HWLOC_OBJ_SOCKET);
 1388     if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
 1389         fprintf(stderr, "Warning: %s: Failed to determine number of sockets.\n",
 1390                 __func__);
 1391         return (depth);
 1392     } else {
 1393         num_sockets = hwloc_get_nbobjs_by_depth(tp, depth);
 1394     }
 1395 
 1396     /* Count number of numanodes */
 1397     depth = hwloc_get_type_depth(tp, HWLOC_OBJ_NODE);
 1398     if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
 1399         num_numanodes = -1;
 1400     } else {
 1401         num_numanodes = hwloc_get_nbobjs_by_depth(tp, depth);
 1402     }
 1403 
 1404     if (s_cpu_mapping == NULL) {
 1405         /* We only need to allocate memory for the custom_cpu_mapping array
 1406          * and determine the current load on the different CPUs when the
 1407          * user has not specified a mapping string. If the user
 1408          * has provided a mapping string, it overrides everything.
 1409          */
 1410         /*TODO: might need a better representation as number of cores per node increases */
 1411         unsigned long long_max = ULONG_MAX;
 1412         int n_digits = num_digits(long_max);
 1413         custom_cpu_mapping =
 1414             MPIU_Malloc(sizeof(char) * num_cpus * (n_digits + 1) + 1);
 1415         if (custom_cpu_mapping == NULL) {
 1416             goto error_free;
 1417         }
 1418         MPIU_Memset(custom_cpu_mapping, 0,
 1419                     sizeof(char) * num_cpus * (n_digits + 1) + 1);
 1420         core_mapping = (unsigned long *) MPIU_Malloc(num_cpus * sizeof(unsigned long));
 1421         if (core_mapping == NULL) {
 1422             goto error_free;
 1423         }
 1424         for (i = 0; i < num_cpus; i++) {
 1425             core_mapping[i] = -1;
 1426         }
 1427 
 1428         tree = MPIU_Malloc(num_cpus * topodepth * sizeof(obj_attribute_type));
 1429         if (tree == NULL) {
 1430             goto error_free;
 1431         }
 1432         for (i = 0; i < num_cpus * topodepth; i++) {
 1433             tree[i].obj = NULL;
 1434             tree[i].load = 0;
 1435             CPU_ZERO(&(tree[i].cpuset));
 1436         }
 1437 
 1438         if (!(obj_tree = (int *) MPIU_Malloc(num_cpus * topodepth * sizeof(*obj_tree)))) {
 1439             goto error_free;
 1440         }
 1441         for (i = 0; i < num_cpus * topodepth; i++) {
 1442             obj_tree[i] = -1;
 1443         }
 1444 
 1445         ip = 0;
 1446 
 1447         /* MV2_ENABLE_LEASTLOAD: map_bunch/scatter or map_bunch/scatter_load */
 1448         if ((value = getenv("MV2_ENABLE_LEASTLOAD")) != NULL) {
 1449             mv2_enable_leastload = atoi(value);
 1450             if (mv2_enable_leastload != 1) {
 1451                 mv2_enable_leastload = 0;
 1452             }
 1453         }
 1454 
 1455         /* MV2_ENABLE_LEASTLOAD=1, map_bunch_load or map_scatter_load is used */
 1456         if (mv2_enable_leastload == 1) {
 1457             /*
 1458              * Get all processes' pid and cpuset.
 1459              * Get numanode, socket, and core current load according to processes running on it.
 1460              */
 1461             num_processes = scandir("/proc", &namelist, pid_filter, alphasort);
 1462             if (num_processes < 0) {
 1463                 fprintf(stderr, "Warning: %s: Failed to scandir /proc.\n", __func__);
 1464                 return -1;
 1465             } else {
 1466                 int status;
 1467                 cpu_set_t pid_cpuset;
 1468                 CPU_ZERO(&pid_cpuset);
 1469 
 1470                 /* Get cpuset for each running process. */
 1471                 for (i = 0; i < num_processes; i++) {
 1472                     pid = atol(namelist[i]->d_name);
 1473                     status = sched_getaffinity(pid, sizeof(pid_cpuset), &pid_cpuset);
 1474                     /* Process may have already exited; skip it. */
 1475                     if (status < 0) {
 1476                         continue;
 1477                     }
 1478                     cac_load(tree, pid_cpuset);
 1479                 }
 1480                 while (num_processes--) {
 1481                     MPIU_Free(namelist[num_processes]);
 1482                 }
 1483                 MPIU_Free(namelist);
 1484             }
 1485 
 1486             if (mv2_binding_policy == POLICY_SCATTER) {
 1487                 map_scatter_load(tree);
 1488             } else if (mv2_binding_policy == POLICY_BUNCH) {
 1489                 map_bunch_load(tree);
 1490             } else {
 1491                 goto error_free;
 1492             }
 1493         } else {
 1494             /* MV2_ENABLE_LEASTLOAD != 1 or unset: map_bunch or map_scatter is used */
 1495             if (mv2_binding_policy == POLICY_SCATTER) {
 1496                 /* Scatter */
 1497                 hwloc_obj_type_t binding_level = HWLOC_OBJ_SOCKET;
 1498                 if (mv2_binding_level == LEVEL_SOCKET) {
 1499                     map_scatter_socket(num_sockets, binding_level);
 1500                 } else if (mv2_binding_level == LEVEL_NUMANODE) {
 1501                     if (num_numanodes == -1) {
 1502                         /* There is no NUMA node; fall back to socket-level mapping */
 1503                         map_scatter_socket(num_sockets, binding_level);
 1504                     } else {
 1505                         binding_level = HWLOC_OBJ_NODE;
 1506                         map_scatter_socket(num_numanodes, binding_level);
 1507                     }
 1508                 } else {
 1509                     map_scatter_core(num_cpus);
 1510                 }
 1511 
 1512             } else if (mv2_binding_policy == POLICY_BUNCH) {
 1513                 /* Bunch */
 1514                 hwloc_obj_type_t binding_level = HWLOC_OBJ_SOCKET;
 1515                 if (mv2_binding_level == LEVEL_SOCKET) {
 1516                     map_bunch_socket(num_sockets, binding_level);
 1517                 } else if (mv2_binding_level == LEVEL_NUMANODE) {
 1518                     if (num_numanodes == -1) {
 1519                         /* There is no NUMA node; fall back to socket-level mapping */
 1520                         map_bunch_socket(num_sockets, binding_level);
 1521                     } else {
 1522                         binding_level = HWLOC_OBJ_NODE;
 1523                         map_bunch_socket(num_numanodes, binding_level);
 1524                     }
 1525                 } else {
 1526                     map_bunch_core(num_cpus);
 1527                 }
 1528             } else {
 1529                 goto error_free;
 1530             }
 1531         }
 1532 
 1533         /* Assemble custom_cpu_mapping string */
 1534         s = custom_cpu_mapping;
 1535         for (i = 0; i < num_cpus; i++) {
 1536             s += sprintf(s, "%lu:", core_mapping[i]);
 1537         }
 1538     }
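        /*
         * Illustrative example (editorial note, not part of the original code):
         * on a hypothetical node with four PUs where map_bunch_core() filled
         * core_mapping with {0, 1, 2, 3}, the loop above would produce the
         * string "0:1:2:3:".  mv2_get_assigned_cpu_core() later picks the field
         * at index my_local_id from this colon-separated list.
         */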
 1539 
 1540     /* Done */
 1541     rc = MPI_SUCCESS;
 1542 
 1543   error_free:
 1544     if (core_mapping != NULL) {
 1545         MPIU_Free(core_mapping);
 1546     }
 1547     if (tree != NULL) {
 1548         MPIU_Free(tree);
 1549     }
 1550     if (obj_tree) {
 1551         MPIU_Free(obj_tree);
 1552     }
 1553 
 1554     PRINT_DEBUG(DEBUG_INIT_verbose>0,
 1555             "num_cpus: %d, num_sockets: %d, custom_cpu_mapping: %s\n",
 1556             num_cpus, num_sockets, custom_cpu_mapping);
 1557 
 1558     return rc;
 1559 }
 1560 
 1561 
 1562 int get_cpu_mapping(long N_CPUs_online)
 1563 {
 1564     char line[MAX_LINE_LENGTH];
 1565     char input[MAX_NAME_LENGTH];
 1566     char bogus1[MAX_NAME_LENGTH];
 1567     char bogus2[MAX_NAME_LENGTH];
 1568     char bogus3[MAX_NAME_LENGTH];
 1569     int physical_id;            /* value parsed from the "physical id" field */
 1570     int mapping[N_CPUs_online];
 1571     int core_index = 0;
 1572     cpu_type_t cpu_type = 0;
 1573     int model;
 1574     int vendor_set = 0, model_set = 0, num_cpus = 0;
 1575 
 1576     FILE *fp = fopen(CONFIG_FILE, "r");
 1577     if (fp == NULL) {
 1578         printf("cannot open cpuinfo file\n");
 1579         return 0;
 1580     }
 1581 
 1582     MPIU_Memset(mapping, 0, sizeof(mapping));
 1583     custom_cpu_mapping = (char *) MPIU_Malloc(sizeof(char) * N_CPUs_online * 2);
 1584     if (custom_cpu_mapping == NULL) {
 1585         return 0;
 1586     }
 1587     MPIU_Memset(custom_cpu_mapping, 0, sizeof(char) * N_CPUs_online * 2);
 1588 
 1589     while (!feof(fp)) {
 1590         MPIU_Memset(line, 0, MAX_LINE_LENGTH);
 1591         if (fgets(line, MAX_LINE_LENGTH, fp) == NULL) break;
 1592 
 1593         MPIU_Memset(input, 0, MAX_NAME_LENGTH);
 1594         sscanf(line, "%s", input);
 1595 
 1596         if (!vendor_set) {
 1597             if (strcmp(input, "vendor_id") == 0) {
 1598                 MPIU_Memset(input, 0, MAX_NAME_LENGTH);
 1599                 sscanf(line, "%s%s%s", bogus1, bogus2, input);
 1600 
 1601                 if (strcmp(input, "AuthenticAMD") == 0) {
 1602                     cpu_type = CPU_FAMILY_AMD;
 1603                 } else {
 1604                     cpu_type = CPU_FAMILY_INTEL;
 1605                 }
 1606                 vendor_set = 1;
 1607             }
 1608         }
 1609 
 1610         if (!model_set) {
 1611             if (strcmp(input, "model") == 0) {
 1612                 sscanf(line, "%s%s%d", bogus1, bogus2, &model);
 1613                 model_set = 1;
 1614             }
 1615         }
 1616 
 1617         if (strcmp(input, "physical") == 0) {
 1618             sscanf(line, "%s%s%s%d", bogus1, bogus2, bogus3, &physical_id);
 1619             mapping[core_index++] = physical_id;
 1620         }
 1621     }
 1622 
 1623     num_cpus = core_index;
 1624     if (num_cpus == 4) {
 1625         if ((memcmp(INTEL_XEON_DUAL_MAPPING, mapping, sizeof(int) * num_cpus) == 0)
 1626             && (cpu_type == CPU_FAMILY_INTEL)) {
 1627             strcpy(custom_cpu_mapping, "0:2:1:3");
 1628         } else
 1629             if ((memcmp(AMD_OPTERON_DUAL_MAPPING, mapping, sizeof(int) * num_cpus) == 0)
 1630                 && (cpu_type == CPU_FAMILY_AMD)) {
 1631             strcpy(custom_cpu_mapping, "0:1:2:3");
 1632         }
 1633     } else if (num_cpus == 8) {
 1634         if (cpu_type == CPU_FAMILY_INTEL) {
 1635             if (model == CLOVERTOWN_MODEL) {
 1636                 if (memcmp(INTEL_CLOVERTOWN_MAPPING, mapping, sizeof(int) * num_cpus) ==
 1637                     0) {
 1638                     strcpy(custom_cpu_mapping, "0:1:4:5:2:3:6:7");
 1639                 }
 1640             } else if (model == HARPERTOWN_MODEL) {
 1641                 if (memcmp(INTEL_HARPERTOWN_LEG_MAPPING, mapping, sizeof(int) * num_cpus)
 1642                     == 0) {
 1643                     strcpy(custom_cpu_mapping, "0:1:4:5:2:3:6:7");
 1644                 } else
 1645                     if (memcmp
 1646                         (INTEL_HARPERTOWN_COM_MAPPING, mapping,
 1647                          sizeof(int) * num_cpus) == 0) {
 1648                     strcpy(custom_cpu_mapping, "0:4:2:6:1:5:3:7");
 1649                 }
 1650             } else if (model == NEHALEM_MODEL) {
 1651                 if (memcmp(INTEL_NEHALEM_LEG_MAPPING, mapping, sizeof(int) * num_cpus) ==
 1652                     0) {
 1653                     strcpy(custom_cpu_mapping, "0:2:4:6:1:3:5:7");
 1654                 } else
 1655                     if (memcmp(INTEL_NEHALEM_COM_MAPPING, mapping, sizeof(int) * num_cpus)
 1656                         == 0) {
 1657                     strcpy(custom_cpu_mapping, "0:4:1:5:2:6:3:7");
 1658                 }
 1659             }
 1660         }
 1661     } else if (num_cpus == 16) {
 1662         if (cpu_type == CPU_FAMILY_INTEL) {
 1663             if (model == NEHALEM_MODEL) {
 1664                 if (memcmp(INTEL_NEHALEM_LEG_MAPPING, mapping, sizeof(int) * num_cpus) ==
 1665                     0) {
 1666                     strcpy(custom_cpu_mapping, "0:2:4:6:1:3:5:7:8:10:12:14:9:11:13:15");
 1667                 } else
 1668                     if (memcmp(INTEL_NEHALEM_COM_MAPPING, mapping, sizeof(int) * num_cpus)
 1669                         == 0) {
 1670                     strcpy(custom_cpu_mapping, "0:4:1:5:2:6:3:7:8:12:9:13:10:14:11:15");
 1671                 }
 1672             }
 1673         } else if (cpu_type == CPU_FAMILY_AMD) {
 1674             if (memcmp(AMD_BARCELONA_MAPPING, mapping, sizeof(int) * num_cpus) == 0) {
 1675                 strcpy(custom_cpu_mapping, "0:1:2:3:4:5:6:7:8:9:10:11:12:13:14:15");
 1676             }
 1677         }
 1678     }
 1679     fclose(fp);
 1680 
 1681     return MPI_SUCCESS;
 1682 }
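/*
 * Illustrative note (hypothetical /proc/cpuinfo contents, editorial addition):
 * the parser above keys off lines of the form
 *     physical id : 1
 * so a hypothetical dual-socket, four-CPU Intel node reporting physical ids
 * 0,0,1,1 would match INTEL_XEON_DUAL_MAPPING (assuming that table holds
 * {0,0,1,1}) and yield the reordered mapping string "0:2:1:3".
 */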
 1683 
 1684 #if defined(CHANNEL_MRAIL)
 1685 int get_socket_id (int ib_socket, int cpu_socket, int num_sockets,
 1686         tab_socket_t * tab_socket)
 1687 {
 1688     extern int rdma_local_id, rdma_num_hcas;
 1689 
 1690     int rdma_num_proc_per_hca;
 1691     int offset_id;
 1692     int j;
 1693     int socket_id = ib_socket;
 1694     int delta = cpu_socket / tab_socket[ib_socket].num_hca;
 1695 
 1696     rdma_num_proc_per_hca = rdma_num_local_procs / rdma_num_hcas;
 1697 
 1698     if (rdma_num_local_procs % rdma_num_hcas) {
 1699         rdma_num_proc_per_hca++;
 1700     }
 1701 
 1702     offset_id = rdma_local_id % rdma_num_proc_per_hca;
 1703 
 1704     if (offset_id < delta) {
 1705         return ib_socket;
 1706     }
 1707 
 1708     for (j = 0; j < num_sockets - 1; j++) {
 1709         socket_id = tab_socket[ib_socket].closest[j];
 1710 
 1711         if (tab_socket[socket_id].num_hca == 0) {
 1712             offset_id -= delta;
 1713 
 1714             if (offset_id < delta) {
 1715                 return socket_id;
 1716             }
 1717         }
 1718     }
 1719 
 1720     /*
 1721      * Couldn't find a free socket, spread remaining processes
 1722      */
 1723     return rdma_local_id % num_sockets;
 1724 }
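/*
 * Worked example (hypothetical numbers, editorial addition): with 16 local
 * processes, a single HCA on socket 0, two sockets, and cpu_socket = 8,
 * rdma_num_proc_per_hca is 16 and delta is 8, so local ranks with
 * offset_id 0-7 stay on socket 0, while ranks 8-15 spill over to the
 * closest socket that hosts no HCA.
 */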
 1725 
 1726 #undef FUNCNAME
 1727 #define FUNCNAME mv2_get_cpu_core_closest_to_hca
 1728 #undef FCNAME
 1729 #define FCNAME MPL_QUOTE(FUNCNAME)
 1730 int mv2_get_cpu_core_closest_to_hca(int my_local_id, int total_num_cores,
 1731                                     int num_sockets, int depth_sockets)
 1732 {
 1733     int i = 0, k = 0;
 1734     int ib_hca_selected = 0;
 1735     int selected_socket = 0;
 1736     int cores_per_socket = 0;
 1737     tab_socket_t *tab_socket = NULL;
 1738     int linelen = strlen(custom_cpu_mapping);
 1739 
 1740     if (linelen < custom_cpu_mapping_line_max) {
 1741         custom_cpu_mapping_line_max = linelen;
 1742     }
 1743 
 1744     cores_per_socket = total_num_cores / num_sockets;
 1745 
 1746     /*
 1747      * Make ib_hca_selected global or make this section a function
 1748      */
 1749     if (FIXED_MAPPING == rdma_rail_sharing_policy) {
 1750         ib_hca_selected = rdma_process_binding_rail_offset /
 1751                             rdma_num_rails_per_hca;
 1752     } else {
 1753         ib_hca_selected = 0;
 1754     }
 1755 
 1756     tab_socket = (tab_socket_t*)MPIU_Malloc(num_sockets * sizeof(tab_socket_t));
 1757     if (NULL == tab_socket) {
 1758         fprintf(stderr, "could not allocate the socket table\n");
 1759         return -1;
 1760     }
 1761 
 1762     for (i = 0; i < num_sockets; i++) {
 1763         tab_socket[i].num_hca = 0;
 1764 
 1765         for(k = 0; k < num_sockets; k++) {
 1766             tab_socket[i].closest[k] = -1;
 1767         }
 1768     }
 1769 
 1770     for (i = 0; i < rdma_num_hcas; i++) {
 1771         struct ibv_device * ibdev = mv2_MPIDI_CH3I_RDMA_Process.ib_dev[i];
 1772         int socket_id = get_ib_socket(ibdev);
 1773         /*
 1774          * Make this information available globally
 1775          */
 1776         if (i == ib_hca_selected) {
 1777             ib_socket_bind = socket_id;
 1778         }
 1779         tab_socket[socket_id].num_hca++;
 1780     }
 1781 
 1782     hwloc_obj_t obj_src;
 1783     hwloc_obj_t objs[num_sockets];
 1784     char string[20];
 1785 
 1786     for (i = 0; i < num_sockets; i++) {
 1787         obj_src = hwloc_get_obj_by_type(topology, HWLOC_OBJ_SOCKET,i);
 1788         hwloc_get_closest_objs(topology, obj_src, (hwloc_obj_t *)&objs,
 1789                                 num_sockets - 1);
 1790 
 1791         for (k = 0; k < num_sockets - 1; k++) {
 1792             hwloc_obj_type_snprintf(string, sizeof(string),
 1793                                 objs[k], 1);
 1794             tab_socket[i].closest[k] = objs[k]->os_index;
 1795         }
 1796     }
 1797 
 1798     selected_socket = get_socket_id(ib_socket_bind, cores_per_socket,
 1799                                     num_sockets, tab_socket);
 1800     MPIU_Free(tab_socket);
 1801 
 1802     return selected_socket;
 1803 }
 1804 #endif /* defined(CHANNEL_MRAIL) */
 1805 
 1806 #undef FUNCNAME
 1807 #define FUNCNAME mv2_get_assigned_cpu_core
 1808 #undef FCNAME
 1809 #define FCNAME MPL_QUOTE(FUNCNAME)
 1810 int mv2_get_assigned_cpu_core(int my_local_id, char *cpu_mapping, int max_cpu_map_len, char *tp_str)
 1811 {
 1812     int i=0, j=0, c=0;
 1813     char *cp = NULL;
 1814     char *tp = cpu_mapping;
 1815     long N_CPUs_online = sysconf(_SC_NPROCESSORS_ONLN);
 1816 
 1817     while (*tp != '\0') {
 1818         i = 0;
 1819         cp = tp;
 1820 
 1821         while (*cp != '\0' && *cp != ':' && i < max_cpu_map_len) {
 1822             ++cp;
 1823             ++i;
 1824         }
 1825 
 1826         if (j == my_local_id) {
 1827             strncpy(tp_str, tp, i);
 1828             c = atoi(tp);
 1829             if ((mv2_binding_level == LEVEL_CORE) && (c < 0 || c >= N_CPUs_online)) {
 1830                 fprintf(stderr, "Warning! : Core id %d does not exist on this architecture! \n", c);
 1831                 fprintf(stderr, "CPU Affinity is undefined \n");
 1832                 mv2_enable_affinity = 0;
 1833                 return -1;
 1834             }
 1835             tp_str[i] = '\0';
 1836             return 0;
 1837         }
 1838 
 1839         if (*cp == '\0') {
 1840             break;
 1841         }
 1842 
 1843         tp = cp;
 1844         ++tp;
 1845         ++j;
 1846     }
 1847 
 1848     return -1;
 1849 }
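/*
 * Usage sketch (illustrative only; the mapping string and rank below are
 * hypothetical, not taken from a real run): for the colon-separated mapping
 * "0:2:1:3", local rank 2 receives "1" in tp_str and would be bound to core 1.
 */
#if 0
static void mv2_cpu_mapping_usage_example(void)
{
    char tp_str[8];
    /* Hypothetical mapping and rank, for illustration only */
    if (mv2_get_assigned_cpu_core(2, "0:2:1:3", (int) sizeof(tp_str) - 1, tp_str) == 0) {
        /* tp_str now holds "1" */
    }
}
#endif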
 1850 
 1851 #if defined(CHANNEL_MRAIL)
 1852 #undef FUNCNAME
 1853 #define FUNCNAME smpi_set_progress_thread_affinity
 1854 #undef FCNAME
 1855 #define FCNAME MPL_QUOTE(FUNCNAME)
 1856 int smpi_set_progress_thread_affinity()
 1857 {
 1858     int mpi_errno = MPI_SUCCESS;
 1859     hwloc_cpuset_t cpuset;
 1860 
 1861     /* Alloc cpuset */
 1862     cpuset = hwloc_bitmap_alloc();
 1863     /* Set cpuset to mv2_my_async_cpu_id */
 1864     hwloc_bitmap_set(cpuset, mv2_my_async_cpu_id);
 1865     /* Attach the progress thread to mv2_my_async_cpu_id */
 1866     hwloc_set_thread_cpubind(topology, pthread_self(), cpuset, 0);
 1867     /* Free cpuset */
 1868     hwloc_bitmap_free(cpuset);
 1869 
 1870     return mpi_errno;
 1871 }
 1872 
 1873 #undef FUNCNAME
 1874 #define FUNCNAME smpi_identify_allgather_local_core_ids
 1875 #undef FCNAME
 1876 #define FCNAME MPL_QUOTE(FUNCNAME)
 1877 int smpi_identify_allgather_local_core_ids(MPIDI_PG_t * pg)
 1878 {
 1879     int mpi_errno = MPI_SUCCESS;
 1880     int p = 0;
 1881     MPIDI_VC_t *vc = NULL;
 1882     MPID_Request **request = NULL;
 1883     MPI_Status *status= NULL;
 1884     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
 1885     MPID_Comm *comm_ptr=NULL;
 1886 
 1887     MPID_Comm_get_ptr(MPI_COMM_WORLD, comm_ptr );
 1888 
 1889     /* Allocate memory */
 1890     local_core_ids = MPIU_Malloc(g_smpi.num_local_nodes * sizeof(int));
 1891     if (local_core_ids== NULL) {
 1892         ibv_error_abort(GEN_EXIT_ERR, "Failed to allocate memory for local_core_ids\n");
 1893     }
 1894     request = MPIU_Malloc(g_smpi.num_local_nodes * 2 * sizeof(MPID_Request*));
 1895     if (request == NULL) {
 1896         ibv_error_abort(GEN_EXIT_ERR, "Failed to allocate memory for requests\n");
 1897     }
 1898     status = MPIU_Malloc(g_smpi.num_local_nodes * 2 * sizeof(MPI_Status));
 1899     if (status == NULL) {
 1900         ibv_error_abort(GEN_EXIT_ERR, "Failed to allocate memory for statuses\n");
 1901     }
 1902     /* Perform intra-node allgather */
 1903     for (p = 0; p < g_smpi.num_local_nodes; ++p) {
 1904         MPIDI_PG_Get_vc(pg, g_smpi.l2g_rank[p], &vc);
 1905         if (vc->smp.local_nodes >= 0) {
 1906             mpi_errno = MPIC_Irecv((void*)&local_core_ids[vc->smp.local_nodes],
 1907                                     1, MPI_INT, vc->pg_rank, MPIR_ALLGATHER_TAG,
 1908                                     comm_ptr, &request[g_smpi.num_local_nodes+p]);
 1909             if (mpi_errno) {
 1910                 MPIR_ERR_POP(mpi_errno);
 1911             }
 1912             mpi_errno = MPIC_Isend((void*)&mv2_my_cpu_id, 1, MPI_INT, vc->pg_rank,
 1913                                     MPIR_ALLGATHER_TAG, comm_ptr, &request[p], &errflag);
 1914             if (mpi_errno) {
 1915                 MPIR_ERR_POP(mpi_errno);
 1916             }
 1917         }
 1918     }
 1919     /* Wait for intra-node allgather to finish */
 1920     mpi_errno = MPIC_Waitall(g_smpi.num_local_nodes*2, request, status, &errflag);
 1921     if (mpi_errno) {
 1922         MPIR_ERR_POP(mpi_errno);
 1923     }
 1924 
 1925 fn_exit:
 1926     if (request) {
 1927         MPIU_Free(request);
 1928     }
 1929     if (status) {
 1930         MPIU_Free(status);
 1931     }
 1932     return mpi_errno;
 1933 fn_fail:
 1934     goto fn_exit;
 1935 }
 1936 
 1937 #undef FUNCNAME
 1938 #define FUNCNAME smpi_identify_free_cores
 1939 #undef FCNAME
 1940 #define FCNAME MPL_QUOTE(FUNCNAME)
 1941 int smpi_identify_free_cores(hwloc_cpuset_t *sock_cpuset, hwloc_cpuset_t *free_sock_cpuset)
 1942 {
 1943     int i = 0;
 1944     int mpi_errno = MPI_SUCCESS;
 1945     int num_sockets = -1;
 1946     int depth_sockets = -1;
 1947     hwloc_obj_t socket = NULL;
 1948     hwloc_cpuset_t my_cpuset = NULL;
 1949     char cpu_str[128];
 1950 
 1951     /* Alloc cpuset */
 1952     my_cpuset = hwloc_bitmap_alloc();
 1953     *sock_cpuset = hwloc_bitmap_alloc();
 1954     /* Clear CPU set */
 1955     hwloc_bitmap_zero(my_cpuset);
 1956     hwloc_bitmap_zero(*sock_cpuset);
 1957     /* Set cpuset to mv2_my_cpu_id */
 1958     hwloc_bitmap_set(my_cpuset, mv2_my_cpu_id);
 1959 
 1960     depth_sockets   = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
 1961     num_sockets     = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
 1962 
 1963     for (i = 0; i < num_sockets; ++i) {
 1964         socket = hwloc_get_obj_by_depth(topology, depth_sockets, i);
 1965         /* Find the list of CPUs we're allowed to use in the socket */
 1966         hwloc_bitmap_and(*sock_cpuset, socket->online_cpuset, socket->allowed_cpuset);
 1967         /* Find the socket the core I'm bound to resides on */
 1968         if (hwloc_bitmap_intersects(my_cpuset, *sock_cpuset)) {
 1969             /* Create a copy to identify the list of free cores */
 1970             *free_sock_cpuset = hwloc_bitmap_dup(*sock_cpuset);
 1971             /* Store my sock ID */
 1972             mv2_my_sock_id = i;
 1973             break;
 1974         }
 1975     }
 1976     if (i == num_sockets) {
 1977         mpi_errno = MPI_ERR_OTHER;
 1978         MPIR_ERR_POP(mpi_errno);
 1979     } else {
 1980         /* Remove cores used by processes from list of available cores */
 1981         for (i = 0; i < g_smpi.num_local_nodes; ++i) {
 1982             hwloc_bitmap_clr(*free_sock_cpuset, local_core_ids[i]);
 1983         }
 1984         hwloc_bitmap_snprintf(cpu_str, 128, *free_sock_cpuset);
 1985         PRINT_DEBUG(DEBUG_INIT_verbose, "Free sock_cpuset = %s\n", cpu_str);
 1986     }
 1987 
 1988     if (my_cpuset) {
 1989         hwloc_bitmap_free(my_cpuset);
 1990     }
 1991 fn_fail:
 1992     return mpi_errno;
 1993 }
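/*
 * Worked example (hypothetical layout, editorial addition): if the local
 * ranks on this node are bound to cores 0-3 of socket 0, then for a rank
 * whose mv2_my_cpu_id falls in that socket, sock_cpuset covers all of
 * socket 0 and free_sock_cpuset ends up holding socket 0's remaining PUs,
 * from which smpi_identify_core_for_async_thread() later picks a core for
 * the async progress thread.
 */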
 1994 
 1995 #undef FUNCNAME
 1996 #define FUNCNAME smpi_identify_core_for_async_thread
 1997 #undef FCNAME
 1998 #define FCNAME MPL_QUOTE(FUNCNAME)
 1999 int smpi_identify_core_for_async_thread(MPIDI_PG_t * pg)
 2000 {
 2001     int i = 0;
 2002     int mpi_errno = MPI_SUCCESS;
 2003     hwloc_cpuset_t sock_cpuset = NULL;
 2004     hwloc_cpuset_t free_sock_cpuset = NULL;
 2005 
 2006     /* Gather IDs of cores local processes are bound to */
 2007     mpi_errno = smpi_identify_allgather_local_core_ids(pg);
 2008     if (mpi_errno) {
 2009         MPIR_ERR_POP(mpi_errno);
 2010     }
 2011     /* Identify my socket and cores available in my socket */
 2012     mpi_errno = smpi_identify_free_cores(&sock_cpuset, &free_sock_cpuset);
 2013     if (mpi_errno) {
 2014         MPIR_ERR_POP(mpi_errno);
 2015     }
 2016     /* Identify core to be used for async thread */
 2017     if (!hwloc_bitmap_iszero(free_sock_cpuset)) {
 2018         for (i = 0; i < g_smpi.num_local_nodes; ++i) {
 2019             /* If local process 'i' is on a core on my socket */
 2020             if (hwloc_bitmap_isset(sock_cpuset, local_core_ids[i])) {
 2021                 mv2_my_async_cpu_id = hwloc_bitmap_next(free_sock_cpuset, mv2_my_async_cpu_id);
 2022                 if (i == g_smpi.my_local_id) {
 2023                     break;
 2024                 }
 2025             }
 2026         }
 2027         /* Ensure async thread gets bound to a core */
 2028         while (mv2_my_async_cpu_id < 0) {
 2029             mv2_my_async_cpu_id = hwloc_bitmap_next(free_sock_cpuset, mv2_my_async_cpu_id);
 2030         }
 2031     }
 2032     PRINT_DEBUG(DEBUG_INIT_verbose>0, "[local_rank: %d]: sock_id = %d, cpu_id = %d, async_cpu_id = %d\n",
 2033                     g_smpi.my_local_id, mv2_my_sock_id, mv2_my_cpu_id, mv2_my_async_cpu_id);
 2034 
 2035 fn_exit:
 2036     /* Free temporary memory */
 2037     if (local_core_ids) {
 2038         MPIU_Free(local_core_ids);
 2039     }
 2040     /* Free cpuset */
 2041     if (sock_cpuset) {
 2042         hwloc_bitmap_free(sock_cpuset);
 2043     }
 2044     if (free_sock_cpuset) {
 2045         hwloc_bitmap_free(free_sock_cpuset);
 2046     }
 2047     return mpi_errno;
 2048 
 2049 fn_fail:
 2050     goto fn_exit;
 2051 }
 2052 #endif /*defined(CHANNEL_MRAIL)*/
 2053 
 2054 #undef FUNCNAME
 2055 #define FUNCNAME SMPI_LOAD_HWLOC_TOPOLOGY
 2056 #undef FCNAME
 2057 #define FCNAME MPL_QUOTE(FUNCNAME)
 2058 int smpi_load_hwloc_topology(void)
 2059 {
 2060     int bcast_topology = 1;
 2061     int mpi_errno = MPI_SUCCESS;
 2062     char *kvsname, *value;
 2063     char *hostname = NULL;
 2064     char *tmppath = NULL;
 2065     int uid, my_local_id;
 2066 
 2067     MPIDI_STATE_DECL(SMPI_LOAD_HWLOC_TOPOLOGY);
 2068     MPIDI_FUNC_ENTER(SMPI_LOAD_HWLOC_TOPOLOGY);
 2069 
 2070     if (topology != NULL) {
 2071         goto fn_exit;
 2072     }
 2073 
 2074     mpi_errno = hwloc_topology_init(&topology);
 2075     hwloc_topology_set_flags(topology,
 2076             HWLOC_TOPOLOGY_FLAG_IO_DEVICES |
 2077             /*
 2078              * HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM is no longer set, since
 2079              * cpu_cores is now part of the heterogeneity detection logic:
 2080              *
 2081              *     HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
 2082              */
 2083             HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
 2084 
 2085     uid = getuid();
 2086     my_local_id = MPIDI_Process.my_pg->ch.local_process_id;
 2087     MPIDI_PG_GetConnKVSname(&kvsname);
 2088  
 2089     if ((value = getenv("MV2_BCAST_HWLOC_TOPOLOGY")) != NULL) {
 2090         bcast_topology = !!atoi(value);
 2091     }
 2092 
 2093     if (my_local_id < 0) {
 2094         if (MPIDI_Process.my_pg_rank == 0) {
 2095             PRINT_ERROR("WARNING! Invalid my_local_id: %d, Disabling hwloc topology broadcast\n", my_local_id);
 2096         }
 2097         bcast_topology = 0;
 2098     }
 2099 
 2100     if (!bcast_topology) {
 2101         /* Each process loads topology individually */
 2102         mpi_errno = hwloc_topology_load(topology);
 2103         goto fn_exit;
 2104     }
 2105 
 2106     hostname = (char *) MPIU_Malloc(sizeof(char) * HOSTNAME_LENGTH);
 2107     tmppath = (char *) MPIU_Malloc(sizeof(char) * FILENAME_LENGTH);
 2108     xmlpath = (char *) MPIU_Malloc(sizeof(char) * FILENAME_LENGTH);
 2109     if (hostname == NULL || tmppath == NULL || xmlpath == NULL) {
 2110         MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**nomem",
 2111                                   "**nomem %s", "mv2_hwloc_topology_file");
 2112     }
 2113 
 2114     if (gethostname(hostname, sizeof (char) * HOSTNAME_LENGTH) < 0) {
 2115         MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**fail", "%s: %s",
 2116                                   "gethostname", strerror(errno));
 2117     }
 2118     sprintf(tmppath, "/tmp/mv2-hwloc-%s-%s-%d.tmp", kvsname, hostname, uid);
 2119     sprintf(xmlpath, "/tmp/mv2-hwloc-%s-%s-%d.xml", kvsname, hostname, uid);
 2120 
 2121     /* Local Rank 0 broadcasts topology using xml */
 2122     if (0 == my_local_id) {
 2123         mpi_errno = hwloc_topology_load(topology);
 2124         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 2125 
 2126         mpi_errno = hwloc_topology_export_xml(topology, tmppath);
 2127         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 2128 
 2129         if(rename(tmppath, xmlpath) < 0) {
 2130             MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**fail", "%s: %s",
 2131                                   "rename", strerror(errno));
 2132         }
 2133     } else {
 2134         while(access(xmlpath, F_OK) == -1) {
 2135             usleep(1000);
 2136         }
 2137         mpi_errno = hwloc_topology_set_xml(topology, xmlpath);
 2138         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 2139 
 2140         mpi_errno = hwloc_topology_load(topology);
 2141         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 2142     }
 2143 
 2144   fn_exit:
 2145     if (hostname) {
 2146         MPIU_Free(hostname);
 2147     }
 2148     if (tmppath) {
 2149         MPIU_Free(tmppath);
 2150     }
 2151     MPIDI_FUNC_EXIT(SMPI_LOAD_HWLOC_TOPOLOGY);
 2152     return mpi_errno;
 2153 
 2154   fn_fail:
 2155     goto fn_exit;
 2156 }
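/*
 * Illustrative example (hypothetical values, editorial addition): for
 * kvsname "kvs_0", hostname "node001", and uid 1000, local rank 0 loads the
 * topology, exports it to /tmp/mv2-hwloc-kvs_0-node001-1000.tmp, and renames
 * it to /tmp/mv2-hwloc-kvs_0-node001-1000.xml; the other local ranks poll
 * for the .xml file and load the topology from it instead of discovering
 * the hardware themselves.
 */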
 2157 
 2158 #undef FUNCNAME
 2159 #define FUNCNAME SMPI_UNLINK_HWLOC_TOPOLOGY_FILE
 2160 #undef FCNAME
 2161 #define FCNAME MPL_QUOTE(FUNCNAME)
 2162 int smpi_unlink_hwloc_topology_file(void)
 2163 {
 2164     int mpi_errno = MPI_SUCCESS;
 2165     MPIDI_STATE_DECL(SMPI_UNLINK_HWLOC_TOPOLOGY_FILE);
 2166     MPIDI_FUNC_ENTER(SMPI_UNLINK_HWLOC_TOPOLOGY_FILE);
 2167 
 2168     if (xmlpath) {
 2169         unlink(xmlpath);
 2170     }
 2171 
 2172     MPIDI_FUNC_EXIT(SMPI_UNLINK_HWLOC_TOPOLOGY_FILE);
 2173     return mpi_errno;
 2174 }
 2175 
 2176 #undef FUNCNAME
 2177 #define FUNCNAME SMPI_DESTROY_HWLOC_TOPOLOGY
 2178 #undef FCNAME
 2179 #define FCNAME MPL_QUOTE(FUNCNAME)
 2180 int smpi_destroy_hwloc_topology(void)
 2181 {
 2182     int mpi_errno = MPI_SUCCESS;
 2183     MPIDI_STATE_DECL(SMPI_DESTROY_HWLOC_TOPOLOGY);
 2184     MPIDI_FUNC_ENTER(SMPI_DESTROY_HWLOC_TOPOLOGY);
 2185 
 2186     if (topology) {
 2187         hwloc_topology_destroy(topology);
 2188         topology = NULL;
 2189     }
 2190 
 2191     if (xmlpath) {
 2192         MPIU_Free(xmlpath);
 2193     }
 2194 
 2195     MPIDI_FUNC_EXIT(SMPI_DESTROY_HWLOC_TOPOLOGY);
 2196     return mpi_errno;
 2197 }
 2198 
 2199 #undef FUNCNAME
 2200 #define FUNCNAME smpi_setaffinity
 2201 #undef FCNAME
 2202 #define FCNAME MPL_QUOTE(FUNCNAME)
 2203 int smpi_setaffinity(int my_local_id)
 2204 {
 2205     int selected_socket = 0;
 2206     int mpi_errno = MPI_SUCCESS;
 2207 
 2208     hwloc_cpuset_t cpuset;
 2209     MPIDI_STATE_DECL(MPID_STATE_SMPI_SETAFFINITY);
 2210     MPIDI_FUNC_ENTER(MPID_STATE_SMPI_SETAFFINITY);
 2211 
 2212 #if !defined(CHANNEL_MRAIL)
 2213     mv2_hca_aware_process_mapping = 0;
 2214 #endif
 2215 
 2216     PRINT_DEBUG(DEBUG_INIT_verbose>0, 
 2217             "my_local_id: %d, mv2_enable_affinity: %d, mv2_binding_level: %d, mv2_binding_policy: %d\n",
 2218             my_local_id, mv2_enable_affinity, mv2_binding_level, mv2_binding_policy);
 2219 
 2220     if (mv2_enable_affinity > 0) {
 2221         long N_CPUs_online = sysconf(_SC_NPROCESSORS_ONLN);
 2222 
 2223         if (N_CPUs_online < 1) {
 2224             MPIR_ERR_SETFATALANDJUMP2(mpi_errno,
 2225                                       MPI_ERR_OTHER,
 2226                                       "**fail", "%s: %s", "sysconf",
 2227                                       strerror(errno));
 2228         }
 2229 
 2230         mpi_errno = smpi_load_hwloc_topology();
 2231         if (mpi_errno != MPI_SUCCESS) {
 2232             MPIR_ERR_POP(mpi_errno);
 2233         }
 2234         cpuset = hwloc_bitmap_alloc();
 2235 
 2236         /* Call the cpu_mapping function to find out how the
 2237          * processors are numbered on the different sockets.
 2238          * The hardware information gathered from this function
 2239          * is required to determine the best set of intra-node thresholds.
 2240          * However, if the user has specified a mapping pattern, we will
 2241          * not use any of our proposed binding patterns.
 2242          */
 2243         mpi_errno = get_cpu_mapping_hwloc(N_CPUs_online, topology);
 2244         if (mpi_errno != MPI_SUCCESS) {
 2245             /* In case, we get an error from the hwloc mapping function */
 2246             mpi_errno = get_cpu_mapping(N_CPUs_online);
 2247         }
 2248 
 2249         if (s_cpu_mapping) {
 2250             /* If the user has specified how to map the processes, use it */
 2251             char tp_str[s_cpu_mapping_line_max + 1];
 2252 
 2253             mpi_errno = mv2_get_assigned_cpu_core(my_local_id, s_cpu_mapping,
 2254                                                     s_cpu_mapping_line_max, tp_str);
 2255             if (mpi_errno != 0) {
 2256                 fprintf(stderr, "Error parsing CPU mapping string\n");
 2257                 mv2_enable_affinity = 0;
 2258                 MPIU_Free(s_cpu_mapping);
 2259                 s_cpu_mapping = NULL;
 2260                 goto fn_fail;
 2261             }
 2262 
 2263             /* Parse the user-supplied mapping string */
 2264             char *token = tp_str;
 2265             int cpunum = 0;
 2266             while (*token != '\0') {
 2267                 if (isdigit(*token)) {
 2268                     cpunum = first_num_from_str(&token);
 2269                     if (cpunum >= N_CPUs_online) {
 2270                         fprintf(stderr,
 2271                                 "Warning! : Core id %d does not exist on this architecture! \n",
 2272                                 cpunum);
 2273                         fprintf(stderr, "CPU Affinity is undefined \n");
 2274                         mv2_enable_affinity = 0;
 2275                         MPIU_Free(s_cpu_mapping);
 2276                         goto fn_fail;
 2277                     }
 2278                     hwloc_bitmap_set(cpuset, cpunum);
 2279                     mv2_my_cpu_id = cpunum;
 2280                     PRINT_DEBUG(DEBUG_INIT_verbose>0, "Set mv2_my_cpu_id = %d\n", mv2_my_cpu_id);
 2281                 } else if (*token == ',') {
 2282                     token++;
 2283                 } else if (*token == '-') {
 2284                     token++;
 2285                     if (!isdigit(*token)) {
 2286                         fprintf(stderr,
 2287                                 "Warning! : Core id %c does not exist on this architecture! \n",
 2288                                 *token);
 2289                         fprintf(stderr, "CPU Affinity is undefined \n");
 2290                         mv2_enable_affinity = 0;
 2291                         MPIU_Free(s_cpu_mapping);
 2292                         goto fn_fail;
 2293                     } else {
 2294                         int cpuend = first_num_from_str(&token);
 2295                         if (cpuend >= N_CPUs_online || cpuend < cpunum) {
 2296                             fprintf(stderr,
 2297                                     "Warning! : Core id %d does not exist on this architecture! \n",
 2298                                     cpuend);
 2299                             fprintf(stderr, "CPU Affinity is undefined \n");
 2300                             mv2_enable_affinity = 0;
 2301                             MPIU_Free(s_cpu_mapping);
 2302                             goto fn_fail;
 2303                         }
 2304                         int cpuval;
 2305                         for (cpuval = cpunum + 1; cpuval <= cpuend; cpuval++)
 2306                             hwloc_bitmap_set(cpuset, cpuval);
 2307                     }
 2308                 } else if (*token != '\0') {
 2309                     fprintf(stderr,
 2310                             "Warning! Error parsing the given CPU mask! \n");
 2311                     fprintf(stderr, "CPU Affinity is undefined \n");
 2312                     mv2_enable_affinity = 0;
 2313                     MPIU_Free(s_cpu_mapping);
 2314                     goto fn_fail;
 2315                 }
 2316             }
 2317             /* Bind this process to the cpuset built above */
 2318             hwloc_set_cpubind(topology, cpuset, 0);
 2319 
 2320             MPIU_Free(s_cpu_mapping);
 2321             s_cpu_mapping = NULL;
 2322         } else {
 2323             /* The user has not specified how to map the processes,
 2324              * use the data available in /proc/cpuinfo file to decide
 2325              * on the best cpu mapping pattern
 2326              */
 2327             if (mpi_errno != MPI_SUCCESS || custom_cpu_mapping == NULL) {
 2328                 /* For some reason, we were not able to retrieve the cpu mapping
 2329                  * information. We are falling back on the linear mapping.
 2330                  * This may not deliver the best performance.
 2331                  */
 2332                 hwloc_bitmap_only(cpuset, my_local_id % N_CPUs_online);
 2333                 mv2_my_cpu_id = (my_local_id % N_CPUs_online);
 2334                 PRINT_DEBUG(DEBUG_INIT_verbose>0, "Set mv2_my_cpu_id = %d\n", mv2_my_cpu_id);
 2335                 hwloc_set_cpubind(topology, cpuset, 0);
 2336             } else {
 2337                 /*
 2338                  * We have all the information that we need. We will bind the
 2339                  * processes to the cpu's now
 2340                  */
 2341                 char tp_str[custom_cpu_mapping_line_max + 1];
 2342 
 2343                 mpi_errno = mv2_get_assigned_cpu_core(my_local_id, custom_cpu_mapping,
 2344                         custom_cpu_mapping_line_max, tp_str);
 2345                 if (mpi_errno != 0) {
 2346                     fprintf(stderr, "Error parsing CPU mapping string\n");
 2347                     mv2_enable_affinity = 0;
 2348                     goto fn_fail;
 2349                 }
 2350 
 2351                 int cores_per_socket = 0;
 2352 #if defined(CHANNEL_MRAIL)
 2353                 if (!SMP_ONLY && !mv2_user_defined_mapping) {
 2354                     char *value = NULL;
 2355                     if ((value = getenv("MV2_HCA_AWARE_PROCESS_MAPPING")) != NULL) {
 2356                         mv2_hca_aware_process_mapping = !!atoi(value);
 2357                     }
 2358                     if (likely(mv2_hca_aware_process_mapping)) {
 2359                         int num_cpus = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
 2360                         int depth_sockets = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
 2361                         int num_sockets = hwloc_get_nbobjs_by_depth(topology, depth_sockets);
 2362 
 2363                         selected_socket = mv2_get_cpu_core_closest_to_hca(my_local_id, num_cpus,
 2364                                 num_sockets, depth_sockets);
 2365                         if (selected_socket < 0) {
 2366                             fprintf(stderr, "Error getting closest socket\n");
 2367                             mv2_enable_affinity = 0;
 2368                             goto fn_fail;
 2369                         }
 2370                         cores_per_socket = num_cpus/num_sockets;
 2371                     }
 2372                 }
 2373 #endif /* defined(CHANNEL_MRAIL) */
 2374 
 2375                 if (mv2_binding_level == LEVEL_CORE) {
 2376                     if (
 2377 #if defined(CHANNEL_MRAIL)
 2378                         SMP_ONLY ||
 2379 #endif
 2380                         mv2_user_defined_mapping || !mv2_hca_aware_process_mapping
 2381                        )
 2382                     {
 2383                         hwloc_bitmap_only(cpuset, atol(tp_str));
 2384                         mv2_my_cpu_id = atol(tp_str);
 2385                         PRINT_DEBUG(DEBUG_INIT_verbose>0, "Set mv2_my_cpu_id = %d\n", mv2_my_cpu_id);
 2386                     } else {
 2387                         hwloc_bitmap_only(cpuset,
 2388                                 (atol(tp_str) % cores_per_socket)
 2389                                 + (selected_socket * cores_per_socket));
 2390                         mv2_my_cpu_id = ((atol(tp_str) % cores_per_socket)
 2391                                         + (selected_socket * cores_per_socket));
 2392                         PRINT_DEBUG(DEBUG_INIT_verbose>0, "Set mv2_my_cpu_id = %d\n", mv2_my_cpu_id);
 2393                     }
 2394                 } else {
 2395                     if (
 2396 #if defined(CHANNEL_MRAIL)
 2397                         SMP_ONLY ||
 2398 #endif
 2399                         mv2_user_defined_mapping || !mv2_hca_aware_process_mapping
 2400                         ) {
 2401                         hwloc_bitmap_from_ulong(cpuset, atol(tp_str));
 2402                     } else {
 2403                         hwloc_bitmap_from_ulong(cpuset,
 2404                                 (atol(tp_str) % cores_per_socket)
 2405                                 + (selected_socket * cores_per_socket));
 2406                     }
 2407                 }
 2408                 hwloc_set_cpubind(topology, cpuset, 0);
 2409             }
 2410 
 2411             MPIU_Free(custom_cpu_mapping);
 2412         }
 2413         /* Free cpuset */
 2414         hwloc_bitmap_free(cpuset);
 2415     }
 2416 
 2417   fn_exit:
 2418     MPIDI_FUNC_EXIT(MPID_STATE_SMPI_SETAFFINITY);
 2419     return mpi_errno;
 2420 
 2421   fn_fail:
 2422     goto fn_exit;
 2423 }
 2424 
 2425 #if defined(CHANNEL_MRAIL) || defined(CHANNEL_PSM)
 2426 void mv2_show_cpu_affinity(int verbosity)
 2427 {
 2428     int i = 0, j = 0, num_cpus = 0, my_rank = 0, pg_size = 0;
 2429     int mpi_errno = MPI_SUCCESS;
 2430     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
 2431     char *buf = NULL;
 2432     cpu_set_t *allproc_cpu_set = NULL;
 2433     MPID_Comm *comm_world = NULL;
 2434     MPIDI_VC_t *vc = NULL;
 2435 
 2436     comm_world = MPIR_Process.comm_world;
 2437     pg_size = comm_world->local_size;
 2438     my_rank = comm_world->rank;
 2439 
 2440     allproc_cpu_set = (cpu_set_t *) MPIU_Malloc(sizeof(cpu_set_t) * pg_size);
 2441     CPU_ZERO(&allproc_cpu_set[my_rank]);
 2442     sched_getaffinity(0, sizeof(cpu_set_t), &allproc_cpu_set[my_rank]);
 2443 
 2444     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, allproc_cpu_set,
 2445                                     sizeof(cpu_set_t), MPI_BYTE, comm_world, &errflag);
 2446     if (mpi_errno != MPI_SUCCESS) {
 2447         fprintf(stderr, "MPIR_Allgather_impl returned error");
 2448         return;
 2449     }
 2450     if (my_rank == 0) {
 2451         char *value;
 2452         value = getenv("OMP_NUM_THREADS");
 2453         num_cpus = sysconf(_SC_NPROCESSORS_CONF);
 2454         fprintf(stderr, "-------------CPU AFFINITY-------------\n");
 2455         fprintf(stderr, "OMP_NUM_THREADS           : %d\n",(value != NULL) ? atoi(value) : 0);
 2456         fprintf(stderr, "MV2_THREADS_PER_PROCESS   : %d\n",mv2_threads_per_proc);        
 2457         fprintf(stderr, "MV2_CPU_BINDING_POLICY    : %s\n",mv2_cpu_policy_names[mv2_binding_policy]);
 2458         /* hybrid binding policy is only applicable when mv2_binding_policy is hybrid */
 2459         if (mv2_binding_policy ==  POLICY_HYBRID) {
 2460             fprintf(stderr, "MV2_HYBRID_BINDING_POLICY : %s\n",
 2461                               mv2_hybrid_policy_names[mv2_hybrid_binding_policy]);
 2462         }
 2463         fprintf(stderr, "--------------------------------------\n");
 2464 
 2465         buf = (char *) MPIU_Malloc(sizeof(char) * 6 * num_cpus);
 2466         for (i = 0; i < pg_size; i++) {
 2467             MPIDI_Comm_get_vc(comm_world, i, &vc);
 2468             if (vc->smp.local_rank != -1 || verbosity > 1) {
 2469                 MPIU_Memset(buf, 0, sizeof(char) * 6 * num_cpus);
 2470                 for (j = 0; j < num_cpus; j++) {
 2471                     if (CPU_ISSET(j, &allproc_cpu_set[vc->pg_rank])) {
 2472                         sprintf((char *) (buf + strlen(buf)), "%4d", j);
 2473                     }
 2474                 }
 2475                 fprintf(stderr, "RANK:%2d  CPU_SET: %s\n", i, buf);
 2476             }
 2477         }
 2478         fprintf(stderr, "-------------------------------------\n");
 2479         MPIU_Free(buf);
 2480     }
 2481     MPIU_Free(allproc_cpu_set);
 2482 }
 2483 #endif /* defined(CHANNEL_MRAIL) || defined(CHANNEL_PSM) */
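/*
 * Illustrative output (hypothetical values, editorial addition): with two
 * local ranks bound to cores {0,1} and {2,3}, rank 0 would print lines of
 * the form:
 *     RANK: 0  CPU_SET:    0   1
 *     RANK: 1  CPU_SET:    2   3
 */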
 2484 
 2485 #if defined(CHANNEL_MRAIL)
 2486 int mv2_show_hca_affinity(int verbosity)
 2487 {
 2488     int pg_size = 0;
 2489     int my_rank = 0;
 2490     int i = 0, j = 0, k = 0;
 2491     int mpi_errno = MPI_SUCCESS;
 2492     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
 2493 
 2494     struct ibv_device **hcas = NULL;
 2495 
 2496     char *hca_names = NULL; 
 2497     char *all_hca_names = NULL;
 2498     
 2499     MPIDI_VC_t *vc = NULL;
 2500     MPID_Comm *comm_world = NULL;
 2501 
 2502     comm_world = MPIR_Process.comm_world;
 2503     pg_size = comm_world->local_size;
 2504     my_rank = comm_world->rank;
 2505 
 2506     hcas = mv2_MPIDI_CH3I_RDMA_Process.ib_dev;
 2507     
 2508     hca_names = (char *) MPIU_Malloc(MAX_NUM_HCAS * (IBV_SYSFS_NAME_MAX+1) 
 2509                                     * sizeof(char));
 2510     k = 0; 
 2511     for(i=0; i < rdma_num_hcas; i++) {
 2512         if (i > 0) {
 2513             strcat(hca_names, " ");
 2514             strcat(hca_names, hcas[i]->name);
 2515         } else {
 2516             strcpy(hca_names, hcas[i]->name);
 2517         }
 2518         PRINT_DEBUG(DEBUG_INIT_verbose>0, "Adding hcas[%d]->name = %s\n", i, hcas[i]->name);
 2519     }
 2520     strcat(hca_names, ";");
 2521 
 2522     if(my_rank == 0) {
 2523         all_hca_names = (char *) MPIU_Malloc(strlen(hca_names) * pg_size);
 2524     }
 2525 
 2526     PRINT_DEBUG(DEBUG_INIT_verbose>0, "hca_names = %s, strlen(hca_names) = %ld\n", hca_names, strlen(hca_names));
 2527     mpi_errno = MPIR_Gather_impl(hca_names, strlen(hca_names), MPI_CHAR, 
 2528                     all_hca_names, strlen(hca_names), MPI_CHAR, 0, 
 2529                     comm_world, &errflag);
 2530 
 2531     if (mpi_errno != MPI_SUCCESS) {
 2532         fprintf(stderr, "MPIR_Gather_impl returned error: %d", mpi_errno);
 2533         return mpi_errno;
 2534     }
 2535     if(my_rank == 0 && all_hca_names != NULL) {
 2536         fprintf(stderr, "-------------HCA AFFINITY-------------\n");
 2537         j = 0;
 2538     
 2539         char *buffer = MPIU_Malloc(MAX_NUM_HCAS * (IBV_SYSFS_NAME_MAX+1) * sizeof(char));
 2540         for(i = 0; i < pg_size; i++) {
 2541             MPIDI_Comm_get_vc(comm_world, i, &vc);
 2542             if (vc->smp.local_rank != -1 || verbosity > 1) {
 2543                 k = 0;
 2544                 MPIU_Memset(buffer, 0, MAX_NUM_HCAS * (IBV_SYSFS_NAME_MAX+1) * sizeof(char));
 2545                 fprintf(stderr, "Process: %d HCAs: ", i);
 2546                 while(all_hca_names[j] != ';') {
 2547                     buffer[k] = all_hca_names[j];
 2548                     j++;
 2549                     k++;
 2550                 }
 2551                 buffer[k] = '\0';
 2552                 j++;
 2553                 fprintf(stderr, "%s\n", buffer);
 2554             }
 2555         }
 2556         MPIU_Free(buffer);
 2557         
 2558         fprintf(stderr, "-------------------------------------\n");
 2559         MPIU_Free(all_hca_names);
 2560     }
 2561     MPIU_Free(hca_names);
 2562     return mpi_errno;
 2563 }
 2564 #endif /* defined(CHANNEL_MRAIL) */
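/*
 * Illustrative note (hypothetical device names, editorial addition): each
 * rank contributes a string such as "mlx5_0 mlx5_1;", so after the gather
 * rank 0 walks the concatenated, ';'-delimited buffer and prints one
 * "Process: <rank> HCAs: <names>" line per local process.
 */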
 2565 
 2566 
 2567 /* helper function to get PU ids of a given socket */
 2568 void mv2_get_pu_list_on_socket (hwloc_topology_t topology, hwloc_obj_t obj, 
 2569                     int depth, int *pu_ids, int *idx) {
 2570     int i;
 2571     if (obj->type == HWLOC_OBJ_PU) {
 2572         pu_ids[*idx] = obj->os_index;
 2573        *idx = *idx + 1;
 2574         return;
 2575     }
 2576 
 2577     for (i = 0; i < obj->arity; i++) {
 2578         mv2_get_pu_list_on_socket (topology, obj->children[i], depth+1, pu_ids, idx);
 2579     }
 2580 
 2581     return;
 2582 }
 2583 
 2584 void get_pu_list_on_numanode (hwloc_topology_t topology, hwloc_obj_t obj, int depth, 
 2585                     int *pu_ids, int *idx) {
 2586     int i;
 2587     if (obj->type == HWLOC_OBJ_PU) {
 2588         pu_ids[*idx] = obj->os_index;
 2589         *idx = *idx + 1;
 2590         return;
 2591     }
 2592 
 2593     for (i = 0; i < obj->arity; i++) {
 2594         get_pu_list_on_numanode (topology, obj->children[i], depth+1, pu_ids, idx);
 2595     }
 2596 
 2597     return;
 2598 }
 2599 
 2600 
 2601 
 2602 #undef FUNCNAME
 2603 #define FUNCNAME mv2_generate_implicit_cpu_mapping
 2604 #undef FCNAME
 2605 #define FCNAME MPL_QUOTE(FUNCNAME)
 2606 static int mv2_generate_implicit_cpu_mapping (int local_procs, int num_app_threads) {
 2607     
 2608     hwloc_obj_t obj;
 2609 
 2610     int i, j, k, l, curr, count, chunk, size, scanned, step, node_offset, node_base_pu;
 2611     int topodepth, num_physical_cores_per_socket, num_pu_per_socket;
 2612     int num_numanodes, num_pu_per_numanode;
 2613     char mapping [s_cpu_mapping_line_max];
 2614     
 2615     i = j = k = l = curr = count = chunk = size = scanned = step = node_offset = node_base_pu = 0;
 2616     count = mv2_pivot_core_id;
 2617     
 2618     /* call optimized topology load */
 2619     smpi_load_hwloc_topology ();
 2620 
 2621     num_sockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET);
 2622     num_numanodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE);
 2623 
 2624     num_physical_cores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
 2625     num_pu = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
 2626 
 2627     num_physical_cores_per_socket = num_physical_cores / num_sockets;
 2628     num_pu_per_socket = num_pu / num_sockets;
 2629     num_pu_per_numanode = num_pu / num_numanodes;
 2630 
 2631     topodepth = hwloc_get_type_depth (topology, HWLOC_OBJ_CORE);
 2632     obj = hwloc_get_obj_by_depth (topology, topodepth, 0); /* check on core 0*/
 2633 
 2634     hw_threads_per_core = hwloc_bitmap_weight (obj->allowed_cpuset);
 2635     
 2636     mv2_core_map = MPIU_Malloc(sizeof(int) * num_pu);
 2637     mv2_core_map_per_numa = MPIU_Malloc(sizeof(int) * num_pu);
 2638 
 2639     /* generate core map of the system by scanning the hwloc tree and save it 
 2640      *  in mv2_core_map array. All the policies below are core_map aware now */
 2641     topodepth = hwloc_get_type_depth (topology, HWLOC_OBJ_SOCKET);
 2642     for (i = 0; i < num_sockets; i++) {
 2643         obj = hwloc_get_obj_by_depth (topology, topodepth, i);
 2644         mv2_get_pu_list_on_socket (topology, obj, topodepth, mv2_core_map, &scanned);
 2645     } 
 2646     
 2647     size = scanned;
 2648         
 2649 
 2650     /* generate core map of the system based on NUMA domains by scanning the hwloc
 2651      * tree and save it in the mv2_core_map_per_numa array. NUMA-based policies are now
 2652      * map-aware */
 2653     scanned = 0;
 2654     for (i = 0; i < num_numanodes; i++) {
 2655         obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
 2656         get_pu_list_on_numanode (topology, obj, topodepth, mv2_core_map_per_numa, &scanned);
 2657     }
 2658 
 2659     /* make sure total PUs are same when we scanned the machine w.r.t sockets and NUMA */
 2660     MPIU_Assert(size == scanned);
 2661 
 2662     if (mv2_hybrid_binding_policy == HYBRID_COMPACT) {
 2663         /* Compact mapping: Bind each MPI rank to a single physical core, and bind
 2664          * its associated threads to the hardware threads of the same physical core.
 2665          * Use the first socket followed by the second socket */
 2666         if (num_app_threads > hw_threads_per_core) {
 2667             PRINT_INFO((MPIDI_Process.my_pg_rank == 0), "WARNING: COMPACT mapping is "
 2668                "only meant for hardware multi-threaded (hyper-threaded) processors. "
 2669                "We have detected that your processor does not have hyper-threading "
 2670                "enabled. Note that proceeding with this option on current system will cause "
 2671                "over-subscription, hence leading to severe performance degradation. "
 2672                "We recommend using LINEAR or SPREAD policy for this run.\n");
 2673         }
 2674         
 2675         for (i = 0; i < local_procs; i++) {
 2676             curr = count;
 2677             for (k = 0; k < num_app_threads; k++) {
 2678                 j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d,", mv2_core_map[curr]);
 2679                 curr = (curr + 1) % num_pu;
 2680             }
 2681             mapping [--j] = '\0'; 
 2682             j += snprintf (mapping+j, _POSIX2_LINE_MAX, ":");
 2683             count = (count + hw_threads_per_core) % num_pu;
 2684         }
 2685     } else if (mv2_hybrid_binding_policy == HYBRID_LINEAR) {
 2686         /* Linear mapping: Bind each MPI rank as well as its associated threads to
 2687          * physical cores. Only use hardware threads when we run out of physical
 2688          * resources */
 2689         for (i = 0; i < local_procs; i++) {
 2690             for (k = 0; k < num_app_threads; k++) {
 2691                 j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d,", mv2_core_map[curr]);
 2692 
 2693                 curr = ((curr + hw_threads_per_core) >= num_pu) ?
 2694                             ((curr + hw_threads_per_core+ ++step) % num_pu) :
 2695                             (curr + hw_threads_per_core) % num_pu;
 2696             }
 2697             mapping [--j] = '\0';
 2698             j += snprintf (mapping+j, _POSIX2_LINE_MAX, ":");
 2699         }    
 2700     } else if (mv2_hybrid_binding_policy == HYBRID_SPREAD) {
 2701         /* Spread mapping: Evenly distributes all the PUs among MPI ranks and
 2702          * ensures that no two MPI ranks get bound to the same physical core. */
 2703         if (num_physical_cores < local_procs) {
 2704             PRINT_INFO((MPIDI_Process.my_pg_rank == 0), "WARNING: This configuration "
 2705                         "might lead to oversubscription of cores !!!\n");
 2706             /* limit the mapping to max available PUs */
 2707             num_physical_cores = num_pu;
 2708         }
 2709         chunk = num_physical_cores / local_procs;
 2710         for (i = 0; i < local_procs; i++) {
 2711              for (k = curr; k < curr+chunk; k++) {
 2712                  for (l = 0; l < hw_threads_per_core; l++) {
 2713                     j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d,", 
 2714                             mv2_core_map[k * hw_threads_per_core + l]);
 2715                  }
 2716              }
 2717              mapping [--j] = '\0';
 2718              j += snprintf (mapping+j, _POSIX2_LINE_MAX, ":");
 2719              curr = (curr + chunk) % size;
 2720         } 
 2721     } else if (mv2_hybrid_binding_policy == HYBRID_BUNCH) {
 2722         /* Bunch mapping: Bind each MPI rank to a single physical core of the first
 2723          * socket followed by the second socket */
 2724         for (i = 0; i < local_procs; i++) {
 2725             j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d:", mv2_core_map[k]);
 2726             k = (k + hw_threads_per_core) % size;
 2727         } 
 2728     } else if (mv2_hybrid_binding_policy == HYBRID_SCATTER) {
 2729         /* scatter mapping: Bind consecutive MPI ranks to different sockets in
 2730          * round-robin fashion */
 2731         if (num_sockets < 2) {
 2732             PRINT_INFO((MPIDI_Process.my_pg_rank == 0), "WARNING: Scatter is not a valid policy "
 2733                     "for single-socket systems. Please re-run with Bunch or any other "
 2734                     "applicable policy\n");
 2735             return MPI_ERR_OTHER;
 2736         }
 2737         for (i = 0; i < local_procs; i++) {
 2738             j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d:", mv2_core_map[k]);
 2739             k = (i % num_sockets == 0) ?
 2740                     (k + num_pu_per_socket) % size :
 2741                     (k + num_pu_per_socket + hw_threads_per_core) % size;
 2742         }
 2743     } else if (mv2_hybrid_binding_policy == HYBRID_NUMA) {
 2744         /* NUMA mapping: Bind consecutive MPI ranks to different NUMA domains in
 2745          * round-robin fashion. */
 2746         for (i = 0; i < local_procs; i++) {
 2747             j += snprintf (mapping+j, _POSIX2_LINE_MAX, "%d,", 
 2748                                mv2_core_map_per_numa[node_base_pu+node_offset]);
 2749             mapping [--j] = '\0';
 2750             j += snprintf (mapping+j, _POSIX2_LINE_MAX, ":");
 2751             node_base_pu = (node_base_pu + num_pu_per_numanode) % size;
 2752             node_offset = (node_base_pu == 0) ? 
 2753                             (node_offset + ((hw_threads_per_core > 0) ? hw_threads_per_core : 1)) : 
 2754                             node_offset;
 2755         }
 2756     }
 2757 
 2758     /* copy the generated mapping string to final mapping*/
 2759     s_cpu_mapping = (char *) MPIU_Malloc (sizeof (char) * j);
 2760     strncpy (s_cpu_mapping, mapping, j);
 2761     s_cpu_mapping[j-1] = '\0';
 2762 
 2763     if (MPIDI_Process.my_pg_rank == 0) {
 2764         PRINT_DEBUG(DEBUG_INIT_verbose>0, "num_physical_cores_per_socket %d, mapping: %s", 
 2765                 num_physical_cores_per_socket, s_cpu_mapping);
 2766     }
 2767     
 2768     /* cleanup */
 2769     MPIU_Free(mv2_core_map);
 2770     MPIU_Free(mv2_core_map_per_numa);
 2771      
 2772     return MPI_SUCCESS;
 2773 }
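/*
 * Worked example (hypothetical machine, editorial addition): with 4 local
 * ranks, 2 threads per rank, and 2 hardware threads per core, the policies
 * above would emit mapping strings of the form (entries are indices into
 * mv2_core_map):
 *   COMPACT: map[0],map[1]:map[2],map[3]:map[4],map[5]:map[6],map[7]
 *   LINEAR : map[0],map[2]:map[4],map[6]:map[8],map[10]:map[12],map[14]
 *   BUNCH  : map[0]:map[2]:map[4]:map[6]
 * The resulting string is then consumed exactly like a user-supplied
 * MV2_CPU_MAPPING value.
 */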
 2774 
 2775 #undef FUNCNAME
 2776 #define FUNCNAME MPIDI_CH3I_set_affinity
 2777 #undef FCNAME
 2778 #define FCNAME MPL_QUOTE(FUNCNAME)
 2779 int MPIDI_CH3I_set_affinity(MPIDI_PG_t * pg, int pg_rank)
 2780 {
 2781     char *value;
 2782     int mpi_errno = MPI_SUCCESS;
 2783     int my_local_id;
 2784     int num_local_procs;
 2785     long N_CPUs_online;
 2786     mv2_arch_type arch_type;
 2787 
 2788     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SET_AFFINITY);
 2789     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SET_AFFINITY);
 2790 
 2791     num_local_procs = MPIDI_Num_local_processes (pg);
 2792     
 2793     N_CPUs_online = sysconf(_SC_NPROCESSORS_ONLN);
 2794 
 2795     if ((value = getenv("MV2_ENABLE_AFFINITY")) != NULL) {
 2796         mv2_enable_affinity = atoi(value);
 2797     }
 2798 
 2799     arch_type = mv2_get_arch_type ();
 2800     /* set CPU_BINDING_POLICY=hybrid for Power, Skylake, and KNL */
 2801     if (arch_type == MV2_ARCH_IBM_POWER8 ||
 2802         arch_type == MV2_ARCH_IBM_POWER9 ||
 2803         arch_type == MV2_ARCH_INTEL_XEON_PHI_7250 ||
 2804         arch_type == MV2_ARCH_INTEL_PLATINUM_8170_2S_52 ||
 2805         arch_type == MV2_ARCH_INTEL_PLATINUM_8160_2S_48 ||
 2806         arch_type == MV2_ARCH_AMD_EPYC_7551_64 /* EPYC */ ||
 2807         arch_type == MV2_ARCH_AMD_EPYC_7742_128 /* rome */) {
 2808         setenv ("MV2_CPU_BINDING_POLICY", "hybrid", 0);
 2809         
 2810         /* if the CPU is EPYC, additionally default hybrid_binding_policy to NUMA */
 2811         if (arch_type == MV2_ARCH_AMD_EPYC_7551_64 ||
 2812             arch_type == MV2_ARCH_AMD_EPYC_7742_128 /* rome */) {
 2813             setenv ("MV2_HYBRID_BINDING_POLICY", "numa", 0);
 2814         } 
 2815     }
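          /* setenv() is called with overwrite = 0, so these are defaults only:
           * values the user has already exported for MV2_CPU_BINDING_POLICY or
           * MV2_HYBRID_BINDING_POLICY still take effect. */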
 2816 
 2817     if (mv2_enable_affinity && (num_local_procs > N_CPUs_online)) {
 2818         if (MPIDI_Process.my_pg_rank == 0) {
 2819             PRINT_ERROR ("WARNING: You are running %d MPI processes on a processor "
 2820                             "that supports up to %ld cores. If you still wish to run "
 2821                             "in oversubscribed mode, please set MV2_ENABLE_AFFINITY=0 "
 2822                             "and re-run the program.\n\n", 
 2823                             num_local_procs, N_CPUs_online);
 2824 
 2825             MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2826                     "**fail", "**fail %s",
 2827                     "MV2_ENABLE_AFFINITY: oversubscribed cores.");
 2828         }
 2829         goto fn_fail;
 2830     }
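          /* Only rank 0 prints the warning and records the fatal error;
           * every rank skips the remaining binding setup via fn_fail. */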
 2831 
 2832     if (mv2_enable_affinity && (value = getenv("MV2_CPU_MAPPING")) != NULL) {
 2833         /* Affinity is on and the user has supplied a cpu mapping string */
 2834         int linelen = strlen(value);
 2835         if (linelen < s_cpu_mapping_line_max) {
 2836             s_cpu_mapping_line_max = linelen;
 2837         }
 2838         s_cpu_mapping =
 2839             (char *) MPIU_Malloc(sizeof(char) * (s_cpu_mapping_line_max + 1));
 2840         strncpy(s_cpu_mapping, value, s_cpu_mapping_line_max);
 2841         s_cpu_mapping[s_cpu_mapping_line_max] = '\0';
 2842         mv2_user_defined_mapping = TRUE;
 2843     }
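          /* The user-supplied string is copied verbatim into s_cpu_mapping.
           * It is expected in the same colon-separated per-rank form that the
           * implicit generator above produces, e.g. MV2_CPU_MAPPING=0:1:2:3
           * to place local ranks 0-3 on CPUs 0-3 (example values only). */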
 2844 
 2845     if (mv2_enable_affinity && (value = getenv("MV2_CPU_MAPPING")) == NULL) {
 2846         /* Affinity is on and the user has not specified a mapping string */
 2847         if ((value = getenv("MV2_CPU_BINDING_POLICY")) != NULL) {
 2848             /* User has specified a binding policy */
 2849             if (!strcmp(value, "bunch") || !strcmp(value, "BUNCH")) {
 2850                 mv2_binding_policy = POLICY_BUNCH;
 2851             } else if (!strcmp(value, "scatter") || !strcmp(value, "SCATTER")) {
 2852                 mv2_binding_policy = POLICY_SCATTER;
 2853             } else if (!strcmp(value, "hybrid") || !strcmp(value, "HYBRID")) {
 2854                 mv2_binding_policy = POLICY_HYBRID;
 2855                /* check if OMP_NUM_THREADS is exported or the user has
 2856                 * explicitly set the MV2_THREADS_PER_PROCESS variable */
 2857                if ((value = getenv("OMP_NUM_THREADS")) != NULL) {
 2858                    mv2_threads_per_proc = atoi (value);
 2859                    if (mv2_threads_per_proc < 0) {
 2860                        if (MPIDI_Process.my_pg_rank == 0) {
 2861                            PRINT_ERROR ("OMP_NUM_THREADS: value can not be set to negative.\n");
 2862                            MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2863                                    "**fail", "**fail %s",
 2864                                    "OMP_NUM_THREADS: negative value.");
 2865                        }
 2866                    }
 2867                }
 2868 
 2869                if ((value = getenv("MV2_THREADS_PER_PROCESS")) != NULL) {
 2870                    mv2_threads_per_proc = atoi (value);
 2871                    if (mv2_threads_per_proc < 0) {
 2872                        if (MPIDI_Process.my_pg_rank == 0) {
 2873                            PRINT_ERROR ("MV2_THREADS_PER_PROCESS: "
 2874                                    "value can not be set to negative.\n");
 2875                            MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2876                                    "**fail", "**fail %s",
 2877                                    "MV2_THREADS_PER_PROCESS: negative value.");
 2878                        }
 2879                    }
 2880                }
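                     /* MV2_THREADS_PER_PROCESS is read after OMP_NUM_THREADS,
                      * so it takes precedence when both are set. */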
 2881 
 2882                if (mv2_threads_per_proc > 0) {
 2883                    if ( (mv2_threads_per_proc * num_local_procs) > N_CPUs_online) {
 2884                        if (MPIDI_Process.my_pg_rank == 0) {
 2885                            PRINT_ERROR ("User defined values for MV2_CPU_BINDING_POLICY and "
 2886                                    "MV2_THREADS_PER_PROCESS will lead to oversubscription of "
 2887                                    "the available CPUs. If this was intentional, please "
 2888                                    "re-run the application after setting MV2_ENABLE_AFFINITY=0 or "
 2889                                    "with explicit CPU mapping using MV2_CPU_MAPPING.\n"); 
 2890                            MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2891                                    "**fail", "**fail %s",
 2892                                    "CPU_BINDING_PRIMITIVE: over-subscribed hybrid configuration.");
 2893                        }
 2894                    } 
 2895                     
 2896                    /* Check to see if any pivot core is designated */
 2897                    if ((value = getenv("MV2_PIVOT_CORE_ID")) != NULL) {
 2898                        mv2_pivot_core_id = atoi(value);
 2899                    }
 2900                    
 2901                    /* since mv2_threads_per_proc > 0, check whether a thread
 2902                     * binding policy has been explicitly specified */
 2903                    if ((value = getenv("MV2_HYBRID_BINDING_POLICY")) != NULL) {
 2904                        if (!strcmp(value, "linear") || !strcmp(value, "LINEAR")) {
 2905                            mv2_hybrid_binding_policy = HYBRID_LINEAR;
 2906                        } else if (!strcmp(value, "compact") || !strcmp(value, "COMPACT")) {
 2907                            mv2_hybrid_binding_policy = HYBRID_COMPACT;
 2908                        } else if (!strcmp(value, "spread") || !strcmp(value, "SPREAD")) {
 2909                            mv2_hybrid_binding_policy = HYBRID_SPREAD;
 2910                        } else if (!strcmp(value, "bunch") || !strcmp(value, "BUNCH")) {
 2911                            mv2_hybrid_binding_policy = HYBRID_BUNCH;
 2912                        } else if (!strcmp(value, "scatter") || !strcmp(value, "SCATTER")) {
 2913                            mv2_hybrid_binding_policy = HYBRID_SCATTER;
 2914                        } else if (!strcmp(value, "numa") || !strcmp(value, "NUMA")) {
 2915                            /* we only use NUMA binding if we have more than 2 ppn,
 2916                             * otherwise we fall back to linear mapping */
 2917                            mv2_hybrid_binding_policy =
 2918                                (num_local_procs > 2) ?  HYBRID_NUMA : HYBRID_LINEAR;
 2919                        }
 2920                    }
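                         /* Unrecognized MV2_HYBRID_BINDING_POLICY values are
                          * silently ignored and the default (linear) mapping
                          * is kept. */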
 2921 
 2922                    mv2_binding_level = LEVEL_MULTIPLE_CORES;
 2923                
 2924                } else {
 2925                        PRINT_INFO((MPIDI_Process.my_pg_rank == 0), "WARNING: Process mapping "
 2926                                "mode has been set to 'hybrid' "
 2927                                "indicating an attempt to run a multi-threaded program. However, "
 2928                                "neither the MV2_THREADS_PER_PROCESS nor OMP_NUM_THREADS have been "
 2929                                "set. Please set either one of these variable to the number threads "
 2930                                "desired per process for optimal performance\n");
 2931                                
 2932                }
 2933             } else {
 2934                 PRINT_INFO((MPIDI_Process.my_pg_rank == 0),
 2935                             "MV2_CPU_BINDING_POLICY should be "
 2936                             "bunch, scatter or hybrid (upper or lower case).\n");
 2937                 MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2938                             "**fail", "**fail %s",
 2939                             "CPU_BINDING_PRIMITIVE: Policy should be bunch, scatter or hybrid.");
 2940             }
 2941             mv2_user_defined_mapping = TRUE;
 2942         } else {
 2943             /* User has not specified a binding policy.
 2944              * We are going to do "hybrid-bunch" binding, by default  */
 2945             mv2_binding_policy = POLICY_HYBRID;
 2946         }
 2947     }
 2948 
 2949     /* generate implicit mapping string based on hybrid binding policy */
 2950     if (mv2_binding_policy == POLICY_HYBRID) {
 2951         mpi_errno = mv2_generate_implicit_cpu_mapping (num_local_procs, 
 2952                mv2_threads_per_proc);
 2953         if (mpi_errno != MPI_SUCCESS) {
 2954            goto fn_fail;
 2955         }
 2956     }
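          /* The implicit mapping generated here is written into s_cpu_mapping,
           * the same buffer that holds an explicit MV2_CPU_MAPPING string. */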
 2957 
 2958     if (mv2_enable_affinity && (value = getenv("MV2_CPU_MAPPING")) == NULL) {
 2959         /* Affinity is on and the user has not specified a mapping string */
 2960         if ((value = getenv("MV2_CPU_BINDING_LEVEL")) != NULL) {
 2961             /* User has specified a binding level */
 2962             if (!strcmp(value, "core") || !strcmp(value, "CORE")) {
 2963                 mv2_binding_level = LEVEL_CORE;
 2964             } else if (!strcmp(value, "socket") || !strcmp(value, "SOCKET")) {
 2965                 mv2_binding_level = LEVEL_SOCKET;
 2966             } else if (!strcmp(value, "numanode") || !strcmp(value, "NUMANODE")) {
 2967                 mv2_binding_level = LEVEL_NUMANODE;
 2968             } else {
 2969                 MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
 2970                     "**fail", "**fail %s",
 2971                     "CPU_BINDING_PRIMITIVE: Level should be core, socket, or numanode.");
 2972             }
 2973             if (MV2_ARCH_INTEL_XEON_PHI_7250 == arch_type &&
 2974                     mv2_binding_level != LEVEL_CORE) {
 2975                 if (MPIDI_Process.my_pg_rank == 0) {
 2976                     fprintf(stderr, "CPU_BINDING_PRIMITIVE: Only core level binding supported for this architecture.\n");
 2977                 }
 2978                 mpi_errno = MPI_ERR_OTHER;
 2979                 goto fn_fail;
 2980             }
 2981             mv2_user_defined_mapping = TRUE;
 2982         } else {
 2983             /* User has not specified a binding level and we've not
 2984              * assigned LEVEL_MULTIPLE_CORES earlier. We are going to
 2985              * do "core" binding, by default  */
 2986             if (mv2_binding_level != LEVEL_MULTIPLE_CORES) {
 2987                 mv2_binding_level = LEVEL_CORE;
 2988             }
 2989         }
 2990     }
 2991 
 2992     if (mv2_enable_affinity) {
 2993         my_local_id = pg->ch.local_process_id;
 2994         mpi_errno = smpi_setaffinity(my_local_id);
 2995         if (mpi_errno != MPI_SUCCESS) {
 2996             MPIR_ERR_POP(mpi_errno);
 2997         }
 2998     }
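          /* smpi_setaffinity() performs the actual CPU binding for this local
           * rank according to the policy, level, and mapping resolved above. */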
 2999   fn_exit:
 3000     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SET_AFFINITY);
 3001     return mpi_errno;
 3002   fn_fail:
 3003     goto fn_exit;
 3004 }