"Fossies" - the Fresh Open Source Software Archive

Member "mvapich2-2.3.2/src/mpid/ch3/channels/psm/src/ch3_win_fns.c" (8 Aug 2019, 38651 Bytes) of package /linux/misc/mvapich2-2.3.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ch3_win_fns.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.3.1_vs_2.3.2.

    1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
    2 /*
    3  * Copyright (c) 2001-2019, The Ohio State University. All rights
    4  * reserved.
    5  *
    6  * This file is part of the MVAPICH2 software package developed by the
    7  * team members of The Ohio State University's Network-Based Computing
    8  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
    9  *
   10  * For detailed copyright and licensing information, please refer to the
   11  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
   12  */
   13 
   14 /*
   15  *  (C) 2001 by Argonne National Laboratory.
   16  *      See COPYRIGHT in top-level directory.
   17  */
   18 
   19 #include "mpidimpl.h"
   20 #include "mpiinfo.h"
   21 #include "mpidrma.h"
   22 #include "mpiu_os_wrappers_pre.h"
   23 #include "mpiu_shm_wrappers.h"
   24 #include "coll_shmem.h"
   25 #include "mpidi_ch3_impl.h"
   26 
   27 #include "bcast_tuning.h"
   28 
   29 #undef FUNCNAME
   30 
   31 /* FIXME: get this from OS */
   32 #define MPIDI_CH3_PAGESIZE ((MPI_Aint)4096)
   33 #define MPIDI_CH3_PAGESIZE_MASK (~(MPIDI_CH3_PAGESIZE-1))
   34 #define MPIDI_CH3_ROUND_UP_PAGESIZE(x) ((((MPI_Aint)x)+(~MPIDI_CH3_PAGESIZE_MASK)) & MPIDI_CH3_PAGESIZE_MASK)
   35 
/* Timer performance variable for the window-creation allgather; the EXTERN
 * form of the macro suggests it is defined in another translation unit
 * (NOTE(review): confirm where the definition lives). */
MPIR_T_PVAR_DOUBLE_TIMER_DECL_EXTERN(RMA, rma_wincreate_allgather);

/* List of shared-memory windows known to this process.  In this file it is
 * walked by MPIDI_CH3I_SHM_Wins_match and windows are unlinked from it in
 * MPIDI_CH3_SHM_Win_free. */
MPIDI_SHM_Wins_list_t shm_wins_list;

/* Forward declarations of the file-local window routines.  They are installed
 * into the CH3 function/hook tables by MPIDI_CH3_Win_fns_init and
 * MPIDI_CH3_Win_hooks_init. */

/* Window hook: resets the shared-memory related fields of a new window. */
static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
                               MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr);

/* Window function: allocates a window backed by a shared-memory segment. */
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
                                       MPID_Comm * comm_ptr, void *base_ptr, MPID_Win ** win_ptr);

/* Window function: checks whether a window's memory lies inside an existing
 * shared-memory window. */
static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr);

/* Window function: exchanges base/size/disp_unit/handle between all ranks. */
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
                                      MPID_Comm * comm_ptr, MPID_Win ** win_ptr);
   50 
   51 #undef FUNCNAME
   52 #define FUNCNAME MPIDI_CH3_Win_shared_query
   53 #undef FCNAME
   54 #define FCNAME MPL_QUOTE(FUNCNAME)
   55 int MPIDI_CH3_SHM_Win_shared_query(MPID_Win * win_ptr, int target_rank, MPI_Aint * size,
   56                                    int *disp_unit, void *baseptr)
   57 {
   58     int comm_size;
   59     int mpi_errno = MPI_SUCCESS;
   60     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);
   61 
   62     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);
   63 
   64     comm_size = win_ptr->comm_ptr->local_size;
   65 
   66     if (FALSE == win_ptr->shm_allocated || comm_size <= 1) {
   67         mpi_errno = MPIDI_CH3U_Win_shared_query(win_ptr, target_rank, size, disp_unit, baseptr);
   68         if (mpi_errno != MPI_SUCCESS) {
   69             MPIR_ERR_POP(mpi_errno);
   70         }
   71         goto fn_exit;
   72     }
   73 
   74     /* Scan the sizes to locate the first process that allocated a nonzero
   75      * amount of space */
   76     if (target_rank == MPI_PROC_NULL) {
   77         int i;
   78 
   79         /* Default, if no processes have size > 0. */
   80         *size = 0;
   81         *disp_unit = 0;
   82         *((void **) baseptr) = NULL;
   83 
   84         for (i = 0; i < comm_size; i++) {
   85             if (win_ptr->basic_info_table[i].size > 0) {
   86                 int local_i = win_ptr->comm_ptr->intranode_table[i];
   87                 MPIU_Assert(local_i >= 0 && local_i < win_ptr->comm_ptr->node_comm->local_size);
   88                 *size = win_ptr->basic_info_table[i].size;
   89                 *disp_unit = win_ptr->basic_info_table[i].disp_unit;
   90                 *((void **) baseptr) = win_ptr->shm_base_addrs[local_i];
   91                 break;
   92             }
   93         }
   94 
   95     }
   96     else {
   97         int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
   98         MPIU_Assert(local_target_rank >= 0 &&
   99                     local_target_rank < win_ptr->comm_ptr->node_comm->local_size);
  100         *size = win_ptr->basic_info_table[target_rank].size;
  101         *disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
  102         *((void **) baseptr) = win_ptr->shm_base_addrs[local_target_rank];
  103     }
  104 
  105   fn_exit:
  106     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);
  107     return mpi_errno;
  108 
  109   fn_fail:
  110     goto fn_exit;
  111 }
  112 
  113 
  114 #undef FUNCNAME
  115 #define FUNCNAME MPIDI_CH3_SHM_Win_free
  116 #undef FCNAME
  117 #define FCNAME MPL_QUOTE(FUNCNAME)
  118 int MPIDI_CH3_SHM_Win_free(MPID_Win ** win_ptr)
  119 {
  120     int mpi_errno = MPI_SUCCESS;
  121     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
  122 
  123     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
  124 
  125     if ((*win_ptr)->comm_ptr->local_size <= 1 || (*win_ptr)->node_comm_ptr == NULL) {
  126         goto fn_exit;
  127     }
  128 
  129     /* Free shared memory region */
  130     if ((*win_ptr)->shm_allocated) {
  131         /* free shm_base_addrs that's only used for shared memory windows */
  132         MPIU_Free((*win_ptr)->shm_base_addrs);
  133 
  134         /* Only allocate and allocate_shared allocate new shared segments */
  135         if (((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED ||
  136              (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) &&
  137             (*win_ptr)->shm_segment_len > 0) {
  138             /* detach from shared memory segment */
  139             mpi_errno =
  140                 MPIU_SHMW_Seg_detach((*win_ptr)->shm_segment_handle,
  141                                      (char **) &(*win_ptr)->shm_base_addr,
  142                                      (*win_ptr)->shm_segment_len);
  143             if (mpi_errno)
  144                 MPIR_ERR_POP(mpi_errno);
  145 
  146             MPIU_SHMW_Hnd_finalize(&(*win_ptr)->shm_segment_handle);
  147         }
  148     }
  149 
  150     /* Free shared process mutex memory region */
  151     /* Only allocate and allocate_shared allocate new shared mutex.
  152      * FIXME: it causes unnecessary synchronization when using the same mutex.  */
  153     if (((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED ||
  154          (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) &&
  155         (*win_ptr)->shm_mutex && (*win_ptr)->shm_segment_len > 0) {
  156 
  157         /* When allocating shared memory region segment, we need comm of processes
  158          * that are on the same node as this process (node_comm).
  159          * If node_comm == NULL, this process is the only one on this node, therefore
  160          * we use comm_self as node comm. */
  161 
  162         MPID_Comm *node_comm_ptr = (*win_ptr)->node_comm_ptr;
  163         MPIU_Assert(node_comm_ptr != NULL);
  164         if (node_comm_ptr->rank == 0) {
  165             MPIDI_CH3I_SHM_MUTEX_DESTROY(*win_ptr);
  166         }
  167 
  168         /* detach from shared memory segment */
  169         mpi_errno =
  170             MPIU_SHMW_Seg_detach((*win_ptr)->shm_mutex_segment_handle,
  171                                  (char **) &(*win_ptr)->shm_mutex, sizeof(MPIDI_CH3I_SHM_MUTEX));
  172         if (mpi_errno)
  173             MPIR_ERR_POP(mpi_errno);
  174 
  175         MPIU_SHMW_Hnd_finalize(&(*win_ptr)->shm_mutex_segment_handle);
  176     }
  177 
  178     /* Free shared memory region for window info */
  179     if ((*win_ptr)->info_shm_base_addr != NULL) {
  180         mpi_errno = MPIU_SHMW_Seg_detach((*win_ptr)->info_shm_segment_handle,
  181                                          (char **) &(*win_ptr)->info_shm_base_addr,
  182                                          (*win_ptr)->info_shm_segment_len);
  183         if (mpi_errno != MPI_SUCCESS)
  184             MPIR_ERR_POP(mpi_errno);
  185 
  186         MPIU_SHMW_Hnd_finalize(&(*win_ptr)->info_shm_segment_handle);
  187 
  188         (*win_ptr)->basic_info_table = NULL;
  189     }
  190 
  191     /* Unlink from global SHM window list if it is original shared window */
  192     if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED ||
  193         (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE) {
  194         MPIDI_CH3I_SHM_Wins_unlink(&shm_wins_list, (*win_ptr));
  195     }
  196 
  197   fn_exit:
  198     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
  199     return mpi_errno;
  200 
  201   fn_fail:
  202     goto fn_exit;
  203 }
  204 
  205 #undef FUNCNAME
  206 #define FUNCNAME MPIDI_CH3_Win_fns_init
  207 #undef FCNAME
  208 #define FCNAME MPL_QUOTE(FUNCNAME)
  209 int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
  210 {
  211     int mpi_errno = MPI_SUCCESS;
  212     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
  213 
  214     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
  215 
  216     win_fns->allocate_shm = MPIDI_CH3I_Win_allocate_shm;
  217     win_fns->detect_shm = MPIDI_CH3I_Win_detect_shm;
  218     win_fns->gather_info = MPIDI_CH3I_Win_gather_info;
  219     win_fns->shared_query = MPIDI_CH3_SHM_Win_shared_query;
  220 
  221     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
  222 
  223     return mpi_errno;
  224 }
  225 
  226 #undef FUNCNAME
  227 #define FUNCNAME MPIDI_CH3_Win_hooks_init
  228 #undef FCNAME
  229 #define FCNAME MPL_QUOTE(FUNCNAME)
  230 int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t * win_hooks)
  231 {
  232     int mpi_errno = MPI_SUCCESS;
  233     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
  234 
  235     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
  236 
  237     win_hooks->win_init = MPIDI_CH3I_Win_init;
  238     win_hooks->win_free = MPIDI_CH3_SHM_Win_free;
  239 
  240     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
  241 
  242     return mpi_errno;
  243 }
  244 
  245 #undef FUNCNAME
  246 #define FUNCNAME MPIDI_CH3_Win_init
  247 #undef FCNAME
  248 #define FCNAME MPL_QUOTE(FUNCNAME)
  249 static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
  250                                MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
  251 {
  252     int mpi_errno = MPI_SUCCESS;
  253     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_INIT);
  254 
  255     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_INIT);
  256 
  257     (*win_ptr)->shm_base_addr = NULL;
  258     (*win_ptr)->shm_segment_len = 0;
  259     (*win_ptr)->shm_segment_handle = 0;
  260     (*win_ptr)->shm_mutex = NULL;
  261     (*win_ptr)->shm_mutex_segment_handle = 0;
  262 
  263     (*win_ptr)->info_shm_base_addr = NULL;
  264     (*win_ptr)->info_shm_segment_len = 0;
  265     (*win_ptr)->info_shm_segment_handle = 0;
  266 
  267     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_INIT);
  268     return mpi_errno;
  269 }
  270 
  271 #undef FUNCNAME
  272 #define FUNCNAME MPIDI_CH3I_SHM_Wins_match
  273 #undef FCNAME
  274 #define FCNAME MPL_QUOTE(FUNCNAME)
  275 static int MPIDI_CH3I_SHM_Wins_match(MPID_Win ** win_ptr, MPID_Win ** matched_win,
  276                                      MPI_Aint ** base_shm_offs_ptr)
  277 {
  278     int mpi_errno = MPI_SUCCESS;
  279     int i, comm_size;
  280     int node_size, node_rank, shm_node_size;
  281     int group_diff;
  282     int base_diff;
  283 
  284     MPID_Comm *node_comm_ptr = NULL, *shm_node_comm_ptr = NULL;
  285     int *node_ranks = NULL, *node_ranks_in_shm_node = NULL;
  286     MPID_Group *node_group_ptr = NULL, *shm_node_group_ptr = NULL;
  287     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
  288     MPI_Aint *base_shm_offs;
  289 
  290     MPIDI_SHM_Win_t *elem = shm_wins_list;
  291 
  292     MPIU_CHKLMEM_DECL(2);
  293     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
  294     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
  295 
  296     *matched_win = NULL;
  297     base_shm_offs = *base_shm_offs_ptr;
  298     node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
  299     MPIU_Assert(node_comm_ptr != NULL);
  300     node_size = node_comm_ptr->local_size;
  301     node_rank = node_comm_ptr->rank;
  302 
  303     comm_size = (*win_ptr)->comm_ptr->local_size;
  304 
  305     MPIU_CHKLMEM_MALLOC(node_ranks, int *, node_size * sizeof(int), mpi_errno, "node_ranks");
  306     MPIU_CHKLMEM_MALLOC(node_ranks_in_shm_node, int *, node_size * sizeof(int),
  307                         mpi_errno, "node_ranks_in_shm_comm");
  308 
  309     for (i = 0; i < node_size; i++) {
  310         node_ranks[i] = i;
  311     }
  312 
  313     mpi_errno = MPIR_Comm_group_impl(node_comm_ptr, &node_group_ptr);
  314     if (mpi_errno)
  315         MPIR_ERR_POP(mpi_errno);
  316 
  317     while (elem != NULL) {
  318         MPID_Win *shm_win = elem->win;
  319         if (!shm_win)
  320             MPIDI_SHM_Wins_next_and_continue(elem);
  321 
  322         /* Compare node_comm.
  323          *
  324          * Only support shm if new node_comm is equal to or a subset of shm node_comm.
  325          * Shm node_comm == a subset of node_comm is not supported, because it means
  326          * some processes of node_comm cannot be shared, but RMA operation simply checks
  327          * the node_id of a target process for distinguishing shm target.  */
  328         shm_node_comm_ptr = shm_win->comm_ptr->node_comm;
  329         shm_node_size = shm_node_comm_ptr->local_size;
  330 
  331         if (node_size > shm_node_size)
  332             MPIDI_SHM_Wins_next_and_continue(elem);
  333 
  334         mpi_errno = MPIR_Comm_group_impl(shm_win->comm_ptr, &shm_node_group_ptr);
  335         if (mpi_errno)
  336             MPIR_ERR_POP(mpi_errno);
  337 
  338         mpi_errno = MPIR_Group_translate_ranks_impl(node_group_ptr, node_size,
  339                                                     node_ranks, shm_node_group_ptr,
  340                                                     node_ranks_in_shm_node);
  341         if (mpi_errno)
  342             MPIR_ERR_POP(mpi_errno);
  343 
  344         mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
  345         if (mpi_errno)
  346             MPIR_ERR_POP(mpi_errno);
  347         shm_node_group_ptr = NULL;
  348 
  349         group_diff = 0;
  350         for (i = 0; i < node_size; i++) {
  351             /* not exist in shm_comm->node_comm */
  352             if (node_ranks_in_shm_node[i] == MPI_UNDEFINED) {
  353                 group_diff = 1;
  354                 break;
  355             }
  356         }
  357         if (group_diff)
  358             MPIDI_SHM_Wins_next_and_continue(elem);
  359 
  360         /* Gather the offset of base_addr from all local processes. Match only
  361          * when all of them are included in the shm segment in current shm_win.
  362          *
  363          * Note that this collective call must be called after checking the
  364          * group match in order to guarantee all the local processes can perform
  365          * this call. */
  366         base_shm_offs[node_rank] = (MPI_Aint) ((*win_ptr)->base)
  367             - (MPI_Aint) (shm_win->shm_base_addr);
  368         mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
  369                                         base_shm_offs, 1, MPI_AINT, node_comm_ptr, &errflag);
  370         if (mpi_errno)
  371             MPIR_ERR_POP(mpi_errno);
  372         MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  373 
  374         base_diff = 0;
  375         for (i = 0; i < comm_size; ++i) {
  376             int i_node_rank = (*win_ptr)->comm_ptr->intranode_table[i];
  377             if (i_node_rank >= 0) {
  378                 MPIU_Assert(i_node_rank < node_size);
  379 
  380                 if (base_shm_offs[i_node_rank] < 0 ||
  381                     base_shm_offs[i_node_rank] + (*win_ptr)->basic_info_table[i].size >
  382                     shm_win->shm_segment_len) {
  383                     base_diff = 1;
  384                     break;
  385                 }
  386             }
  387         }
  388 
  389         if (base_diff)
  390             MPIDI_SHM_Wins_next_and_continue(elem);
  391 
  392         /* Found the first matched shm_win */
  393         *matched_win = shm_win;
  394         break;
  395     }
  396 
  397   fn_exit:
  398     if (node_group_ptr != NULL)
  399         mpi_errno = MPIR_Group_free_impl(node_group_ptr);
  400     /* Only free it here when group_translate_ranks fails. */
  401     if (shm_node_group_ptr != NULL)
  402         mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
  403 
  404     MPIU_CHKLMEM_FREEALL();
  405     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
  406     return mpi_errno;
  407     /* --BEGIN ERROR HANDLING-- */
  408   fn_fail:
  409     goto fn_exit;
  410     /* --END ERROR HANDLING-- */
  411 }
  412 
  413 #undef FUNCNAME
  414 #define FUNCNAME MPIDI_CH3I_Win_detect_shm
  415 #undef FCNAME
  416 #define FCNAME MPL_QUOTE(FUNCNAME)
  417 static int MPIDI_CH3I_Win_detect_shm(MPID_Win ** win_ptr)
  418 {
  419     int mpi_errno = MPI_SUCCESS;
  420     MPID_Win *shm_win_ptr = NULL;
  421     int i, node_size;
  422     MPI_Aint *base_shm_offs;
  423 
  424     MPIU_CHKPMEM_DECL(1);
  425     MPIU_CHKLMEM_DECL(1);
  426     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
  427     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
  428 
  429     if ((*win_ptr)->comm_ptr->node_comm == NULL) {
  430         goto fn_exit;
  431     }
  432 
  433     node_size = (*win_ptr)->comm_ptr->node_comm->local_size;
  434 
  435     MPIU_CHKLMEM_MALLOC(base_shm_offs, MPI_Aint *, node_size * sizeof(MPI_Aint),
  436                         mpi_errno, "base_shm_offs");
  437 
  438     /* Return the first matched shared window.
  439      * It is noted that the shared windows including all local processes are
  440      * stored in every local process in the same order, hence the first matched
  441      * shared window on every local process should be the same. */
  442     mpi_errno = MPIDI_CH3I_SHM_Wins_match(win_ptr, &shm_win_ptr, &base_shm_offs);
  443     if (mpi_errno)
  444         MPIR_ERR_POP(mpi_errno);
  445     if (shm_win_ptr == NULL)
  446         goto fn_exit;
  447 
  448     (*win_ptr)->shm_allocated = TRUE;
  449     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
  450                         node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
  451 
  452     /* Compute the base address of shm buffer on each process.
  453      * shm_base_addrs[i] = my_shm_base_addr + off[i] */
  454     for (i = 0; i < node_size; i++) {
  455         (*win_ptr)->shm_base_addrs[i] =
  456             (void *) ((MPI_Aint) shm_win_ptr->shm_base_addr + base_shm_offs[i]);
  457     }
  458 
  459     /* TODO: should we use the same mutex or create a new one ?
  460      * It causes unnecessary synchronization.*/
  461     (*win_ptr)->shm_mutex = shm_win_ptr->shm_mutex;
  462 
  463   fn_exit:
  464     MPIU_CHKLMEM_FREEALL();
  465     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
  466     return mpi_errno;
  467     /* --BEGIN ERROR HANDLING-- */
  468   fn_fail:
  469     MPIU_CHKPMEM_REAP();
  470     goto fn_exit;
  471     /* --END ERROR HANDLING-- */
  472 }
  473 
  474 #undef FUNCNAME
  475 #define FUNCNAME MPIDI_CH3I_Win_gather_info
  476 #undef FCNAME
  477 #define FCNAME MPL_QUOTE(FUNCNAME)
  478 static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
  479                                       MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
  480 {
  481     MPID_Comm *node_comm_ptr = NULL;
  482     int node_rank, node_size;
  483     int comm_rank, comm_size;
  484     MPI_Aint *tmp_buf = NULL;
  485     MPIDI_VC_t *vc = NULL;
  486     int i, k;
  487     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
  488     int mpi_errno = MPI_SUCCESS;
  489     MPIU_CHKLMEM_DECL(1);
  490     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
  491 
  492     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
  493 
  494     comm_size = (*win_ptr)->comm_ptr->local_size;
  495     comm_rank = (*win_ptr)->comm_ptr->rank;
  496 
  497     if (comm_size <= 1) {
  498         mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
  499         goto fn_exit;
  500     }
  501 
  502     /* tell everyone, what is my COMM_WORLD rank */
  503     (*win_ptr)->rank_mapping[comm_rank] = MPIDI_Process.my_pg_rank;
  504     MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
  505            (*win_ptr)->rank_mapping, sizeof(uint32_t), MPI_BYTE,
  506           comm_ptr, &errflag);
  507         
  508    
  509     if (!mv2_enable_shmem_collectives && (*win_ptr)->shm_coll_comm_ref == -1) {
  510         MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIDI_Process.my_pg_rank, &vc);
  511         /* Shared memory for collectives */
  512         mpi_errno = MPIDI_CH3I_SHMEM_COLL_init(MPIDI_Process.my_pg,
  513                 vc->smp.local_rank);
  514         if (mpi_errno) {
  515             MPIR_ERR_POP(mpi_errno);
  516         }
  517 
  518         /* local barrier */
  519         mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
  520         if (mpi_errno) {
  521             MPIR_ERR_POP(mpi_errno);
  522         }
  523 
  524         /* Memory Mapping shared files for collectives*/
  525         mpi_errno = MPIDI_CH3I_SHMEM_COLL_Mmap(MPIDI_Process.my_pg,
  526                 vc->smp.local_rank);
  527         if (mpi_errno) {
  528             MPIR_ERR_POP(mpi_errno);
  529         }
  530 
  531         /* local barrier */
  532         mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
  533         if (mpi_errno) {
  534             MPIR_ERR_POP(mpi_errno);
  535         }
  536 
  537         /* Unlink mapped files so that they get cleaned up when
  538          * process exits */
  539         MPIDI_CH3I_SHMEM_COLL_Unlink();
  540         (*win_ptr)->shm_coll_comm_ref = 1;
  541     } else if ((*win_ptr)->shm_coll_comm_ref > 0) {
  542         (*win_ptr)->shm_coll_comm_ref++;
  543     }
  544 
  545     if ((*win_ptr)->node_comm_ptr == NULL) {
  546         if((*win_ptr)->comm_ptr->dev.ch.shmem_coll_ok == 0) {
  547             mpi_errno = create_2level_comm((*win_ptr)->comm_ptr->handle, 
  548                     (*win_ptr)->comm_ptr->local_size, (*win_ptr)->comm_ptr->rank);
  549             if(mpi_errno) {
  550                 MPIR_ERR_POP(mpi_errno);
  551             }
  552         }           
  553 
  554         MPI_Comm shmem_comm;
  555         shmem_comm = (*win_ptr)->comm_ptr->dev.ch.shmem_comm;
  556         MPID_Comm_get_ptr(shmem_comm, node_comm_ptr);
  557 
  558         /* Fall back to no_shm function if shmem_comm is not created
  559          * successfully*/
  560         if (node_comm_ptr == NULL) {
  561             mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
  562             (*win_ptr)->shm_allocated = FALSE;
  563             goto fn_exit;
  564         }
  565 
  566         MPIU_Assert(node_comm_ptr != NULL);
  567 
  568         (*win_ptr)->node_comm_ptr = node_comm_ptr;
  569     } else {
  570         node_comm_ptr = (*win_ptr)->node_comm_ptr;
  571     }
  572 
  573     node_size = node_comm_ptr->local_size;
  574     node_rank = node_comm_ptr->rank;
  575 
  576     (*win_ptr)->node_comm_size = node_size;
  577     (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t);
  578 
  579     mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->info_shm_segment_handle);
  580     if (mpi_errno != MPI_SUCCESS)
  581         MPIR_ERR_POP(mpi_errno);
  582 
  583     if (node_rank == 0) {
  584         char *serialized_hnd_ptr = NULL;
  585 
  586         /* create shared memory region for all processes in win and map. */
  587         mpi_errno = MPIU_SHMW_Seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
  588                                                     (*win_ptr)->info_shm_segment_len,
  589                                                     (char **) &(*win_ptr)->info_shm_base_addr, 0);
  590         if (mpi_errno)
  591             MPIR_ERR_POP(mpi_errno);
  592 
  593         /* serialize handle and broadcast it to the other processes in win */
  594         mpi_errno =
  595             MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
  596                                                 &serialized_hnd_ptr);
  597         if (mpi_errno)
  598             MPIR_ERR_POP(mpi_errno);
  599 
  600         mpi_errno =
  601             MPIR_Shmem_Bcast_MV2(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  602                     &errflag);
  603         if (mpi_errno)
  604             MPIR_ERR_POP(mpi_errno);
  605         MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  606 
  607         /* wait for other processes to attach to win */
  608         mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  609         if (mpi_errno)
  610             MPIR_ERR_POP(mpi_errno);
  611         MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  612 
  613         /* unlink shared memory region so it gets deleted when all processes exit */
  614         mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->info_shm_segment_handle);
  615         if (mpi_errno)
  616             MPIR_ERR_POP(mpi_errno);
  617     }
  618     else {
  619         char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
  620 
  621         /* get serialized handle from rank 0 and deserialize it */
  622         mpi_errno =
  623             MPIR_Shmem_Bcast_MV2(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  624                     &errflag);
  625         if (mpi_errno)
  626             MPIR_ERR_POP(mpi_errno);
  627         MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  628 
  629         mpi_errno = MPIU_SHMW_Hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
  630                                               strlen(serialized_hnd));
  631         if (mpi_errno)
  632             MPIR_ERR_POP(mpi_errno);
  633 
  634         /* attach to shared memory region created by rank 0 */
  635         mpi_errno =
  636             MPIU_SHMW_Seg_attach((*win_ptr)->info_shm_segment_handle,
  637                                  (*win_ptr)->info_shm_segment_len,
  638                                  (char **) &(*win_ptr)->info_shm_base_addr, 0);
  639         if (mpi_errno)
  640             MPIR_ERR_POP(mpi_errno);
  641 
  642         mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  643         if (mpi_errno)
  644             MPIR_ERR_POP(mpi_errno);
  645         MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  646     }
  647 
  648     (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);
  649 
  650     MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
  651                         mpi_errno, "tmp_buf");
  652 
  653     tmp_buf[4 * comm_rank] = MPIU_PtrToAint(base);
  654     tmp_buf[4 * comm_rank + 1] = size;
  655     tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
  656     tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;
  657 
  658     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT,
  659                                     (*win_ptr)->comm_ptr, &errflag);
  660     if (mpi_errno != MPI_SUCCESS)
  661         MPIR_ERR_POP(mpi_errno);
  662 
  663     if (node_rank == 0) {
  664         /* only node_rank == 0 writes results to basic_info_table on shared memory region. */
  665         k = 0;
  666         for (i = 0; i < comm_size; i++) {
  667             (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
  668             (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
  669             (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
  670             (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
  671         }
  672     }
  673 
  674     /* Make sure that all local processes see the results written by node_rank == 0 */
  675     mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  676     if (mpi_errno != MPI_SUCCESS)
  677         MPIR_ERR_POP(mpi_errno);
  678 
  679     /* call psm to pre-post receive buffers for Puts */
  680     psm_prepost_1sc();
  681     MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
  682 
  683   fn_exit:
  684     MPIU_CHKLMEM_FREEALL();
  685     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
  686     return mpi_errno;
  687     /* --BEGIN ERROR HANDLING-- */
  688   fn_fail:
  689     goto fn_exit;
  690     /* --END ERROR HANDLING-- */
  691 }
  692 
  693 #undef FUNCNAME
  694 #define FUNCNAME MPIDI_CH3I_Win_allocate_shm
  695 #undef FCNAME
  696 #define FCNAME MPL_QUOTE(FUNCNAME)
  697 static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPID_Info * info,
  698                                        MPID_Comm * comm_ptr, void *base_ptr, MPID_Win ** win_ptr)
  699 {
  700     int mpi_errno = MPI_SUCCESS;
  701     void **base_pp = (void **) base_ptr;
  702     int i, node_size, node_rank;
  703     MPID_Comm *node_comm_ptr = NULL;
  704     MPI_Aint *node_sizes;
  705     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
  706     int noncontig = FALSE;
  707     int comm_size;
  708     MPIDI_VC_t *vc = NULL;
  709     MPIU_CHKPMEM_DECL(1);
  710     MPIU_CHKLMEM_DECL(1);
  711     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
  712 
  713     MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
  714 
  715     comm_size = (*win_ptr)->comm_ptr->local_size;
  716 
  717     if (comm_size <= 1) {
  718         mpi_errno =
  719             MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
  720         goto fn_exit;
  721     }
  722 
  723     /* see if we can allocate all windows contiguously */
  724     noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;
  725 
  726     (*win_ptr)->shm_allocated = TRUE;
  727 
  728     /* When allocating shared memory region segment, we need comm of processes
  729      * that are on the same node as this process (node_comm).
  730      * If node_comm == NULL, this process is the only one on this node, therefore
  731      * we use comm_self as node comm. */
  732 
  733     if (likely(!(*win_ptr)->node_comm_ptr)) {
  734         if (!mv2_enable_shmem_collectives && (*win_ptr)->shm_coll_comm_ref == -1) {
  735             MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIDI_Process.my_pg_rank, &vc);
  736             /* Shared memory for collectives */
  737             mpi_errno = MPIDI_CH3I_SHMEM_COLL_init(MPIDI_Process.my_pg,
  738                     vc->smp.local_rank);
  739             if (mpi_errno) {
  740                 MPIR_ERR_POP(mpi_errno);
  741             }
  742 
  743             /* local barrier */
  744             mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
  745             if (mpi_errno) {
  746                 MPIR_ERR_POP(mpi_errno);
  747             }
  748 
  749             /* Memory Mapping shared files for collectives*/
  750             mpi_errno = MPIDI_CH3I_SHMEM_COLL_Mmap(MPIDI_Process.my_pg,
  751                     vc->smp.local_rank);
  752             if (mpi_errno) {
  753                 MPIR_ERR_POP(mpi_errno);
  754             }
  755 
  756             /* local barrier */
  757             mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
  758             if (mpi_errno) {
  759                 MPIR_ERR_POP(mpi_errno);
  760             }
  761 
  762             /* Unlink mapped files so that they get cleaned up when
  763              * process exits */
  764             MPIDI_CH3I_SHMEM_COLL_Unlink();
  765             (*win_ptr)->shm_coll_comm_ref = 1;
  766         } else if ((*win_ptr)->shm_coll_comm_ref > 0) {
  767             (*win_ptr)->shm_coll_comm_ref++;
  768         }
  769 
  770         if((*win_ptr)->comm_ptr->dev.ch.shmem_coll_ok == 0) {
  771             mpi_errno = create_2level_comm((*win_ptr)->comm_ptr->handle,
  772                     (*win_ptr)->comm_ptr->local_size, (*win_ptr)->comm_ptr->rank);
  773             if(mpi_errno) {
  774                 MPIR_ERR_POP(mpi_errno);
  775             }
  776         }
  777 
  778         MPI_Comm shmem_comm;
  779         shmem_comm = (*win_ptr)->comm_ptr->dev.ch.shmem_comm;
  780         MPID_Comm_get_ptr(shmem_comm, node_comm_ptr);
  781 
  782 
  783         /* Fall back to no_shm function if shmem_comm is not created successfully*/
  784         if (node_comm_ptr == NULL) {
  785             mpi_errno =
  786                 MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
  787             (*win_ptr)->shm_allocated = FALSE;
  788             goto fn_exit;
  789         }
  790 
  791         MPIU_Assert(node_comm_ptr != NULL);
  792 
  793         node_size = node_comm_ptr->local_size;
  794         node_rank = node_comm_ptr->rank;
  795         (*win_ptr)->node_comm_ptr = node_comm_ptr;
  796     } else {
  797         node_comm_ptr = (*win_ptr)->node_comm_ptr;
  798     }
  799 
  800     MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
  801     /* allocate memory for the base addresses, disp_units, and
  802      * completion counters of all processes */
  803     MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
  804                         node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");
  805 
  806     /* get the sizes of the windows and window objectsof
  807      * all processes.  allocate temp. buffer for communication */
  808     MPIU_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
  809                         "node_sizes");
  810 
  811     /* FIXME: This needs to be fixed for heterogeneous systems */
  812     node_sizes[node_rank] = (MPI_Aint) size;
  813 
  814     mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
  815             node_sizes, sizeof(MPI_Aint), MPI_BYTE,
  816             node_comm_ptr, &errflag);
  817 
  818     MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
  819     if (mpi_errno)
  820         MPIR_ERR_POP(mpi_errno);
  821     MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  822 
  823     (*win_ptr)->shm_segment_len = 0;
  824 
  825     for (i = 0; i < node_size; i++) {
  826         if (noncontig)
  827             /* Round up to next page size */
  828             (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
  829         else
  830             (*win_ptr)->shm_segment_len += node_sizes[i];
  831     }
  832 
  833     if ((*win_ptr)->shm_segment_len == 0) {
  834         (*win_ptr)->base = NULL;
  835     }
  836 
  837     else {
  838         mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_segment_handle);
  839         if (mpi_errno)
  840             MPIR_ERR_POP(mpi_errno);
  841 
  842         if (node_rank == 0) {
  843             char *serialized_hnd_ptr = NULL;
  844 
  845             /* create shared memory region for all processes in win and map */
  846             mpi_errno =
  847                 MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_segment_handle,
  848                                                 (*win_ptr)->shm_segment_len,
  849                                                 (char **) &(*win_ptr)->shm_base_addr, 0);
  850             if (mpi_errno)
  851                 MPIR_ERR_POP(mpi_errno);
  852 
  853             /* serialize handle and broadcast it to the other processes in win */
  854             mpi_errno =
  855                 MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
  856                                                     &serialized_hnd_ptr);
  857             if (mpi_errno)
  858                 MPIR_ERR_POP(mpi_errno);
  859 
  860             mpi_errno =
  861                 MPIR_Shmem_Bcast_MV2(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  862                         &errflag);
  863             if (mpi_errno)
  864                 MPIR_ERR_POP(mpi_errno);
  865             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  866 
  867             /* wait for other processes to attach to win */
  868             mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  869             if (mpi_errno)
  870                 MPIR_ERR_POP(mpi_errno);
  871             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  872 
  873             /* unlink shared memory region so it gets deleted when all processes exit */
  874             mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_segment_handle);
  875             if (mpi_errno)
  876                 MPIR_ERR_POP(mpi_errno);
  877 
  878         }
  879         else {
  880             char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
  881 
  882             /* get serialized handle from rank 0 and deserialize it */
  883             mpi_errno =
  884                 MPIR_Shmem_Bcast_MV2(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  885                         &errflag);
  886             if (mpi_errno)
  887                 MPIR_ERR_POP(mpi_errno);
  888             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  889 
  890             mpi_errno =
  891                 MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
  892                                           strlen(serialized_hnd));
  893             if (mpi_errno)
  894                 MPIR_ERR_POP(mpi_errno);
  895 
  896             /* attach to shared memory region created by rank 0 */
  897             mpi_errno =
  898                 MPIU_SHMW_Seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
  899                                      (char **) &(*win_ptr)->shm_base_addr, 0);
  900             if (mpi_errno)
  901                 MPIR_ERR_POP(mpi_errno);
  902 
  903             mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  904             if (mpi_errno)
  905                 MPIR_ERR_POP(mpi_errno);
  906             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  907         }
  908 
  909         /* Allocated the interprocess mutex segment. */
  910         mpi_errno = MPIU_SHMW_Hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
  911         if (mpi_errno)
  912             MPIR_ERR_POP(mpi_errno);
  913 
  914         if (node_rank == 0) {
  915             char *serialized_hnd_ptr = NULL;
  916 
  917             /* create shared memory region for all processes in win and map */
  918             mpi_errno =
  919                 MPIU_SHMW_Seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
  920                                                 sizeof(MPIDI_CH3I_SHM_MUTEX),
  921                                                 (char **) &(*win_ptr)->shm_mutex, 0);
  922             if (mpi_errno)
  923                 MPIR_ERR_POP(mpi_errno);
  924 
  925             MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);
  926 
  927             /* serialize handle and broadcast it to the other processes in win */
  928             mpi_errno =
  929                 MPIU_SHMW_Hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
  930                                                     &serialized_hnd_ptr);
  931             if (mpi_errno)
  932                 MPIR_ERR_POP(mpi_errno);
  933 
  934             mpi_errno =
  935                 MPIR_Shmem_Bcast_MV2(serialized_hnd_ptr, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  936                         &errflag);
  937             if (mpi_errno)
  938                 MPIR_ERR_POP(mpi_errno);
  939             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  940 
  941             /* wait for other processes to attach to win */
  942             mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  943             if (mpi_errno)
  944                 MPIR_ERR_POP(mpi_errno);
  945             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  946 
  947             /* unlink shared memory region so it gets deleted when all processes exit */
  948             mpi_errno = MPIU_SHMW_Seg_remove((*win_ptr)->shm_mutex_segment_handle);
  949             if (mpi_errno)
  950                 MPIR_ERR_POP(mpi_errno);
  951         }
  952         else {
  953             char serialized_hnd[MPIU_SHMW_GHND_SZ] = { 0 };
  954 
  955             /* get serialized handle from rank 0 and deserialize it */
  956             mpi_errno =
  957                 MPIR_Shmem_Bcast_MV2(serialized_hnd, MPIU_SHMW_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
  958                         &errflag);
  959             if (mpi_errno)
  960                 MPIR_ERR_POP(mpi_errno);
  961             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  962 
  963             mpi_errno =
  964                 MPIU_SHMW_Hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
  965                                           strlen(serialized_hnd));
  966             if (mpi_errno)
  967                 MPIR_ERR_POP(mpi_errno);
  968 
  969             /* attach to shared memory region created by rank 0 */
  970             mpi_errno =
  971                 MPIU_SHMW_Seg_attach((*win_ptr)->shm_mutex_segment_handle,
  972                                      sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex,
  973                                      0);
  974             if (mpi_errno)
  975                 MPIR_ERR_POP(mpi_errno);
  976 
  977             mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
  978             if (mpi_errno)
  979                 MPIR_ERR_POP(mpi_errno);
  980             MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
  981         }
  982 
  983         /* compute the base addresses of each process within the shared memory segment */
  984         {
  985             char *cur_base;
  986             int cur_rank;
  987 
  988             cur_base = (*win_ptr)->shm_base_addr;
  989             cur_rank = 0;
  990             ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
  991             for (i = 1; i < node_size; ++i) {
  992                 if (node_sizes[i]) {
  993                     /* For the base addresses, we track the previous
  994                      * process that has allocated non-zero bytes of shared
  995                      * memory.  We can not simply use "i-1" for the
  996                      * previous process because rank "i-1" might not have
  997                      * allocated any memory. */
  998                     if (noncontig) {
  999                         ((*win_ptr)->shm_base_addrs)[i] =
 1000                             cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
 1001                     }
 1002                     else {
 1003                         ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
 1004                     }
 1005                     cur_base = ((*win_ptr)->shm_base_addrs)[i];
 1006                     cur_rank = i;
 1007                 }
 1008                 else {
 1009                     ((*win_ptr)->shm_base_addrs)[i] = NULL;
 1010                 }
 1011             }
 1012         }
 1013 
 1014         (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
 1015     }
 1016 
 1017     *base_pp = (*win_ptr)->base;
 1018 
 1019     /* gather window information among processes via shared memory region. */
 1020     mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
 1021     if (mpi_errno != MPI_SUCCESS)
 1022         MPIR_ERR_POP(mpi_errno);
 1023 
 1024     /* Cache SHM windows */
 1025     MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));
 1026 
 1027   fn_exit:
 1028     MPIU_CHKLMEM_FREEALL();
 1029     MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
 1030     return mpi_errno;
 1031     /* --BEGIN ERROR HANDLING-- */
 1032   fn_fail:
 1033     MPIU_CHKPMEM_REAP();
 1034     goto fn_exit;
 1035     /* --END ERROR HANDLING-- */
 1036 }