"Fossies" - the Fresh Open Source Software Archive

Member "mvapich2-2.3.2/src/mpi/coll/reduce.c" (8 Aug 2019, 49692 Bytes) of package /linux/misc/mvapich2-2.3.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "reduce.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.3.1_vs_2.3.2.

    1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
    2 /*
    3  *
    4  *  (C) 2001 by Argonne National Laboratory.
    5  *      See COPYRIGHT in top-level directory.
    6  */
    7 
    8 /* Copyright (c) 2001-2019, The Ohio State University. All rights
    9  * reserved.
   10  *
   11  * This file is part of the MVAPICH2 software package developed by the
   12  * team members of The Ohio State University's Network-Based Computing
   13  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
   14  *
   15  * For detailed copyright and licensing information, please refer to the
   16  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
   17  *
   18  */
   19 
   20 #include "mpiimpl.h"
   21 #include "collutil.h"
   22 #ifdef _OSU_MVAPICH_
   23 #   include "coll_shmem.h"
   24 #endif /* _OSU_MVAPICH_ */
   25 
   26 /*
   27 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
   28 
   29 cvars:
   30     - name        : MPIR_CVAR_REDUCE_SHORT_MSG_SIZE
   31       category    : COLLECTIVE
   32       type        : int
   33       default     : 2048
   34       class       : device
   35       verbosity   : MPI_T_VERBOSITY_USER_BASIC
   36       scope       : MPI_T_SCOPE_ALL_EQ
   37       description : >-
   38         the short message algorithm will be used if the send buffer size is <=
   39         this value (in bytes)
   40 
   41     - name        : MPIR_CVAR_ENABLE_SMP_REDUCE
   42       category    : COLLECTIVE
   43       type        : boolean
   44       default     : true
   45       class       : device
   46       verbosity   : MPI_T_VERBOSITY_USER_BASIC
   47       scope       : MPI_T_SCOPE_ALL_EQ
   48       description : >-
   49         Enable SMP aware reduce.
   50 
   51     - name        : MPIR_CVAR_MAX_SMP_REDUCE_MSG_SIZE
   52       category    : COLLECTIVE
   53       type        : int
   54       default     : 0
   55       class       : device
   56       verbosity   : MPI_T_VERBOSITY_USER_BASIC
   57       scope       : MPI_T_SCOPE_ALL_EQ
   58       description : >-
   59         Maximum message size for which SMP-aware reduce is used.  A
   60         value of '0' uses SMP-aware reduce for all message sizes.
   61 
   62 
   63 === END_MPI_T_CVAR_INFO_BLOCK ===
   64 */
   65 
   66 /* -- Begin Profiling Symbol Block for routine MPI_Reduce */
   67 #if defined(HAVE_PRAGMA_WEAK)
   68 #pragma weak MPI_Reduce = PMPI_Reduce
   69 #elif defined(HAVE_PRAGMA_HP_SEC_DEF)
   70 #pragma _HP_SECONDARY_DEF PMPI_Reduce  MPI_Reduce
   71 #elif defined(HAVE_PRAGMA_CRI_DUP)
   72 #pragma _CRI duplicate MPI_Reduce as PMPI_Reduce
   73 #elif defined(HAVE_WEAK_ATTRIBUTE)
   74 int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
   75                MPI_Op op, int root, MPI_Comm comm)
   76                __attribute__((weak,alias("PMPI_Reduce")));
   77 #endif
   78 /* -- End Profiling Symbol Block */
   79 
   80 /* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
   81    the MPI routines */
   82 #ifndef MPICH_MPI_FROM_PMPI
   83 #undef MPI_Reduce
   84 #define MPI_Reduce PMPI_Reduce
   85 
   86 /* This function implements a binomial tree reduce.
   87 
   88    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
   89  */
   90 #undef FUNCNAME
   91 #define FUNCNAME MPIR_Reduce_binomial
   92 #undef FCNAME
   93 #define FCNAME MPL_QUOTE(FUNCNAME)
   94 static int MPIR_Reduce_binomial ( 
   95     const void *sendbuf,
   96     void *recvbuf,
   97     int count,
   98     MPI_Datatype datatype,
   99     MPI_Op op,
  100     int root,
  101     MPID_Comm *comm_ptr,
  102     MPIR_Errflag_t *errflag )
  103 {
  104     int mpi_errno = MPI_SUCCESS;
  105     int mpi_errno_ret = MPI_SUCCESS;
  106     MPI_Status status;
  107     int comm_size, rank, is_commutative, type_size ATTRIBUTE((unused));
  108     int mask, relrank, source, lroot;
  109     MPI_Aint true_lb, true_extent, extent; 
  110     void *tmp_buf;
  111     MPIU_CHKLMEM_DECL(2);
  112 
  113     if (count == 0) return MPI_SUCCESS;
  114 
  115     comm_size = comm_ptr->local_size;
  116     rank = comm_ptr->rank;
  117 
  118     /* Create a temporary buffer */
  119 
  120     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
  121     MPID_Datatype_get_extent_macro(datatype, extent);
  122 
  123     is_commutative = MPIR_Op_is_commutative(op);
  124 
  125     /* I think this is the worse case, so we can avoid an assert() 
  126      * inside the for loop */
  127     /* should be buf+{this}? */
  128     MPIU_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
  129 
  130     MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
  131                         mpi_errno, "temporary buffer");
  132     /* adjust for potential negative lower bound in datatype */
  133     tmp_buf = (void *)((char*)tmp_buf - true_lb);
  134     
  135     /* If I'm not the root, then my recvbuf may not be valid, therefore
  136        I have to allocate a temporary one */
  137     if (rank != root) {
  138         MPIU_CHKLMEM_MALLOC(recvbuf, void *, 
  139                             count*(MPIR_MAX(extent,true_extent)), 
  140                             mpi_errno, "receive buffer");
  141         recvbuf = (void *)((char*)recvbuf - true_lb);
  142     }
  143 
  144     if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
  145         mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf,
  146                                    count, datatype);
  147         if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
  148     }
  149 
  150     MPID_Datatype_get_size_macro(datatype, type_size);
  151 
  152     /* This code is from MPICH-1. */
  153 
  154     /* Here's the algorithm.  Relative to the root, look at the bit pattern in 
  155        my rank.  Starting from the right (lsb), if the bit is 1, send to 
  156        the node with that bit zero and exit; if the bit is 0, receive from the
  157        node with that bit set and combine (as long as that node is within the
  158        group)
  159        
  160        Note that by receiving with source selection, we guarentee that we get
  161        the same bits with the same input.  If we allowed the parent to receive 
  162        the children in any order, then timing differences could cause different
  163        results (roundoff error, over/underflows in some cases, etc).
  164        
  165        Because of the way these are ordered, if root is 0, then this is correct
  166        for both commutative and non-commutitive operations.  If root is not
  167        0, then for non-commutitive, we use a root of zero and then send
  168        the result to the root.  To see this, note that the ordering is
  169        mask = 1: (ab)(cd)(ef)(gh)            (odds send to evens)
  170        mask = 2: ((ab)(cd))((ef)(gh))        (3,6 send to 0,4)
  171        mask = 4: (((ab)(cd))((ef)(gh)))      (4 sends to 0)
  172        
  173        Comments on buffering.  
  174        If the datatype is not contiguous, we still need to pass contiguous 
  175        data to the user routine.  
  176        In this case, we should make a copy of the data in some format, 
  177        and send/operate on that.
  178        
  179        In general, we can't use MPI_PACK, because the alignment of that
  180        is rather vague, and the data may not be re-usable.  What we actually
  181        need is a "squeeze" operation that removes the skips.
  182     */
  183     mask    = 0x1;
  184     if (is_commutative) 
  185         lroot   = root;
  186     else
  187         lroot   = 0;
  188     relrank = (rank - lroot + comm_size) % comm_size;
  189     
  190     while (/*(mask & relrank) == 0 && */mask < comm_size) {
  191         /* Receive */
  192         if ((mask & relrank) == 0) {
  193             source = (relrank | mask);
  194             if (source < comm_size) {
  195                 source = (source + lroot) % comm_size;
  196                 mpi_errno = MPIC_Recv(tmp_buf, count, datatype, source,
  197                                          MPIR_REDUCE_TAG, comm_ptr, &status, errflag);
  198                 if (mpi_errno) {
  199                     /* for communication errors, just record the error but continue */
  200                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  201                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  202                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  203                 }
  204 
  205                 /* The sender is above us, so the received buffer must be
  206                    the second argument (in the noncommutative case). */
  207                 if (is_commutative) {
  208                     mpi_errno = MPIR_Reduce_local_impl(tmp_buf, recvbuf, count, datatype, op);
  209                     if (mpi_errno) MPIR_ERR_POP(mpi_errno);
  210                 }
  211                 else {
  212                     mpi_errno = MPIR_Reduce_local_impl(recvbuf, tmp_buf, count, datatype, op);
  213                     if (mpi_errno) MPIR_ERR_POP(mpi_errno);
  214 
  215                     mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype,
  216                                                recvbuf, count, datatype);
  217                     if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
  218                 }
  219             }
  220         }
  221         else {
  222             /* I've received all that I'm going to.  Send my result to 
  223                my parent */
  224             source = ((relrank & (~ mask)) + lroot) % comm_size;
  225             mpi_errno  = MPIC_Send(recvbuf, count, datatype,
  226                                       source, MPIR_REDUCE_TAG, comm_ptr, errflag);
  227             if (mpi_errno) {
  228                 /* for communication errors, just record the error but continue */
  229                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  230                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  231                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  232             }
  233             break;
  234         }
  235         mask <<= 1;
  236     }
  237 
  238     if (!is_commutative && (root != 0))
  239     {
  240         if (rank == 0)
  241         {
  242             mpi_errno  = MPIC_Send(recvbuf, count, datatype, root,
  243                                       MPIR_REDUCE_TAG, comm_ptr, errflag);
  244         }
  245         else if (rank == root)
  246         {
  247             mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
  248                                     MPIR_REDUCE_TAG, comm_ptr, &status, errflag);
  249         }
  250         if (mpi_errno) {
  251             /* for communication errors, just record the error but continue */
  252             *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  253             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  254             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  255         }
  256     }
  257 
  258 fn_exit:
  259     MPIU_CHKLMEM_FREEALL();
  260     if (mpi_errno_ret)
  261         mpi_errno = mpi_errno_ret;
  262     else if (*errflag != MPIR_ERR_NONE)
  263         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
  264     return mpi_errno;
  265 fn_fail:
  266     goto fn_exit;
  267 }
  268 
  269 /* An implementation of Rabenseifner's reduce algorithm (see
  270    http://www.hlrs.de/mpi/myreduce.html).
  271 
  272    This algorithm implements the reduce in two steps: first a
  273    reduce-scatter, followed by a gather to the root. A
  274    recursive-halving algorithm (beginning with processes that are
  275    distance 1 apart) is used for the reduce-scatter, and a binomial tree
  276    algorithm is used for the gather. The non-power-of-two case is
  277    handled by dropping to the nearest lower power-of-two: the first
  278    few odd-numbered processes send their data to their left neighbors
  279    (rank-1), and the reduce-scatter happens among the remaining
  280    power-of-two processes. If the root is one of the excluded
  281    processes, then after the reduce-scatter, rank 0 sends its result to
  282    the root and exits; the root now acts as rank 0 in the binomial tree
  283    algorithm for gather.
  284 
  285    For the power-of-two case, the cost for the reduce-scatter is 
  286    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
  287    gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
  288    total cost is:
  289    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  290 
  291    For the non-power-of-two case, assuming the root is not one of the
  292    odd-numbered processes that get excluded in the reduce-scatter,
  293    Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta + 
  294            n.(1+(p-1)/p).gamma
  295 */
  296 #undef FUNCNAME
  297 #define FUNCNAME MPIR_Reduce_redscat_gather
  298 #undef FCNAME
  299 #define FCNAME MPL_QUOTE(FUNCNAME)
  300 static int MPIR_Reduce_redscat_gather ( 
  301     const void *sendbuf,
  302     void *recvbuf,
  303     int count,
  304     MPI_Datatype datatype,
  305     MPI_Op op,
  306     int root,
  307     MPID_Comm *comm_ptr,
  308     MPIR_Errflag_t *errflag )
  309 {
  310     int mpi_errno = MPI_SUCCESS;
  311     int mpi_errno_ret = MPI_SUCCESS;
  312     int comm_size, rank, type_size ATTRIBUTE((unused)), pof2, rem, newrank;
  313     int mask, *cnts, *disps, i, j, send_idx=0;
  314     int recv_idx, last_idx=0, newdst;
  315     int dst, send_cnt, recv_cnt, newroot, newdst_tree_root, newroot_tree_root; 
  316     MPI_Aint true_lb, true_extent, extent; 
  317     void *tmp_buf;
  318 
  319     MPIU_CHKLMEM_DECL(4);
  320     MPID_THREADPRIV_DECL;
  321 
  322     comm_size = comm_ptr->local_size;
  323     rank = comm_ptr->rank;
  324 
  325     /* set op_errno to 0. stored in perthread structure */
  326     MPID_THREADPRIV_GET;
  327     MPID_THREADPRIV_FIELD(op_errno) = 0;
  328 
  329     /* Create a temporary buffer */
  330 
  331     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
  332     MPID_Datatype_get_extent_macro(datatype, extent);
  333 
  334     /* I think this is the worse case, so we can avoid an assert() 
  335      * inside the for loop */
  336     /* should be buf+{this}? */
  337     MPIU_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
  338 
  339     MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
  340                         mpi_errno, "temporary buffer");
  341     /* adjust for potential negative lower bound in datatype */
  342     tmp_buf = (void *)((char*)tmp_buf - true_lb);
  343     
  344     /* If I'm not the root, then my recvbuf may not be valid, therefore
  345        I have to allocate a temporary one */
  346     if (rank != root) {
  347         MPIU_CHKLMEM_MALLOC(recvbuf, void *, 
  348                             count*(MPIR_MAX(extent,true_extent)), 
  349                             mpi_errno, "receive buffer");
  350         recvbuf = (void *)((char*)recvbuf - true_lb);
  351     }
  352 
  353     if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
  354         mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf,
  355                                    count, datatype);
  356         if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
  357     }
  358 
  359     MPID_Datatype_get_size_macro(datatype, type_size);
  360 
  361     /* find nearest power-of-two less than or equal to comm_size */
  362     pof2 = 1;
  363     while (pof2 <= comm_size) pof2 <<= 1;
  364     pof2 >>=1;
  365 
  366     rem = comm_size - pof2;
  367 
  368     /* In the non-power-of-two case, all odd-numbered
  369        processes of rank < 2*rem send their data to
  370        (rank-1). These odd-numbered processes no longer
  371        participate in the algorithm until the very end. The
  372        remaining processes form a nice power-of-two. 
  373 
  374        Note that in MPI_Allreduce we have the even-numbered processes
  375        send data to odd-numbered processes. That is better for
  376        non-commutative operations because it doesn't require a
  377        buffer copy. However, for MPI_Reduce, the most common case
  378        is commutative operations with root=0. Therefore we want
  379        even-numbered processes to participate the computation for
  380        the root=0 case, in order to avoid an extra send-to-root
  381        communication after the reduce-scatter. In MPI_Allreduce it
  382        doesn't matter because all processes must get the result. */
  383     
  384     if (rank < 2*rem) {
  385         if (rank % 2 != 0) { /* odd */
  386             mpi_errno = MPIC_Send(recvbuf, count,
  387                                      datatype, rank-1,
  388                                      MPIR_REDUCE_TAG, comm_ptr, errflag);
  389             if (mpi_errno) {
  390                 /* for communication errors, just record the error but continue */
  391                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  392                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  393                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  394             }
  395             
  396             /* temporarily set the rank to -1 so that this
  397                process does not pariticipate in recursive
  398                doubling */
  399             newrank = -1; 
  400         }
  401         else { /* even */
  402             mpi_errno = MPIC_Recv(tmp_buf, count,
  403                                      datatype, rank+1,
  404                                      MPIR_REDUCE_TAG, comm_ptr,
  405                                      MPI_STATUS_IGNORE, errflag);
  406             if (mpi_errno) {
  407                 /* for communication errors, just record the error but continue */
  408                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  409                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  410                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  411             }
  412             
  413             /* do the reduction on received data. */
  414             /* This algorithm is used only for predefined ops
  415                and predefined ops are always commutative. */
  416         mpi_errno = MPIR_Reduce_local_impl(tmp_buf, recvbuf, 
  417                            count, datatype, op);
  418             /* change the rank */
  419             newrank = rank / 2;
  420         }
  421     }
  422     else  /* rank >= 2*rem */
  423         newrank = rank - rem;
  424     
  425     /* for the reduce-scatter, calculate the count that
  426        each process receives and the displacement within
  427        the buffer */
  428 
  429     /* We allocate these arrays on all processes, even if newrank=-1,
  430        because if root is one of the excluded processes, we will
  431        need them on the root later on below. */
  432     MPIU_CHKLMEM_MALLOC(cnts, int *, pof2*sizeof(int), mpi_errno, "counts");
  433     MPIU_CHKLMEM_MALLOC(disps, int *, pof2*sizeof(int), mpi_errno, "displacements");
  434     
  435     if (newrank != -1) {
  436         for (i=0; i<(pof2-1); i++) 
  437             cnts[i] = count/pof2;
  438         cnts[pof2-1] = count - (count/pof2)*(pof2-1);
  439         
  440         disps[0] = 0;
  441         for (i=1; i<pof2; i++)
  442             disps[i] = disps[i-1] + cnts[i-1];
  443         
  444         mask = 0x1;
  445         send_idx = recv_idx = 0;
  446         last_idx = pof2;
  447         while (mask < pof2) {
  448             newdst = newrank ^ mask;
  449             /* find real rank of dest */
  450             dst = (newdst < rem) ? newdst*2 : newdst + rem;
  451             
  452             send_cnt = recv_cnt = 0;
  453             if (newrank < newdst) {
  454                 send_idx = recv_idx + pof2/(mask*2);
  455                 for (i=send_idx; i<last_idx; i++)
  456                     send_cnt += cnts[i];
  457                 for (i=recv_idx; i<send_idx; i++)
  458                     recv_cnt += cnts[i];
  459             }
  460             else {
  461                 recv_idx = send_idx + pof2/(mask*2);
  462                 for (i=send_idx; i<recv_idx; i++)
  463                     send_cnt += cnts[i];
  464                 for (i=recv_idx; i<last_idx; i++)
  465                     recv_cnt += cnts[i];
  466             }
  467             
  468 /*                    printf("Rank %d, send_idx %d, recv_idx %d, send_cnt %d, recv_cnt %d, last_idx %d\n", newrank, send_idx, recv_idx,
  469                   send_cnt, recv_cnt, last_idx);
  470 */
  471             /* Send data from recvbuf. Recv into tmp_buf */ 
  472             mpi_errno = MPIC_Sendrecv((char *) recvbuf +
  473                                          disps[send_idx]*extent,
  474                                          send_cnt, datatype,
  475                                          dst, MPIR_REDUCE_TAG,
  476                                          (char *) tmp_buf +
  477                                          disps[recv_idx]*extent,
  478                                          recv_cnt, datatype, dst,
  479                                          MPIR_REDUCE_TAG, comm_ptr,
  480                                          MPI_STATUS_IGNORE, errflag);
  481             if (mpi_errno) {
  482                 /* for communication errors, just record the error but continue */
  483                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  484                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  485                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  486             }
  487             
  488             /* tmp_buf contains data received in this step.
  489                recvbuf contains data accumulated so far */
  490             
  491             /* This algorithm is used only for predefined ops
  492                and predefined ops are always commutative. */
  493         mpi_errno = MPIR_Reduce_local_impl( 
  494                (char *) tmp_buf + disps[recv_idx]*extent,
  495                        (char *) recvbuf + disps[recv_idx]*extent, 
  496                        recv_cnt, datatype, op );
  497             /* update send_idx for next iteration */
  498             send_idx = recv_idx;
  499             mask <<= 1;
  500 
  501             /* update last_idx, but not in last iteration
  502                because the value is needed in the gather
  503                step below. */
  504             if (mask < pof2)
  505                 last_idx = recv_idx + pof2/mask;
  506         }
  507     }
  508 
  509     /* now do the gather to root */
  510     
  511     /* Is root one of the processes that was excluded from the
  512        computation above? If so, send data from newrank=0 to
  513        the root and have root take on the role of newrank = 0 */ 
  514 
  515     if (root < 2*rem) {
  516         if (root % 2 != 0) {
  517             if (rank == root) {    /* recv */
  518                 /* initialize the arrays that weren't initialized */
  519                 for (i=0; i<(pof2-1); i++) 
  520                     cnts[i] = count/pof2;
  521                 cnts[pof2-1] = count - (count/pof2)*(pof2-1);
  522                 
  523                 disps[0] = 0;
  524                 for (i=1; i<pof2; i++)
  525                     disps[i] = disps[i-1] + cnts[i-1];
  526                 
  527                 mpi_errno = MPIC_Recv(recvbuf, cnts[0], datatype,
  528                                          0, MPIR_REDUCE_TAG, comm_ptr,
  529                                          MPI_STATUS_IGNORE, errflag);
  530                 if (mpi_errno) {
  531                     /* for communication errors, just record the error but continue */
  532                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  533                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  534                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  535                 }
  536                 newrank = 0;
  537                 send_idx = 0;
  538                 last_idx = 2;
  539             }
  540             else if (newrank == 0) {  /* send */
  541                 mpi_errno = MPIC_Send(recvbuf, cnts[0], datatype,
  542                                          root, MPIR_REDUCE_TAG, comm_ptr, errflag);
  543                 if (mpi_errno) {
  544                     /* for communication errors, just record the error but continue */
  545                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  546                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  547                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  548                 }
  549                 newrank = -1;
  550             }
  551             newroot = 0;
  552         }
  553         else newroot = root / 2;
  554     }
  555     else
  556         newroot = root - rem;
  557 
  558     if (newrank != -1) {
  559         j = 0;
  560         mask = 0x1;
  561         while (mask < pof2) {
  562             mask <<= 1;
  563             j++;
  564         }
  565         mask >>= 1;
  566         j--;
  567         while (mask > 0) {
  568             newdst = newrank ^ mask;
  569 
  570             /* find real rank of dest */
  571             dst = (newdst < rem) ? newdst*2 : newdst + rem;
  572             /* if root is playing the role of newdst=0, adjust for
  573                it */
  574             if ((newdst == 0) && (root < 2*rem) && (root % 2 != 0))
  575                 dst = root;
  576             
  577             /* if the root of newdst's half of the tree is the
  578                same as the root of newroot's half of the tree, send to
  579                newdst and exit, else receive from newdst. */
  580 
  581             newdst_tree_root = newdst >> j;
  582             newdst_tree_root <<= j;
  583             
  584             newroot_tree_root = newroot >> j;
  585             newroot_tree_root <<= j;
  586 
  587             send_cnt = recv_cnt = 0;
  588             if (newrank < newdst) {
  589                 /* update last_idx except on first iteration */
  590                 if (mask != pof2/2)
  591                     last_idx = last_idx + pof2/(mask*2);
  592                 
  593                 recv_idx = send_idx + pof2/(mask*2);
  594                 for (i=send_idx; i<recv_idx; i++)
  595                     send_cnt += cnts[i];
  596                 for (i=recv_idx; i<last_idx; i++)
  597                     recv_cnt += cnts[i];
  598             }
  599             else {
  600                 recv_idx = send_idx - pof2/(mask*2);
  601                 for (i=send_idx; i<last_idx; i++)
  602                     send_cnt += cnts[i];
  603                 for (i=recv_idx; i<send_idx; i++)
  604                     recv_cnt += cnts[i];
  605             }
  606             
  607             if (newdst_tree_root == newroot_tree_root) {
  608                 /* send and exit */
  609                 /* printf("Rank %d, send_idx %d, send_cnt %d, last_idx %d\n", newrank, send_idx, send_cnt, last_idx);
  610                    fflush(stdout); */
  611                 /* Send data from recvbuf. Recv into tmp_buf */ 
  612                 mpi_errno = MPIC_Send((char *) recvbuf +
  613                                          disps[send_idx]*extent,
  614                                          send_cnt, datatype,
  615                                          dst, MPIR_REDUCE_TAG,
  616                                          comm_ptr, errflag);
  617                 if (mpi_errno) {
  618                     /* for communication errors, just record the error but continue */
  619                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  620                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  621                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  622                 }
  623                 break;
  624             }
  625             else {
  626                 /* recv and continue */
  627                 /* printf("Rank %d, recv_idx %d, recv_cnt %d, last_idx %d\n", newrank, recv_idx, recv_cnt, last_idx);
  628                    fflush(stdout); */
  629                 mpi_errno = MPIC_Recv((char *) recvbuf +
  630                                          disps[recv_idx]*extent,
  631                                          recv_cnt, datatype, dst,
  632                                          MPIR_REDUCE_TAG, comm_ptr,
  633                                          MPI_STATUS_IGNORE, errflag);
  634                 if (mpi_errno) {
  635                     /* for communication errors, just record the error but continue */
  636                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  637                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  638                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  639                 }
  640             }
  641             
  642             if (newrank > newdst) send_idx = recv_idx;
  643             
  644             mask >>= 1;
  645             j--;
  646         }
  647     }
  648 
  649     /* FIXME does this need to be checked after each uop invocation for
  650        predefined operators? */
  651     /* --BEGIN ERROR HANDLING-- */
  652     if (MPID_THREADPRIV_FIELD(op_errno)) {
  653         mpi_errno = MPID_THREADPRIV_FIELD(op_errno);
  654         goto fn_fail;
  655     }
  656     /* --END ERROR HANDLING-- */
  657 
  658 fn_exit:
  659     MPIU_CHKLMEM_FREEALL();
  660     if (mpi_errno_ret)
  661         mpi_errno = mpi_errno_ret;
  662     else if (*errflag != MPIR_ERR_NONE)
  663         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
  664     return mpi_errno;
  665 fn_fail:
  666     goto fn_exit;
  667 }
  668 
  669 /* This is the default implementation of reduce. The algorithm is:
  670    
  671    Algorithm: MPI_Reduce
  672 
  673    For long messages and for builtin ops and if count >= pof2 (where
  674    pof2 is the nearest power-of-two less than or equal to the number
  675    of processes), we use Rabenseifner's algorithm (see 
  676    http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
  677    This algorithm implements the reduce in two steps: first a
  678    reduce-scatter, followed by a gather to the root. A
  679    recursive-halving algorithm (beginning with processes that are
  680    distance 1 apart) is used for the reduce-scatter, and a binomial tree
  681    algorithm is used for the gather. The non-power-of-two case is
  682    handled by dropping to the nearest lower power-of-two: the first
  683    few odd-numbered processes send their data to their left neighbors
  684    (rank-1), and the reduce-scatter happens among the remaining
  685    power-of-two processes. If the root is one of the excluded
  686    processes, then after the reduce-scatter, rank 0 sends its result to
  687    the root and exits; the root now acts as rank 0 in the binomial tree
  688    algorithm for gather.
  689 
  690    For the power-of-two case, the cost for the reduce-scatter is 
  691    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
  692    gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
  693    total cost is:
  694    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  695 
  696    For the non-power-of-two case, assuming the root is not one of the
  697    odd-numbered processes that get excluded in the reduce-scatter,
  698    Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta + 
  699            n.(1+(p-1)/p).gamma
  700 
  701 
  702    For short messages, user-defined ops, and count < pof2, we use a
  703    binomial tree algorithm for both short and long messages. 
  704 
  705    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  706 
  707 
  708    We use the binomial tree algorithm in the case of user-defined ops
  709    because in this case derived datatypes are allowed, and the user
  710    could pass basic datatypes on one process and derived on another as
  711    long as the type maps are the same. Breaking up derived datatypes
  712    to do the reduce-scatter is tricky.
  713 
  714    FIXME: Per the MPI-2.1 standard this case is not possible.  We
  715    should be able to use the reduce-scatter/gather approach as long as
  716    count >= pof2.  [goodell@ 2009-01-21]
  717 
  718    Possible improvements: 
  719 
  720    End Algorithm: MPI_Reduce
  721 */
  722 
  723 
  724 /* not declared static because a machine-specific function may call this one 
  725    in some cases */
  726 #undef FUNCNAME
  727 #define FUNCNAME MPIR_Reduce_intra
  728 #undef FCNAME
  729 #define FCNAME MPL_QUOTE(FUNCNAME)
  730 int MPIR_Reduce_intra ( 
  731     const void *sendbuf,
  732     void *recvbuf,
  733     int count,
  734     MPI_Datatype datatype,
  735     MPI_Op op,
  736     int root,
  737     MPID_Comm *comm_ptr,
  738     MPIR_Errflag_t *errflag )
  739 {
  740     int mpi_errno = MPI_SUCCESS;
  741     int mpi_errno_ret = MPI_SUCCESS;
  742     int comm_size, is_commutative, type_size, pof2;
  743     int nbytes = 0;
  744     MPID_Op *op_ptr;
  745     MPIU_CHKLMEM_DECL(1);
  746 
  747     if (count == 0) return MPI_SUCCESS;
  748     /* check if multiple threads are calling this collective function */
  749     MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
  750 
  751     if (MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_REDUCE) {
  752     /* is the op commutative? We do SMP optimizations only if it is. */
  753     if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN)
  754         is_commutative = 1;
  755     else {
  756         MPID_Op_get_ptr(op, op_ptr);
  757         is_commutative = (op_ptr->kind == MPID_OP_USER_NONCOMMUTE) ? 0 : 1;
  758     }
  759 
  760     MPID_Datatype_get_size_macro(datatype, type_size);
  761     nbytes = MPIR_CVAR_MAX_SMP_REDUCE_MSG_SIZE ? type_size*count : 0;
  762     if (MPIR_Comm_is_node_aware(comm_ptr) && is_commutative &&
  763         nbytes <= MPIR_CVAR_MAX_SMP_REDUCE_MSG_SIZE) {
  764 
  765         void *tmp_buf = NULL;
  766         MPI_Aint  true_lb, true_extent, extent;
  767 
  768         /* Create a temporary buffer on local roots of all nodes */
  769         if (comm_ptr->node_roots_comm != NULL) {
  770 
  771             MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
  772             MPID_Datatype_get_extent_macro(datatype, extent);
  773 
  774             MPIU_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
  775 
  776             MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)),
  777                                 mpi_errno, "temporary buffer");
  778             /* adjust for potential negative lower bound in datatype */
  779             tmp_buf = (void *)((char*)tmp_buf - true_lb);
  780         }
  781 
  782         /* do the intranode reduce on all nodes other than the root's node */
  783         if (comm_ptr->node_comm != NULL &&
  784             MPIU_Get_intranode_rank(comm_ptr, root) == -1) {
  785             mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
  786                                          op, 0, comm_ptr->node_comm, errflag);
  787             if (mpi_errno) {
  788                 /* for communication errors, just record the error but continue */
  789                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  790                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  791                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  792             }
  793         }
  794 
  795         /* do the internode reduce to the root's node */
  796         if (comm_ptr->node_roots_comm != NULL) {
  797             if (comm_ptr->node_roots_comm->rank != MPIU_Get_internode_rank(comm_ptr, root)) {
  798                 /* I am not on root's node.  Use tmp_buf if we
  799                    participated in the first reduce, otherwise use sendbuf */
  800                 const void *buf = (comm_ptr->node_comm == NULL ? sendbuf : tmp_buf);
  801                 mpi_errno = MPIR_Reduce_impl(buf, NULL, count, datatype,
  802                                              op, MPIU_Get_internode_rank(comm_ptr, root),
  803                                              comm_ptr->node_roots_comm, errflag);
  804                 if (mpi_errno) {
  805                     /* for communication errors, just record the error but continue */
  806                     *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  807                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  808                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  809                 }
  810             }
  811             else { /* I am on root's node. I have not participated in the earlier reduce. */
  812                 if (comm_ptr->rank != root) {
  813                     /* I am not the root though. I don't have a valid recvbuf.
  814                        Use tmp_buf as recvbuf. */
  815 
  816                     mpi_errno = MPIR_Reduce_impl(sendbuf, tmp_buf, count, datatype,
  817                                                  op, MPIU_Get_internode_rank(comm_ptr, root),
  818                                                  comm_ptr->node_roots_comm, errflag);
  819                     if (mpi_errno) {
  820                         /* for communication errors, just record the error but continue */
  821                         *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  822                         MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  823                         MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  824                     }
  825 
  826                     /* point sendbuf at tmp_buf to make final intranode reduce easy */
  827                     sendbuf = tmp_buf;
  828                 }
  829                 else {
  830                     /* I am the root. in_place is automatically handled. */
  831 
  832                     mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
  833                                                  op, MPIU_Get_internode_rank(comm_ptr, root),
  834                                                  comm_ptr->node_roots_comm, errflag);
  835                     if (mpi_errno) {
  836                         /* for communication errors, just record the error but continue */
  837                         *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  838                         MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  839                         MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  840                     }
  841 
  842                     /* set sendbuf to MPI_IN_PLACE to make final intranode reduce easy. */
  843                     sendbuf = MPI_IN_PLACE;
  844                 }
  845             }
  846 
  847         }
  848 
  849         /* do the intranode reduce on the root's node */
  850         if (comm_ptr->node_comm != NULL &&
  851             MPIU_Get_intranode_rank(comm_ptr, root) != -1) { 
  852             mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype,
  853                                          op, MPIU_Get_intranode_rank(comm_ptr, root),
  854                                          comm_ptr->node_comm, errflag);
  855             if (mpi_errno) {
  856                 /* for communication errors, just record the error but continue */
  857                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  858                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  859                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  860             }
  861         }
  862         
  863         goto fn_exit;
  864     }
  865     }
  866 
  867     comm_size = comm_ptr->local_size;
  868 
  869     MPID_Datatype_get_size_macro(datatype, type_size);
  870 
  871     /* find nearest power-of-two less than or equal to comm_size */
  872     pof2 = 1;
  873     while (pof2 <= comm_size) pof2 <<= 1;
  874     pof2 >>=1;
  875 
  876     if ((count*type_size > MPIR_CVAR_REDUCE_SHORT_MSG_SIZE) &&
  877         (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) && (count >= pof2)) {
  878         /* do a reduce-scatter followed by gather to root. */
  879         mpi_errno = MPIR_Reduce_redscat_gather(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
  880         if (mpi_errno) {
  881             /* for communication errors, just record the error but continue */
  882             *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  883             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  884             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  885         }
  886     }
  887     else {
  888         /* use a binomial tree algorithm */ 
  889         mpi_errno = MPIR_Reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag);
  890         if (mpi_errno) {
  891             /* for communication errors, just record the error but continue */
  892             *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  893             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  894             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  895         }
  896     }
  897         
  898 
  899   fn_exit:
  900     /* check if multiple threads are calling this collective function */
  901     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
  902 
  903     MPIU_CHKLMEM_FREEALL();
  904 
  905     if (mpi_errno_ret)
  906         mpi_errno = mpi_errno_ret;
  907     else if (*errflag != MPIR_ERR_NONE)
  908         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
  909     return mpi_errno;
  910   fn_fail:
  911     goto fn_exit;
  912 }
  913 
  914 
  915 
  916 /* Needed in intercommunicator allreduce */
  917 #undef FUNCNAME
  918 #define FUNCNAME MPIR_Reduce_inter
  919 #undef FCNAME
  920 #define FCNAME MPL_QUOTE(FUNCNAME)
  921 int MPIR_Reduce_inter ( 
  922     const void *sendbuf,
  923     void *recvbuf,
  924     int count,
  925     MPI_Datatype datatype,
  926     MPI_Op op,
  927     int root,
  928     MPID_Comm *comm_ptr,
  929     MPIR_Errflag_t *errflag )
  930 {
  931 /*  Intercommunicator reduce.
  932     Remote group does a local intracommunicator
  933     reduce to rank 0. Rank 0 then sends data to root.
  934 
  935     Cost: (lgp+1).alpha + n.(lgp+1).beta
  936 */
  937 
  938     int rank, mpi_errno;
  939     int mpi_errno_ret = MPI_SUCCESS;
  940     MPI_Status status;
  941     MPI_Aint true_extent, true_lb, extent;
  942     void *tmp_buf=NULL;
  943     MPID_Comm *newcomm_ptr = NULL;
  944     MPIU_CHKLMEM_DECL(1);
  945 
  946     if (root == MPI_PROC_NULL) {
  947         /* local processes other than root do nothing */
  948         return MPI_SUCCESS;
  949     }
  950 
  951     MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
  952 
  953 
  954     if (root == MPI_ROOT) {
  955         /* root receives data from rank 0 on remote group */
  956         mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
  957                                  MPIR_REDUCE_TAG, comm_ptr, &status, errflag);
  958         if (mpi_errno) {
  959             /* for communication errors, just record the error but continue */
  960             *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  961             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
  962             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
  963         }
  964     }
  965     else {
  966         /* remote group. Rank 0 allocates temporary buffer, does
  967            local intracommunicator reduce, and then sends the data
  968            to root. */
  969         
  970         rank = comm_ptr->rank;
  971         
  972         if (rank == 0) {
  973             MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
  974 
  975             MPID_Datatype_get_extent_macro(datatype, extent);
  976         /* I think this is the worse case, so we can avoid an assert() 
  977          * inside the for loop */
  978         /* Should MPIU_CHKLMEM_MALLOC do this? */
  979         MPIU_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent));
  980         MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
  981             /* adjust for potential negative lower bound in datatype */
  982             tmp_buf = (void *)((char*)tmp_buf - true_lb);
  983         }
  984         
  985         /* Get the local intracommunicator */
  986         if (!comm_ptr->local_comm) {
  987             mpi_errno = MPIR_Setup_intercomm_localcomm( comm_ptr );
  988             if (mpi_errno) MPIR_ERR_POP(mpi_errno);
  989         }
  990 
  991         newcomm_ptr = comm_ptr->local_comm;
  992         
  993         /* now do a local reduce on this intracommunicator */
  994         mpi_errno = MPIR_Reduce_intra(sendbuf, tmp_buf, count, datatype,
  995                                       op, 0, newcomm_ptr, errflag);
  996         if (mpi_errno) {
  997             /* for communication errors, just record the error but continue */
  998             *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
  999             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
 1000             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
 1001         }
 1002 
 1003         if (rank == 0)
 1004     {
 1005             mpi_errno = MPIC_Send(tmp_buf, count, datatype, root,
 1006                                      MPIR_REDUCE_TAG, comm_ptr, errflag);
 1007             if (mpi_errno) {
 1008                 /* for communication errors, just record the error but continue */
 1009                 *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
 1010                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
 1011                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
 1012             }
 1013         }
 1014     }
 1015 
 1016   fn_exit:
 1017     MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); 
 1018     MPIU_CHKLMEM_FREEALL();
 1019     if (mpi_errno_ret)
 1020         mpi_errno = mpi_errno_ret;
 1021     else if (*errflag != MPIR_ERR_NONE)
 1022         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
 1023     return mpi_errno;
 1024 
 1025   fn_fail:
 1026     goto fn_exit;
 1027 }
 1028 
 1029 
 1030 /* MPIR_Reduce performs an reduce using point-to-point messages.
 1031    This is intended to be used by device-specific implementations of
 1032    reduce.  In all other cases MPIR_Reduce_impl should be
 1033    used. */
 1034 #undef FUNCNAME
 1035 #define FUNCNAME MPIR_Reduce
 1036 #undef FCNAME
 1037 #define FCNAME MPL_QUOTE(FUNCNAME)
 1038 int MPIR_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
 1039                 MPI_Op op, int root, MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag)
 1040 {
 1041     int mpi_errno = MPI_SUCCESS;
 1042         
 1043     if (comm_ptr->comm_kind == MPID_INTRACOMM) {
 1044         /* intracommunicator */
 1045         mpi_errno = MPIR_Reduce_intra(sendbuf, recvbuf, count, datatype,
 1046                                       op, root, comm_ptr, errflag);
 1047         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 1048     } else {
 1049         /* intercommunicator */
 1050         mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype,
 1051                                       op, root, comm_ptr, errflag);
 1052         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 1053     }
 1054 
 1055  fn_exit:
 1056     return mpi_errno;
 1057  fn_fail:
 1058     goto fn_exit;
 1059 }
 1060 
 1061 /* MPIR_Reduce_impl should be called by any internal component that
 1062    would otherwise call MPI_Reduce.  This differs from
 1063    MPIR_Reduce in that this will call the coll_fns version if it
 1064    exists.  This function replaces NMPI_Reduce. */
 1065 #undef FUNCNAME
 1066 #define FUNCNAME MPIR_Reduce_impl
 1067 #undef FCNAME
 1068 #define FCNAME MPL_QUOTE(FUNCNAME)
 1069 int MPIR_Reduce_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
 1070                      MPI_Op op, int root, MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag)
 1071 {
 1072     int mpi_errno = MPI_SUCCESS;
 1073     
 1074     if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Reduce != NULL) {
 1075     /* --BEGIN USEREXTENSION-- */
 1076     mpi_errno = comm_ptr->coll_fns->Reduce(sendbuf, recvbuf, count,
 1077                                                datatype, op, root, comm_ptr, errflag);
 1078         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 1079     /* --END USEREXTENSION-- */
 1080     } else {
 1081         if (comm_ptr->comm_kind == MPID_INTRACOMM) {
 1082             /* intracommunicator */
 1083             mpi_errno = MPIR_Reduce_intra(sendbuf, recvbuf, count, datatype,
 1084                                           op, root, comm_ptr, errflag);
 1085             if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 1086     } else {
 1087             /* intercommunicator */
 1088             mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype,
 1089                                           op, root, comm_ptr, errflag);
 1090             if (mpi_errno) MPIR_ERR_POP(mpi_errno);
 1091         }
 1092     }
 1093 
 1094  fn_exit:
 1095     return mpi_errno;
 1096  fn_fail:
 1097     goto fn_exit;
 1098 }
 1099 
 1100 
 1101 #endif
 1102 
 1103 
 1104 #undef FUNCNAME
 1105 #define FUNCNAME MPI_Reduce
 1106 #undef FCNAME
 1107 #define FCNAME MPL_QUOTE(FUNCNAME)
 1108 
 1109 /*@
 1110 
 1111 MPI_Reduce - Reduces values on all processes to a single value
 1112 
 1113 Input Parameters:
 1114 + sendbuf - address of send buffer (choice) 
 1115 . count - number of elements in send buffer (integer) 
 1116 . datatype - data type of elements of send buffer (handle) 
 1117 . op - reduce operation (handle) 
 1118 . root - rank of root process (integer) 
 1119 - comm - communicator (handle) 
 1120 
 1121 Output Parameters:
 1122 . recvbuf - address of receive buffer (choice, 
 1123  significant only at 'root') 
 1124 
 1125 .N ThreadSafe
 1126 
 1127 .N Fortran
 1128 
 1129 .N collops
 1130 
 1131 .N Errors
 1132 .N MPI_SUCCESS
 1133 .N MPI_ERR_COMM
 1134 .N MPI_ERR_COUNT
 1135 .N MPI_ERR_TYPE
 1136 .N MPI_ERR_BUFFER
 1137 .N MPI_ERR_BUFFER_ALIAS
 1138 
 1139 @*/
 1140 int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
 1141            MPI_Op op, int root, MPI_Comm comm)
 1142 {
 1143     int mpi_errno = MPI_SUCCESS;
 1144     MPID_Comm *comm_ptr = NULL;
 1145     MPIR_Errflag_t errflag = MPIR_ERR_NONE;
 1146     MPID_MPI_STATE_DECL(MPID_STATE_MPI_REDUCE);
 1147 
 1148     MPIR_ERRTEST_INITIALIZED_ORDIE();
 1149     
 1150     MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
 1151     MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_REDUCE);
 1152 
 1153     /* Validate parameters, especially handles needing to be converted */
 1154 #   ifdef HAVE_ERROR_CHECKING
 1155     {
 1156         MPID_BEGIN_ERROR_CHECKS;
 1157         {
 1158         MPIR_ERRTEST_COMM(comm, mpi_errno);
 1159     }
 1160         MPID_END_ERROR_CHECKS;
 1161     }
 1162 #   endif /* HAVE_ERROR_CHECKING */
 1163 
 1164     /* Convert MPI object handles to object pointers */
 1165     MPID_Comm_get_ptr( comm, comm_ptr );
 1166 
 1167     /* Validate parameters and objects (post conversion) */
 1168 #   ifdef HAVE_ERROR_CHECKING
 1169     {
 1170         MPID_BEGIN_ERROR_CHECKS;
 1171         {
 1172         MPID_Datatype *datatype_ptr = NULL;
 1173             MPID_Op *op_ptr = NULL;
 1174             int rank;
 1175         
 1176             MPID_Comm_valid_ptr( comm_ptr, mpi_errno, FALSE );
 1177             if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1178 
 1179         if (comm_ptr->comm_kind == MPID_INTRACOMM) {
 1180         MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno);
 1181 
 1182                 MPIR_ERRTEST_COUNT(count, mpi_errno);
 1183                 MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
 1184                 if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
 1185                     MPID_Datatype_get_ptr(datatype, datatype_ptr);
 1186                     MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
 1187                     if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1188                     MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
 1189                     if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1190                 }
 1191 
 1192                 if (sendbuf != MPI_IN_PLACE)
 1193                     MPIR_ERRTEST_USERBUFFER(sendbuf,count,datatype,mpi_errno);
 1194 
 1195                 rank = comm_ptr->rank;
 1196                 if (rank == root) {
 1197                     MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, count, mpi_errno);
 1198                     MPIR_ERRTEST_USERBUFFER(recvbuf,count,datatype,mpi_errno);
 1199                     if (count != 0 && sendbuf != MPI_IN_PLACE) {
 1200                         MPIR_ERRTEST_ALIAS_COLL(sendbuf, recvbuf, mpi_errno);
 1201                     }
 1202                 }
 1203                 else
 1204                     MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, count, mpi_errno);
 1205             }
 1206 
 1207         if (comm_ptr->comm_kind == MPID_INTERCOMM) {
 1208         MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno);
 1209 
 1210                 if (root == MPI_ROOT) {
 1211                     MPIR_ERRTEST_COUNT(count, mpi_errno);
 1212                     MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
 1213                     if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
 1214                         MPID_Datatype_get_ptr(datatype, datatype_ptr);
 1215                         MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
 1216                         if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1217                         MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
 1218                         if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1219                     }
 1220                     MPIR_ERRTEST_RECVBUF_INPLACE(recvbuf, count, mpi_errno);
 1221                     MPIR_ERRTEST_USERBUFFER(recvbuf,count,datatype,mpi_errno);
 1222                 }
 1223                 
 1224                 else if (root != MPI_PROC_NULL) {
 1225                     MPIR_ERRTEST_COUNT(count, mpi_errno);
 1226                     MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
 1227                     if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
 1228                         MPID_Datatype_get_ptr(datatype, datatype_ptr);
 1229                         MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno );
 1230                         if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1231                         MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno );
 1232                         if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1233                     }
 1234                     MPIR_ERRTEST_SENDBUF_INPLACE(sendbuf, count, mpi_errno);
 1235                     MPIR_ERRTEST_USERBUFFER(sendbuf,count,datatype,mpi_errno);
 1236                 }
 1237             }
 1238 
 1239         MPIR_ERRTEST_OP(op, mpi_errno);
 1240 
 1241             if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1242             if (HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) {
 1243                 MPID_Op_get_ptr(op, op_ptr);
 1244                 MPID_Op_valid_ptr( op_ptr, mpi_errno );
 1245             }
 1246             if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
 1247                 mpi_errno = 
 1248                     ( * MPIR_OP_HDL_TO_DTYPE_FN(op) )(datatype); 
 1249             }
 1250         if (mpi_errno != MPI_SUCCESS) goto fn_fail;
 1251         }
 1252         MPID_END_ERROR_CHECKS;
 1253     }
 1254 #   endif /* HAVE_ERROR_CHECKING */
 1255 
 1256     /* ... body of routine ...  */
 1257 
 1258     mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, &errflag);
 1259     if (mpi_errno) goto fn_fail;
 1260 
 1261 #ifdef _OSU_MVAPICH_
 1262     if (mv2_use_osu_collectives) {
 1263         mpi_errno = mv2_increment_shmem_coll_counter(comm_ptr);
 1264         if(comm_ptr->dev.ch.allgather_comm_ok == 0) {
 1265             mpi_errno = mv2_increment_allgather_coll_counter(comm_ptr);
 1266             if (mpi_errno) {
 1267                 MPIR_ERR_POP(mpi_errno);
 1268             }
 1269         }
 1270         if (mpi_errno) {
 1271             MPIR_ERR_POP(mpi_errno);
 1272         }
 1273     }
 1274 #endif /* _OSU_MVAPICH_ */
 1275 
 1276     
 1277     /* ... end of body of routine ... */
 1278     
 1279   fn_exit:
 1280     MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_REDUCE);
 1281     MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
 1282     return mpi_errno;
 1283 
 1284   fn_fail:
 1285     /* --BEGIN ERROR HANDLING-- */
 1286 #   ifdef HAVE_ERROR_CHECKING
 1287     {
 1288     mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, 
 1289                      FCNAME, __LINE__, MPI_ERR_OTHER,
 1290     "**mpi_reduce", "**mpi_reduce %p %p %d %D %O %d %C", sendbuf, recvbuf, 
 1291                      count, datatype, op, root, comm);
 1292     }
 1293 #   endif
 1294     mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
 1295     goto fn_exit;
 1296     /* --END ERROR HANDLING-- */
 1297 }