"Fossies" - the Fresh Open Source Software Archive

Member "mvapich2-2.3.2/src/mpid/ch3/channels/mrail/src/gen2/ibv_send.c" (8 Aug 2019, 72993 Bytes) of package /linux/misc/mvapich2-2.3.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "ibv_send.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.3.1_vs_2.3.2.

    1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
    2 /*
    3  *  (C) 2001 by Argonne National Laboratory.
    4  *      See COPYRIGHT in top-level directory.
    5  */
    6 
    7 /* Copyright (c) 2001-2019, The Ohio State University. All rights
    8  * reserved.
    9  *
   10  * This file is part of the MVAPICH2 software package developed by the
   11  * team members of The Ohio State University's Network-Based Computing
   12  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
   13  *
   14  * For detailed copyright and licensing information, please refer to the
   15  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
   16  *
   17  */
   18 
   19 #include "mpichconf.h"
   20 #include "mpiimpl.h"
   21 #include <mpimem.h>
   22 #include "rdma_impl.h"
   23 #include "ibv_impl.h"
   24 #include "vbuf.h"
   25 #include "upmi.h"
   26 #include "mpiutil.h"
   27 #include "dreg.h"
   28 #include "debug_utils.h"
   29 #if defined(_MCST_SUPPORT_)
   30 #include "ibv_mcast.h"
   31 #endif 
   32 
   33 #undef DEBUG_PRINT
   34 #ifdef DEBUG
   35 #define DEBUG_PRINT(args...) \
   36 do {                                                          \
   37     int rank;                                                 \
   38     UPMI_GET_RANK(&rank);                                      \
   39     fprintf(stderr, "[%d][%s:%d] ", rank, __FILE__, __LINE__);\
   40     fprintf(stderr, args);                                    \
   41 } while (0)
   42 #else
   43 #define DEBUG_PRINT(args...)
   44 #endif
   45 
   46 MPIR_T_PVAR_ULONG_COUNTER_DECL_EXTERN(MV2, mv2_vbuf_allocated);
   47 MPIR_T_PVAR_ULONG_COUNTER_DECL_EXTERN(MV2, mv2_vbuf_freed);
   48 MPIR_T_PVAR_ULONG_LEVEL_DECL_EXTERN(MV2, mv2_vbuf_available);
   49 MPIR_T_PVAR_ULONG_COUNTER_DECL_EXTERN(MV2, mv2_ud_vbuf_allocated);
   50 MPIR_T_PVAR_ULONG_COUNTER_DECL_EXTERN(MV2, mv2_ud_vbuf_freed);
   51 MPIR_T_PVAR_ULONG_LEVEL_DECL_EXTERN(MV2, mv2_ud_vbuf_available);
   52 
    53 #define INCR_EXT_SENDQ_SIZE(_c,_rail) do {                  \
    54     ++ rdma_global_ext_sendq_size;                          \
    55     ++ (_c)->mrail.rails[(_rail)].ext_sendq_size; } while (0)
   56 
    57 #define DECR_EXT_SENDQ_SIZE(_c,_rail) do {                  \
    58     -- rdma_global_ext_sendq_size;                          \
    59     -- (_c)->mrail.rails[(_rail)].ext_sendq_size; } while (0)
   60 
   61 static inline vbuf * MRAILI_Get_Vbuf(MPIDI_VC_t * vc, size_t pkt_len);
   62 static inline int MPIDI_CH3I_MRAILI_Fast_rdma_ok(MPIDI_VC_t * vc, MPIDI_msg_sz_t len);
   63 
   64 static inline int MRAILI_Coalesce_ok(MPIDI_VC_t * vc, int rail)
   65 {
   66     if(unlikely(rdma_use_coalesce && 
   67             (vc->mrail.outstanding_eager_vbufs >= rdma_coalesce_threshold || 
   68                vc->mrail.rails[rail].send_wqes_avail == 0) &&
   69          (mv2_MPIDI_CH3I_RDMA_Process.has_srq || 
   70           (vc->mrail.srp.credits[rail].remote_credit > 0 && 
    71            vc->mrail.srp.credits[rail].backlog.len == 0)))) { /* backlog empty */
   72         return 1;
   73     }
   74 
   75     return 0;
   76 }
   77 
   78 /* to handle Send Q overflow, we maintain an extended send queue
    79  * above the HCA.  This permits us to have a virtually unlimited send Q depth
   80  * (limited by number of vbufs available for send)
   81  */
   82 #undef FUNCNAME
   83 #define FUNCNAME MRAILI_Ext_sendq_enqueue
   84 #undef FCNAME
   85 #define FCNAME MPL_QUOTE(FUNCNAME)
   86 static inline void MRAILI_Ext_sendq_enqueue(MPIDI_VC_t *c,
   87                                             int rail, 
   88                                             vbuf * v)          
   89 {
   90     MPIDI_STATE_DECL(MPID_STATE_MRAILI_EXT_SENDQ_ENQUEUE);
   91     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_EXT_SENDQ_ENQUEUE);
   92 
   93     v->desc.next = NULL;
   94     
   95     if (c->mrail.rails[rail].ext_sendq_head == NULL) {
   96         c->mrail.rails[rail].ext_sendq_head = v;
   97     } else {                                     
   98         c->mrail.rails[rail].ext_sendq_tail->desc.next = v;
   99     }
  100     c->mrail.rails[rail].ext_sendq_tail = v;  
  101     DEBUG_PRINT("[ibv_send] enqueue, head %p, tail %p\n", 
  102             c->mrail.rails[rail].ext_sendq_head, 
  103             c->mrail.rails[rail].ext_sendq_tail); 
  104 
   105     INCR_EXT_SENDQ_SIZE(c, rail);
  106 
  107     if (c->mrail.rails[rail].ext_sendq_size > rdma_rndv_ext_sendq_size) {
  108 #ifdef _ENABLE_CUDA_
  109         if (!rdma_enable_cuda)
  110 #endif
  111         {
  112             c->force_rndv = 1;
  113         }
  114     }
  115 
  116     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_EXT_SENDQ_ENQUEUE);
  117 }
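/* NOTE: illustrative sketch, not from the original source. Every post
 * site in this file guards against WQE exhaustion with the same pattern,
 * falling back to the extended send queue maintained above:
 *
 *     if (vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow) {
 *         --vc->mrail.rails[rail].send_wqes_avail;
 *         IBV_POST_SR(v, vc, rail, "ibv_post_sr");
 *     } else {
 *         MRAILI_Ext_sendq_enqueue(vc, rail, v);  // drained later, in order
 *         return MPI_MRAIL_MSG_QUEUED;
 *     }
 */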
  118 
  119 int check_cq_overflow_for_ib(MPIDI_VC_t *c, int rail)
  120 {
  121     return 0;
  122 }
  123 
  124 int check_cq_overflow_for_iwarp(MPIDI_VC_t *c, int rail)
  125 {
  126     char cq_overflow = 0;
  127 
  128     if(rdma_iwarp_use_multiple_cq) {
  129       if ((NULL != c->mrail.rails[rail].send_cq_hndl) &&
  130           (mv2_MPIDI_CH3I_RDMA_Process.global_used_send_cq >= 
  131            rdma_default_max_cq_size)) {
   132           /* We are monitoring CQs and there is CQ overflow */
  133           cq_overflow = 1;
  134       }
  135     } else {
  136       if ((NULL != c->mrail.rails[rail].send_cq_hndl) &&
  137           ((mv2_MPIDI_CH3I_RDMA_Process.global_used_send_cq +
  138             mv2_MPIDI_CH3I_RDMA_Process.global_used_recv_cq) >= 
  139             rdma_default_max_cq_size)) {
   140           /* We are monitoring CQs and there is CQ overflow */
  141           cq_overflow = 1; 
  142       }
  143     }
  144 
  145     return cq_overflow;
  146 }
  147 
  148 /* dequeue and send as many as we can from the extended send queue
   149  * this is called by each function that may post a send, prior to attempting
   150  * that send; hence the ordering of sends is maintained
  151  */
  152 #undef FUNCNAME
  153 #define FUNCNAME MRAILI_Ext_sendq_send
  154 #undef FCNAME
  155 #define FCNAME MPL_QUOTE(FUNCNAME)
  156 static inline void MRAILI_Ext_sendq_send(MPIDI_VC_t *c, int rail)    
  157 {
  158     vbuf *v;
  159     char cq_overflow = 0;
  160 
  161     MPIDI_STATE_DECL(MPID_STATE_MRAILI_EXT_SENDQ_SEND);
  162     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_EXT_SENDQ_SEND);
  163 
  164 #ifdef _ENABLE_XRC_
  165     MPIU_Assert (!USE_XRC || VC_XST_ISUNSET (c, XF_INDIRECT_CONN));
  166 #endif
  167 
  168     cq_overflow = check_cq_overflow(c, rail);
  169 
  170     while (c->mrail.rails[rail].send_wqes_avail
  171             && !cq_overflow
  172             && c->mrail.rails[rail].ext_sendq_head) {
  173         v = c->mrail.rails[rail].ext_sendq_head;
  174         c->mrail.rails[rail].ext_sendq_head = v->desc.next;
  175         if (v == c->mrail.rails[rail].ext_sendq_tail) {
  176             c->mrail.rails[rail].ext_sendq_tail = NULL;
  177         }
  178         v->desc.next = NULL;
  179         -- c->mrail.rails[rail].send_wqes_avail;                
  180 
   181         DECR_EXT_SENDQ_SIZE(c, rail);
  182 
  183         if (unlikely(1 == v->coalesce)) {
  184             DEBUG_PRINT("Sending coalesce vbuf %p\n", v);
  185             MPIDI_CH3I_MRAILI_Pkt_comm_header *p = v->pheader;
  186             vbuf_init_send(v, v->content_size, v->rail);
  187 
  188             p->seqnum = v->seqnum;
  189 
  190             if(c->mrail.coalesce_vbuf == v) {
  191                 c->mrail.coalesce_vbuf = NULL;
  192             }
  193         } 
  194 
  195         IBV_POST_SR(v, c, rail, "Mrail_post_sr (MRAILI_Ext_sendq_send)");
  196     }
  197 
  198     DEBUG_PRINT( "[ibv_send] dequeue, head %p, tail %p\n",
  199         c->mrail.rails[rail].ext_sendq_head,
  200         c->mrail.rails[rail].ext_sendq_tail);
  201 
  202     if (c->mrail.rails[rail].ext_sendq_size <= rdma_rndv_ext_sendq_size) {
  203         c->force_rndv = 0;
  204     }
  205 
  206     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_EXT_SENDQ_SEND);
  207 }
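/* NOTE: illustrative sketch, not from the original source. The drain
 * discipline above pairs with send completions: when a completion frees a
 * WQE, the extended queue is drained before anything new is posted, so
 * FIFO ordering is preserved (compare MRAILI_Process_send below):
 *
 *     ++orig_vc->mrail.rails[v->rail].send_wqes_avail;   // completion
 *     if (orig_vc->mrail.rails[v->rail].ext_sendq_head)
 *         MRAILI_Ext_sendq_send(orig_vc, v->rail);       // backlog first
 */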
  208 
  209 #define FLUSH_SQUEUE(_vc) {                                           \
  210     if(NULL != (_vc)->mrail.coalesce_vbuf) {                          \
  211         MRAILI_Ext_sendq_send(_vc, (_vc)->mrail.coalesce_vbuf->rail); \
  212     }                                                                 \
  213 }
  214 
  215 #define FLUSH_RAIL(_vc,_rail) {                                       \
  216     if(unlikely(NULL != (_vc)->mrail.coalesce_vbuf &&                 \
  217             (_vc)->mrail.coalesce_vbuf->rail == _rail)) {             \
  218         MRAILI_Ext_sendq_send(_vc, (_vc)->mrail.coalesce_vbuf->rail); \
  219         (_vc)->mrail.coalesce_vbuf = NULL;                            \
  220     }                                                                 \
  221 }
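/* NOTE: illustrative sketch, not from the original source. FLUSH_RAIL is
 * invoked immediately before posting a new descriptor so that a pending
 * coalesce vbuf on the same rail goes out first and cannot be reordered
 * behind the new send:
 *
 *     FLUSH_RAIL(vc, rail);              // push out the coalesce vbuf
 *     IBV_POST_SR(v, vc, rail, "...");   // then post the new descriptor
 */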
  222 
  223 
  224 #undef FUNCNAME
  225 #define FUNCNAME MPIDI_CH3I_RDMA_put_datav
  226 #undef FCNAME
  227 #define FCNAME MPL_QUOTE(FUNCNAME)
  228 int MPIDI_CH3I_RDMA_put_datav(MPIDI_VC_t * vc, MPL_IOV * iov, int n,
  229                               int *num_bytes_ptr)
  230 {
  231     int mpi_errno = MPI_SUCCESS;
   232     /* all variables must be declared before the state declarations */
  233     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT_DATAV);
  234     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT_DATAV);
  235 
  236     /* Insert implementation here */
   237     PRINT_ERROR("MPIDI_CH3I_RDMA_put_datav is not implemented\n");
  238     exit(EXIT_FAILURE);
  239 
  240     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT_DATAV);
  241     return mpi_errno;
  242 }
  243 
  244 #undef FUNCNAME
  245 #define FUNCNAME MPIDI_CH3I_RDMA_read_datav
  246 #undef FCNAME
  247 #define FCNAME MPL_QUOTE(FUNCNAME)
  248 int MPIDI_CH3I_RDMA_read_datav(MPIDI_VC_t * recv_vc_ptr, MPL_IOV * iov,
  249                                int iovlen, int
  250                                *num_bytes_ptr)
  251 {
  252     int mpi_errno = MPI_SUCCESS;
   253     /* all variables must be declared before the state declarations */
  254     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_RDMA_READ_DATAV);
  255     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_RDMA_READ_DATAV);
  256 
  257     /* Insert implementation here */
   258     PRINT_ERROR("MPIDI_CH3I_RDMA_read_datav is not implemented\n");
  259     exit(EXIT_FAILURE);
  260 
  261     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_RDMA_READ_DATAV);
  262     return mpi_errno;
  263 }
  264 
  265 #undef FUNCNAME
   266 #define FUNCNAME MRAILI_Fast_rdma_fill_start_buf
  267 #undef FCNAME
  268 #define FCNAME MPL_QUOTE(FUNCNAME)
  269 static inline int MRAILI_Fast_rdma_fill_start_buf(MPIDI_VC_t * vc,
  270                                     MPL_IOV * iov, int n_iov,
  271                                     int *num_bytes_ptr)
  272 {
  273     /* FIXME: Here we assume that iov holds a packet header */
  274 #ifndef MV2_DISABLE_HEADER_CACHING 
  275     MPIDI_CH3_Pkt_send_t *cached =  vc->mrail.rfp.cached_outgoing;
  276 #endif
  277     MPIDI_CH3_Pkt_send_t *header;
  278     vbuf *v = &(vc->mrail.rfp.RDMA_send_buf[vc->mrail.rfp.phead_RDMA_send]);
  279     void *vstart;
  280     void *data_buf;
  281 
  282     int len = *num_bytes_ptr, avail = 0; 
  283     int seq_num;
  284     int i;
  285 
  286     header = iov[0].MPL_IOV_BUF;
  287     
  288     seq_num =  header->seqnum = vc->mrail.seqnum_next_tosend;
  289     vc->mrail.seqnum_next_tosend++;
  290 
  291     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_FILL_START_BUF);
  292     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_FILL_START_BUF);
  293 
  294     /* Calculate_IOV_len(iov, n_iov, len); */
  295 
  296     avail   = len;
  297     PACKET_SET_RDMA_CREDIT(header, vc);
  298     *num_bytes_ptr = 0;
  299 
  300     DEBUG_PRINT("Header info, tag %d, rank %d, context_id %d\n", 
  301             header->match.parts.tag, header->match.parts.rank, header->match.parts.context_id);
  302 #ifndef MV2_DISABLE_HEADER_CACHING 
  303 
  304     if ((header->type == MPIDI_CH3_PKT_EAGER_SEND) &&
  305         (len - sizeof(MPIDI_CH3_Pkt_eager_send_t) <= MAX_SIZE_WITH_HEADER_CACHING) &&
  306         (header->match.parts.tag == cached->match.parts.tag) &&
  307         (header->match.parts.rank == cached->match.parts.rank) &&
  308         (header->match.parts.context_id == cached->match.parts.context_id) &&
  309         (header->vbuf_credit == cached->vbuf_credit) &&
  310         (header->remote_credit == cached->remote_credit) &&
  311         (header->rdma_credit == cached->rdma_credit)) {
  312         /* change the header contents */
  313         ++vc->mrail.rfp.cached_hit;
  314 
  315         if (header->sender_req_id == cached->sender_req_id) {
  316             MPIDI_CH3I_MRAILI_Pkt_fast_eager *fast_header;
  317             vstart = v->buffer;
  318 
  319             /*
  320             DEBUG_PRINT 
  321                 ("[send: fill buf], head cached, head_flag %p, vstart %p, length %d",
  322                  &v->head_flag, vstart,
  323                  len - sizeof(MPIDI_CH3_Pkt_eager_send_t) + 
  324          sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager));
  325                  */
  326     
  327             fast_header = vstart;
  328             fast_header->type = MPIDI_CH3_PKT_FAST_EAGER_SEND;
  329             fast_header->bytes_in_pkt = len - sizeof(MPIDI_CH3_Pkt_eager_send_t);
  330             fast_header->seqnum = seq_num;
  331             v->pheader = fast_header;
  332             data_buf = (void *) ((unsigned long) vstart +
  333                                  sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager));
  334    
   335             if (iov[0].MPL_IOV_LEN - sizeof(MPIDI_CH3_Pkt_eager_send_t))
   336                 MPIU_Memcpy(data_buf, (void *)((uintptr_t)iov[0].MPL_IOV_BUF +
   337                     sizeof(MPIDI_CH3_Pkt_eager_send_t)),
   338                     iov[0].MPL_IOV_LEN - sizeof(MPIDI_CH3_Pkt_eager_send_t));
   339 
   340             data_buf = (void *)((uintptr_t)data_buf + iov[0].MPL_IOV_LEN -
   341                 sizeof(MPIDI_CH3_Pkt_eager_send_t));
   342 
  343             *num_bytes_ptr += sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager);
  344             avail -= sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager);
  345         } else {
  346             MPIDI_CH3I_MRAILI_Pkt_fast_eager_with_req *fast_header;
  347             vstart = v->buffer;
  348 
  349             DEBUG_PRINT
  350                 ("[send: fill buf], head cached, head_flag %p, vstart %p, length %d\n",
  351                  &v->head_flag, vstart,
  352                  len - sizeof(MPIDI_CH3_Pkt_eager_send_t) + 
  353          sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager_with_req));
  354              
  355             fast_header = vstart;
  356             fast_header->type = MPIDI_CH3_PKT_FAST_EAGER_SEND_WITH_REQ;
  357             fast_header->bytes_in_pkt = len - sizeof(MPIDI_CH3_Pkt_eager_send_t);
  358             fast_header->seqnum = seq_num;
  359             fast_header->sender_req_id = header->sender_req_id;
  360             cached->sender_req_id = header->sender_req_id;
  361             v->pheader = fast_header;
  362             data_buf =
  363                 (void *) ((unsigned long) vstart +
  364                           sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager_with_req));
   365             if (iov[0].MPL_IOV_LEN - sizeof(MPIDI_CH3_Pkt_eager_send_t))
   366                 MPIU_Memcpy(data_buf, (void *)((uintptr_t)iov[0].MPL_IOV_BUF +
   367                     sizeof(MPIDI_CH3_Pkt_eager_send_t)),
   368                     iov[0].MPL_IOV_LEN - sizeof(MPIDI_CH3_Pkt_eager_send_t));
   369 
   370             data_buf = (void *)((uintptr_t)data_buf + iov[0].MPL_IOV_LEN -
   371                 sizeof(MPIDI_CH3_Pkt_eager_send_t));
   372 
  373             *num_bytes_ptr += sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager_with_req);
  374             avail -= sizeof(MPIDI_CH3I_MRAILI_Pkt_fast_eager_with_req);
  375         }
  376     } else
  377 #endif
  378     {
  379         vstart = v->buffer;
  380         DEBUG_PRINT
  381             ("[send: fill buf], head not cached, v %p, vstart %p, length %d, header size %d\n",
  382              v, vstart, len, iov[0].MPL_IOV_LEN);
  383         MPIU_Memcpy(vstart, header, iov[0].MPL_IOV_LEN);
  384 #ifndef MV2_DISABLE_HEADER_CACHING 
  385         if (header->type == MPIDI_CH3_PKT_EAGER_SEND &&
  386             ((len - sizeof(MPIDI_CH3_Pkt_eager_send_t)) <= MAX_SIZE_WITH_HEADER_CACHING)) {
  387             MPIU_Memcpy(cached, header, sizeof(MPIDI_CH3_Pkt_eager_send_t));
  388             ++vc->mrail.rfp.cached_miss;
  389         }
  390 #endif
  391         data_buf = (void *) ((unsigned long) vstart + iov[0].MPL_IOV_LEN);
  392         *num_bytes_ptr += iov[0].MPL_IOV_LEN;
  393         avail -= iov[0].MPL_IOV_LEN;
  394         v->pheader = vstart;
  395     }
  396 
  397     
  398     /* We have filled the header, it is time to fit in the actual data */
  399 #ifdef _ENABLE_CUDA_
  400     if (rdma_enable_cuda && n_iov > 1 && is_device_buffer(iov[1].MPL_IOV_BUF)) {
  401         /* in the case of GPU buffers, there is only one data iov, if data is non-contiguous
  402          * it should have been packed before this */
  403         MPIU_Assert(n_iov == 2);
  404 
  405         MPIU_Memcpy_CUDA(data_buf,
  406                 iov[1].MPL_IOV_BUF,
  407                 iov[1].MPL_IOV_LEN,
  408                 cudaMemcpyDeviceToHost);
  409         *num_bytes_ptr += iov[1].MPL_IOV_LEN;
  410         avail -= iov[1].MPL_IOV_LEN;
  411 
  412         MPIU_Assert(avail >= 0);
  413     } else
  414 #endif
  415     {
  416         for (i = 1; i < n_iov; i++) {
  417             if (avail >= iov[i].MPL_IOV_LEN) {
  418               MPIU_Memcpy(data_buf, iov[i].MPL_IOV_BUF, iov[i].MPL_IOV_LEN);
  419                 data_buf = (void *) ((unsigned long) data_buf + iov[i].MPL_IOV_LEN);
  420                 *num_bytes_ptr += iov[i].MPL_IOV_LEN;
  421                 avail -= iov[i].MPL_IOV_LEN;
  422             } else if (avail > 0) {
  423               MPIU_Memcpy(data_buf, iov[i].MPL_IOV_BUF, avail);
  424                 data_buf = (void *) ((unsigned long) data_buf + avail);
  425                 *num_bytes_ptr += avail;
  426                 avail = 0;
  427                 break;
  428             } else break;
  429         }
  430     }
  431 
  432     DEBUG_PRINT("[send: fill buf], num bytes copied %d\n", *num_bytes_ptr);
  433     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_FILL_START_BUF);
  434     return MPI_SUCCESS;
  435 }
  436 
  437 #undef FUNCNAME
  438 #define FUNCNAME MPIDI_CH3I_MRAILI_Fast_rdma_send_complete
  439 #undef FCNAME
  440 #define FCNAME MPL_QUOTE(FUNCNAME)
  441 /* INOUT: num_bytes_ptr holds the pkt_len as input parameter */
  442 static inline int MPIDI_CH3I_MRAILI_Fast_rdma_send_complete(MPIDI_VC_t * vc,
  443                                               MPL_IOV * iov,
  444                                               int n_iov,
  445                                               int *num_bytes_ptr,
  446                                               vbuf ** vbuf_handle)
  447 {
  448     int rail;
  449     int  post_len;
  450     char cq_overflow = 0;
  451     VBUF_FLAG_TYPE flag;
  452     vbuf *v =
  453         &(vc->mrail.rfp.RDMA_send_buf[vc->mrail.rfp.phead_RDMA_send]);
  454     char *rstart;
  455 
  456     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_SEND_COMPLETE);
  457     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_SEND_COMPLETE);
  458 
  459     rail = MRAILI_Send_select_rail(vc);
  460     MRAILI_Fast_rdma_fill_start_buf(vc, iov, n_iov, num_bytes_ptr);
  461 
  462     post_len = *num_bytes_ptr;
  463     rstart = vc->mrail.rfp.remote_RDMA_buf +
  464             (vc->mrail.rfp.phead_RDMA_send * rdma_fp_buffer_size);
  465     DEBUG_PRINT("[send: rdma_send] local vbuf %p, remote start %p, align size %d\n",
  466                v, rstart, post_len);
  467 
  468     if (++(vc->mrail.rfp.phead_RDMA_send) >= num_rdma_buffer)
  469         vc->mrail.rfp.phead_RDMA_send = 0;
  470 
  471     v->rail = rail;
  472     v->padding = BUSY_FLAG;
  473 
  474     /* requirements for coalescing */
  475     ++vc->mrail.outstanding_eager_vbufs;
  476     v->eager = 1;
  477     v->vc = (void *) vc;
  478 
   479     /* set the tail flag to the content size; use the ALT tag if the stale tail word already equals post_len */
  480     if ((int) *(VBUF_FLAG_TYPE *) (v->buffer + post_len) == post_len) {
  481         flag = (VBUF_FLAG_TYPE) (post_len + FAST_RDMA_ALT_TAG);
  482     } else {
  483         flag = (VBUF_FLAG_TYPE) post_len;
  484     }
  485     /* set head flag */
  486     *v->head_flag = (VBUF_FLAG_TYPE) flag;
  487     /* set tail flag */    
  488     *((VBUF_FLAG_TYPE *)(v->buffer + post_len)) = flag;
  489 
  490     DEBUG_PRINT("incrementing the outstanding eager vbufs: RFP %d\n", vc->mrail.outstanding_eager_vbufs);
  491 
   492     /* cover the head/tail flag bytes so the RDMA write itself signals
   493      * completion; the flag fields were set up during initialization */
  494     post_len += VBUF_FAST_RDMA_EXTRA_BYTES;
  495 
  496     DEBUG_PRINT("[send: rdma_send] lkey %p, rkey %p, len %d, flag %d\n",
  497                 vc->mrail.rfp.RDMA_send_buf_mr[vc->mrail.rails[rail].hca_index]->lkey,
  498                 vc->mrail.rfp.RDMA_remote_buf_rkey, post_len, *v->head_flag);
  499 
  500     VBUF_SET_RDMA_ADDR_KEY(v, post_len, v->head_flag,
  501             vc->mrail.rfp.RDMA_send_buf_mr[vc->mrail.rails[rail].hca_index]->lkey, rstart,
  502             vc->mrail.rfp.RDMA_remote_buf_rkey[vc->mrail.rails[rail].hca_index]);
  503 
  504     XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
  505     FLUSH_RAIL(vc, rail);
   506 #ifdef CRC_CHECK
   507     MPIDI_CH3I_MRAILI_Pkt_comm_header *p = v->pheader;
   508     p->crc = update_crc(1, (void *)((uintptr_t)p + sizeof *p), *v->head_flag - sizeof *p);
   509 #endif
  510 
  511     cq_overflow = check_cq_overflow(vc, rail);
  512 
  513     if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
  514         --vc->mrail.rails[rail].send_wqes_avail;
  515         *vbuf_handle = v;
  516 
  517         IBV_POST_SR(v, vc, rail, "ibv_post_sr (post_fast_rdma)");
  518         DEBUG_PRINT("[send:post rdma] desc posted\n");
  519     } else {
  520         DEBUG_PRINT("[send: rdma_send] Warning! no send wqe or send cq available\n");
  521         MRAILI_Ext_sendq_enqueue(vc, rail, v);
  522         *vbuf_handle = v;
  523         return MPI_MRAIL_MSG_QUEUED;
  524     }
  525 
  526     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_SEND_COMPLETE);
  527     return MPI_SUCCESS;
  528 }
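/* NOTE: illustrative sketch, not from the original source, of how the
 * receive side is assumed to use the flags written above: the same flag
 * value is placed at the head and at buffer + post_len, so polling until
 * both words match detects that the RDMA write has landed completely:
 *
 *     VBUF_FLAG_TYPE head = *v->head_flag;        // encodes the length
 *     VBUF_FLAG_TYPE tail = *(VBUF_FLAG_TYPE *)(v->buffer + len);
 *     if (head != 0 && head == tail) {
 *         // RDMA write landed completely; len is head, or
 *         // head - FAST_RDMA_ALT_TAG when the ALT tag was needed
 *     }
 */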
  529 
  530 #undef FUNCNAME
  531 #define FUNCNAME MPIDI_CH3I_MRAILI_Fast_rdma_ok
  532 #undef FCNAME
  533 #define FCNAME MPL_QUOTE(FUNCNAME)
  534 static inline int MPIDI_CH3I_MRAILI_Fast_rdma_ok(MPIDI_VC_t * vc, MPIDI_msg_sz_t len)
  535 {
  536     int i = 0;
  537 
  538     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_OK);
  539     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_OK);
  540 
  541     if(unlikely(vc->tmp_dpmvc)) {
  542         return 0;
  543     }
  544     
  545 #ifdef _ENABLE_UD_
   546     if (rdma_enable_hybrid)
   547     {
   548         if (unlikely(!(vc->mrail.state & MRAILI_RC_CONNECTED))) {
   549             return 0;
   550         }
   551     }
  552 #endif /* _ENABLE_UD_ */
  553 
  554     if (unlikely(len > MRAIL_MAX_RDMA_FP_SIZE)) {
  555         return 0;
  556     }
  557 
  558     if (unlikely(num_rdma_buffer < 2
  559         || vc->mrail.rfp.phead_RDMA_send == vc->mrail.rfp.ptail_RDMA_send
  560         || vc->mrail.rfp.RDMA_send_buf[vc->mrail.rfp.phead_RDMA_send].padding == BUSY_FLAG
  561         || MRAILI_Coalesce_ok(vc, 0))) /* We can only coalesce with send/recv. */
  562     {
  563         MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_OK);
  564         return 0;
  565     }
  566 
  567     if (unlikely(!mv2_MPIDI_CH3I_RDMA_Process.has_srq)) {
  568         for (i = 0; i < rdma_num_rails; i++)
  569         {
  570             if (vc->mrail.srp.credits[i].backlog.len != 0)
  571             {
  572                 MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_OK);
   573                 return 0;
  574             }
  575         }
  576     }
  577 
  578     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_FAST_RDMA_OK);
  579     return 1;
  580 } 
  581 
  582 #undef FUNCNAME
  583 #define FUNCNAME mv2_post_srq_buffers
  584 #undef FCNAME
  585 #define FCNAME MPL_QUOTE(FUNCNAME)
  586 int mv2_post_srq_buffers(int num_bufs, int hca_num)
  587 {
  588     int i = 0;
  589     vbuf* v = NULL;
  590     struct ibv_recv_wr* bad_wr = NULL;
  591     MPIDI_STATE_DECL(MPID_STATE_POST_SRQ_BUFFERS);
  592     MPIDI_FUNC_ENTER(MPID_STATE_POST_SRQ_BUFFERS);
  593 
  594     if (num_bufs > mv2_srq_fill_size)
  595     {
  596         ibv_va_error_abort(
  597             GEN_ASSERT_ERR,
  598             "Try to post %d to SRQ, max %d\n",
  599             num_bufs,
  600             mv2_srq_fill_size);
  601     }
  602 
  603     for (; i < num_bufs; ++i)
  604     {
  605         if ((v = get_vbuf_by_offset(MV2_RECV_VBUF_POOL_OFFSET)) == NULL)
  606         {
  607             break;
  608         }
  609 
   610         VBUF_INIT_RECV(
   611             v,
   612             VBUF_BUFFER_SIZE,
   613             hca_num * rdma_num_ports * rdma_num_qp_per_port);
   614         v->transport = IB_TRANSPORT_RC;
  615 
  616         if (ibv_post_srq_recv(mv2_MPIDI_CH3I_RDMA_Process.srq_hndl[hca_num], &v->desc.u.rr, &bad_wr))
  617         {
  618             MRAILI_Release_vbuf(v);
  619             break;
  620         }
  621     }
  622 
   623     DEBUG_PRINT("Posted %d buffers to SRQ\n", i);
  624 
  625     MPIDI_FUNC_EXIT(MPID_STATE_POST_SRQ_BUFFERS);
  626     return i;
  627 }
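/* NOTE: illustrative usage sketch, not from the original source. The
 * function returns the number of buffers actually posted, so a caller
 * refilling the SRQ can detect a partial refill:
 *
 *     int posted = mv2_post_srq_buffers(mv2_srq_fill_size, hca_num);
 *     if (posted < mv2_srq_fill_size) {
 *         // vbuf pool exhausted or ibv_post_srq_recv failed; retry later
 *     }
 */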
  628 
  629 #ifdef _ENABLE_UD_
  630 #undef FUNCNAME
  631 #define FUNCNAME mv2_post_ud_recv_buffers
  632 #undef FCNAME
  633 #define FCNAME MPL_QUOTE(FUNCNAME)
  634 int mv2_post_ud_recv_buffers(int num_bufs, mv2_ud_ctx_t *ud_ctx)
  635 {
  636     int i = 0,ret = 0;
  637     vbuf* v = NULL;
  638     struct ibv_recv_wr* bad_wr = NULL;
  639     MPIDI_STATE_DECL(MPID_STATE_POST_RECV_BUFFERS);
  640     MPIDI_FUNC_ENTER(MPID_STATE_POST_RECV_BUFFERS);
  641 
  642     if (num_bufs > rdma_default_max_ud_recv_wqe)
  643     {
  644         ibv_va_error_abort(
  645                 GEN_ASSERT_ERR,
  646                 "Try to post %d to UD recv buffers, max %d\n",
  647                 num_bufs, rdma_default_max_ud_recv_wqe);
  648     }
  649 
  650     for (i = 0; i < num_bufs; ++i)
  651     {
  652         MV2_GET_AND_INIT_UD_VBUF(v);
  653         if (v == NULL)
  654         {
  655             break;
  656         }
  657 
  658         vbuf_init_ud_recv(v, rdma_default_ud_mtu, ud_ctx->hca_num);
  659         v->transport = IB_TRANSPORT_UD;
  660         if (ud_ctx->qp->srq) {
  661             ret = ibv_post_srq_recv(ud_ctx->qp->srq, &v->desc.u.rr, &bad_wr);
  662         } else {
  663             ret = ibv_post_recv(ud_ctx->qp, &v->desc.u.rr, &bad_wr);
  664         }
  665         if (ret)
  666         {
  667             MRAILI_Release_vbuf(v);
  668             break;
  669         }
  670     }
  671 
   672     PRINT_DEBUG(DEBUG_UD_verbose>0, "Posted %d buffers of size:%d to UD QP on HCA %d\n",
   673                 i, rdma_default_ud_mtu, ud_ctx->hca_num);
  674 
  675     MPIDI_FUNC_EXIT(MPID_STATE_POST_RECV_BUFFERS);
  676     return i;
  677 }
  678 
   679 #undef FUNCNAME
   680 #define FUNCNAME post_hybrid_send
  682 #undef FCNAME
  683 #define FCNAME MPL_QUOTE(FUNCNAME)
  684 int post_hybrid_send(MPIDI_VC_t* vc, vbuf* v, int rail)
  685 {
  686     mv2_MPIDI_CH3I_RDMA_Process_t *proc = &mv2_MPIDI_CH3I_RDMA_Process;
  687 
  688     MPIDI_STATE_DECL(MPID_STATE_POST_HYBRID_SEND);
  689     MPIDI_FUNC_ENTER(MPID_STATE_POST_HYBRID_SEND);
  690 
  691     switch (v->transport) {
  692         case IB_TRANSPORT_UD:
  693             vc->mrail.rely.total_messages++;
   694             /* Enable an RC connection if the total number of msgs on the
   695              * UD channel reached a threshold and the total RC connections
   696              * are still below their threshold */
  697             if (!(vc->mrail.state & (MRAILI_RC_CONNECTED | MRAILI_RC_CONNECTING)) 
  698                 && (rdma_ud_num_msg_limit)
  699                 && (vc->mrail.rely.total_messages > rdma_ud_num_msg_limit)
  700                 && ((mv2_MPIDI_CH3I_RDMA_Process.rc_connections + rdma_hybrid_pending_rc_conn)
  701                     < rdma_hybrid_max_rc_conn)
  702                 && vc->mrail.rely.ext_window.head == NULL
  703                 && !(vc->state == MPIDI_VC_STATE_LOCAL_CLOSE || vc->state == MPIDI_VC_STATE_CLOSE_ACKED)) {
   704                 /* This is a hack to create the RC channel using the CM
   705                 ** protocol. Need to handle this by sending REQ/REP on the
   706                 ** UD channel itself */
  707                 vc->ch.state = MPIDI_CH3I_VC_STATE_UNCONNECTED;
  708 #ifdef _ENABLE_XRC_
  709                 if(USE_XRC) {
  710                     VC_XST_CLR (vc, XF_SEND_IDLE);
  711                 }
  712 #endif
  713                 PRINT_DEBUG(DEBUG_UD_verbose>1, "Connection initiated to :%d\n", vc->pg_rank);
  714                 MV2_HYBRID_SET_RC_CONN_INITIATED(vc);
  715             } 
  716             post_ud_send(vc, v, rail, NULL);
  717             break;
  718         case IB_TRANSPORT_RC:
  719             MPIU_Assert(vc->mrail.state & MRAILI_RC_CONNECTED);
  720             if(proc->has_srq) {
  721                 post_srq_send(vc, v, rail);
  722             } else {
  723                 post_send(vc, v, rail);
  724             }
  725             break;
  726         default:
  727             PRINT_DEBUG(DEBUG_UD_verbose>1,"Invalid IB transport protocol\n");
  728             return -1;
  729     }
  730 
  731     MPIDI_FUNC_EXIT(MPID_STATE_POST_HYBRID_SEND);
  732     return 0;
  733 }
  734 #endif /* _ENABLE_UD_ */
  735 
  736 #undef FUNCNAME
  737 #define FUNCNAME mv2_eager_fast_send
  738 #undef FCNAME
  739 #define FCNAME MPL_QUOTE(FUNCNAME)
  740 int mv2_eager_fast_send(MPIDI_VC_t* vc, const void *buf,
  741                         MPIDI_msg_sz_t data_sz, int rank, int tag,
  742                         MPID_Comm *comm, int context_offset, MPID_Request **sreq_p)
  743 {
  744     int rail = 0;
  745     int retval = 0;
  746     vbuf* v = NULL;
  747     int len = 0;
  748     void *ptr = NULL;
  749     MPID_Seqnum_t seqnum;
  750     MPIDI_CH3_Pkt_t *upkt = NULL;
  751     MPIDI_CH3_Pkt_eager_send_t *eager_pkt = NULL;
  752 
  753     rail = MRAILI_Send_select_rail(vc);
  754 
  755     /* Get VBUF */
  756     MRAILI_Get_buffer(vc, v, data_sz+sizeof(MPIDI_CH3_Pkt_eager_send_t));
  757 
  758     /* Point header to start of buffer */
  759     upkt = (MPIDI_CH3_Pkt_t *) v->buffer;
  760     eager_pkt = &((*upkt).eager_send);
  761 
  762     /* Create packet header */
  763     MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
  764     eager_pkt->data_sz                 = data_sz;
  765     eager_pkt->match.parts.tag         = tag;
  766     eager_pkt->match.parts.rank        = comm->rank;
  767     eager_pkt->match.parts.context_id  = comm->context_id + context_offset;
  768 
  769     /* Set sequence number */
  770     MPIDI_VC_FAI_send_seqnum(vc, seqnum);
  771     MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
  772 
  773     /* Copy data */
   774     ptr = (void *)((uintptr_t) v->buffer + sizeof(MPIDI_CH3_Pkt_eager_send_t));
  775 
  776     memcpy(ptr, buf, data_sz);
  777     /* Compute size of pkt */
  778     len = sizeof(MPIDI_CH3_Pkt_eager_send_t) + data_sz;
  779 
  780     /* Initialize other vbuf parameters */
  781     vbuf_init_send(v, len, rail);
  782 
  783     /* Send the packet */
  784     retval = mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, rail);
  785 
  786     return retval;
  787 }
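/* NOTE: illustrative sketch, not from the original source; the exact
 * installation site of these handlers lives elsewhere in the channel
 * code. The fast-send variants are assumed to be dispatched through the
 * per-VC function pointer used as the fallback in mv2_eager_fast_rfp_send
 * below, e.g.:
 *
 *     vc->eager_fast_fn = rdma_use_coalesce ? mv2_eager_fast_coalesce_send
 *                                           : mv2_eager_fast_send;
 */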
  788 
  789 #undef FUNCNAME
  790 #define FUNCNAME mv2_eager_fast_coalesce_send
  791 #undef FCNAME
  792 #define FCNAME MPL_QUOTE(FUNCNAME)
  793 int mv2_eager_fast_coalesce_send(MPIDI_VC_t* vc, const void *buf,
  794                         MPIDI_msg_sz_t data_sz, int rank, int tag,
  795                         MPID_Comm *comm, int context_offset, MPID_Request **sreq_p)
  796 {
  797     int retval = 0;
  798     vbuf* v = NULL;
  799     int len = 0;
  800     void *ptr = NULL;
  801     MPID_Seqnum_t seqnum;
  802     MPIDI_CH3_Pkt_t *upkt = NULL;
  803     MPIDI_CH3_Pkt_eager_send_t *eager_pkt = NULL;
  804 
  805     /* Get VBUF */
  806     v = MRAILI_Get_Vbuf(vc, data_sz+sizeof(MPIDI_CH3_Pkt_eager_send_t));
  807 
  808     /* Point header to start of buffer */
  809     upkt = (MPIDI_CH3_Pkt_t *) (v->buffer + v->content_size);
  810     eager_pkt = &((*upkt).eager_send);
  811 
  812     /* Create packet header */
  813     MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
  814     eager_pkt->data_sz                 = data_sz;
  815     eager_pkt->match.parts.tag         = tag;
  816     eager_pkt->match.parts.rank        = comm->rank;
  817     eager_pkt->match.parts.context_id  = comm->context_id + context_offset;
  818 
  819     /* Set sequence number */
  820     MPIDI_VC_FAI_send_seqnum(vc, seqnum);
  821     MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
  822 
  823     /* Copy data */
   824     ptr = (void *)((uintptr_t) v->buffer + v->content_size + sizeof(MPIDI_CH3_Pkt_eager_send_t));
  825 
  826     memcpy(ptr, buf, data_sz);
  827     /* Compute size of pkt */
  828     len = sizeof(MPIDI_CH3_Pkt_eager_send_t) + data_sz;
  829 
  830     /* Update length */
  831     v->content_size += len;
  832 
  833     /* send the buffer if we aren't trying to coalesce it */
  834     if(likely(vc->mrail.coalesce_vbuf != v))  {
  835         /* Initialize other vbuf parameters */
  836         vbuf_init_send(v, len, v->rail);
  837         /* Send the packet */
  838         retval = mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, v->rail);
  839     } else {
  840         MPIDI_CH3I_MRAILI_Pkt_comm_header *p = (MPIDI_CH3I_MRAILI_Pkt_comm_header *)
  841             (v->buffer + v->content_size - len);
  842 
  843         PACKET_SET_CREDIT(p, vc, v->rail);
  844 #ifdef CRC_CHECK
  845         p->crc = update_crc(1, (void *)((uintptr_t)p+sizeof *p),
  846                                   v->desc.sg_entry.length - sizeof *p);
  847 #endif
  848         v->vc                = (void *) vc;
  849         p->rail        = v->rail;
  850 #ifdef _ENABLE_UD_
  851         if(rdma_enable_hybrid) {
  852                 p->src.rank    = MPIDI_Process.my_pg_rank;
  853         } else
  854 #endif
  855         {
  856                 p->src.vc_addr = vc->mrail.remote_vc_addr;
  857         }
  858     }
  859 
  860     return retval;
  861 }
  862 
  863 #undef FUNCNAME
  864 #define FUNCNAME mv2_eager_fast_rfp_send
  865 #undef FCNAME
  866 #define FCNAME MPL_QUOTE(FUNCNAME)
  867 int mv2_eager_fast_rfp_send(MPIDI_VC_t* vc, const void *buf,
  868                         MPIDI_msg_sz_t data_sz, int rank, int tag,
  869                         MPID_Comm *comm, int context_offset, MPID_Request **sreq_p)
  870 {
  871     /* For short send n_iov is always 2 */
  872     int n_iov = 2;
  873     MPID_Seqnum_t seqnum;
  874     vbuf *buf_handle = NULL;
  875     int num_bytes_ptr = 0;
  876     MPL_IOV iov[2];
  877     MPIDI_CH3_Pkt_t upkt;
  878     MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;
  879 
  880     if (unlikely(!MPIDI_CH3I_MRAILI_Fast_rdma_ok(vc, data_sz+sizeof(*eager_pkt)))) {
  881         return vc->eager_fast_fn(vc, buf, data_sz, rank,
  882                                 tag, comm, context_offset, sreq_p);
  883     }
  884 
  885     /* Create packet header */
  886     MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
  887     eager_pkt->data_sz                 = data_sz;
  888     eager_pkt->match.parts.tag         = tag;
  889     eager_pkt->match.parts.rank        = comm->rank;
  890     eager_pkt->match.parts.context_id  = comm->context_id + context_offset;
  891 
  892     /* Create IOV (header) */
  893     iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)eager_pkt;
  894     iov[0].MPL_IOV_LEN = sizeof(*eager_pkt);
  895     /* Create IOV (data) */
  896     iov[1].MPL_IOV_BUF = (MPL_IOV_BUF_CAST) buf;
  897     iov[1].MPL_IOV_LEN = data_sz;
  898 
  899     /* Compute size of pkt */
  900     num_bytes_ptr = iov[0].MPL_IOV_LEN + iov[1].MPL_IOV_LEN;
  901 
  902     /* Set sequence number */
  903     MPIDI_VC_FAI_send_seqnum(vc, seqnum);
  904     MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
  905 
  906     return MPIDI_CH3I_MRAILI_Fast_rdma_send_complete(vc, iov,
  907                 n_iov, &num_bytes_ptr, &buf_handle);
  908 }
  909 
  910 #undef FUNCNAME
  911 #define FUNCNAME post_srq_send
  912 #undef FCNAME
  913 #define FCNAME MPL_QUOTE(FUNCNAME)
  914 int post_srq_send(MPIDI_VC_t* vc, vbuf* v, int rail)
  915 {
  916     char cq_overflow = 0;
  917     MPIDI_CH3I_MRAILI_Pkt_comm_header *p = v->pheader;
  918     PACKET_SET_CREDIT(p, vc, rail);
  919 
  920     MPIDI_STATE_DECL(MPID_STATE_POST_SRQ_SEND);
  921     MPIDI_FUNC_ENTER(MPID_STATE_POST_SRQ_SEND);
  922 
  923     v->vc = (void *) vc;
  924     p->rail        = rail;
  925 #ifdef _ENABLE_UD_
  926     if(rdma_enable_hybrid) {
  927             p->src.rank    = MPIDI_Process.my_pg_rank;
  928             while (vc->mrail.rails[rail].qp_hndl->state != IBV_QPS_RTS) {
  929                     MPID_Progress_test();
  930             }
  931     } else
  932 #endif
  933     {
  934             p->src.vc_addr = vc->mrail.remote_vc_addr;
  935     }
  936     MPIU_Assert(v->transport == IB_TRANSPORT_RC);
  937     
  938     if (p->type == MPIDI_CH3_PKT_NOOP) {
  939         v->seqnum = p->seqnum = -1;
  940     } else {
  941         v->seqnum = p->seqnum = vc->mrail.seqnum_next_tosend;
  942         vc->mrail.seqnum_next_tosend++;
  943     }
  944     
  945     p->acknum = vc->mrail.seqnum_next_toack;
  946     MARK_ACK_COMPLETED(vc);
  947     
  948     XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
  949 
  950     FLUSH_RAIL(vc, rail);
  951 
  952     cq_overflow = check_cq_overflow(vc, rail);
  953 
  954     if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
  955         --vc->mrail.rails[rail].send_wqes_avail;
  956 
  957         IBV_POST_SR(v, vc, rail, "ibv_post_sr (post_send_desc)");
  958     } else {
  959         MRAILI_Ext_sendq_enqueue(vc, rail, v);
  960         MPIDI_FUNC_EXIT(MPID_STATE_POST_SRQ_SEND);
  961         return MPI_MRAIL_MSG_QUEUED;
  962     }
  963 
  964     MPIDI_FUNC_EXIT(MPID_STATE_POST_SRQ_SEND);
  965     return 0;
  966 }
  967 
  968 #undef FUNCNAME
  969 #define FUNCNAME post_send
  970 #undef FCNAME
  971 #define FCNAME MPL_QUOTE(FUNCNAME)
  972 int post_send(MPIDI_VC_t * vc, vbuf * v, int rail)
  973 {
  974     char cq_overflow = 0;
  975     MPIDI_CH3I_MRAILI_Pkt_comm_header *p = v->pheader;
  976 
  977     MPIDI_STATE_DECL(MPID_STATE_POST_SEND);
  978     MPIDI_FUNC_ENTER(MPID_STATE_POST_SEND);
  979     DEBUG_PRINT(
  980                 "[post send] credit %d,type noop %d, "
  981                 "backlog %d, wqe %d, nb will be %d\n",
  982                 vc->mrail.srp.credits[rail].remote_credit,
  983                 p->type == MPIDI_CH3_PKT_NOOP, 
  984                 vc->mrail.srp.credits[0].backlog.len,
  985                 vc->mrail.rails[rail].send_wqes_avail,
  986                 v->desc.sg_entry.length);
  987 
  988     v->vc = (void *) vc;
  989     p->rail        = rail;
  990 #ifdef _ENABLE_UD_
  991     if(rdma_enable_hybrid) {
  992             p->src.rank = MPIDI_Process.my_pg_rank;
  993     } else
  994 #endif
  995     {
  996             p->src.vc_addr = vc->mrail.remote_vc_addr;
  997     }
  998 
  999     MPIU_Assert(v->transport == IB_TRANSPORT_RC);
 1000    
 1001     if (p->type == MPIDI_CH3_PKT_NOOP) {
 1002         v->seqnum = p->seqnum = -1;
 1003     } else {
 1004         v->seqnum = p->seqnum = vc->mrail.seqnum_next_tosend;
 1005         vc->mrail.seqnum_next_tosend++;
 1006     }
 1007     p->acknum = vc->mrail.seqnum_next_toack;
 1008     MARK_ACK_COMPLETED(vc);
 1009 
 1010     PRINT_DEBUG(DEBUG_UD_verbose>1, "sending seqnum:%d acknum:%d\n",p->seqnum,p->acknum);
 1011 
 1012     if (vc->mrail.srp.credits[rail].remote_credit > 0
 1013         || p->type == MPIDI_CH3_PKT_NOOP) {
 1014 
 1015         PACKET_SET_CREDIT(p, vc, rail);
 1016 #ifdef CRC_CHECK
 1017     p->crc = update_crc(1, (void *)((uintptr_t)p+sizeof *p),
 1018                   v->desc.sg_entry.length - sizeof *p );
 1019 #endif
 1020         if (p->type != MPIDI_CH3_PKT_NOOP)
 1021         {
 1022             --vc->mrail.srp.credits[rail].remote_credit;
 1023         }
 1024 
 1025         v->vc = (void *) vc;
 1026 
 1027         XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
 1028         FLUSH_RAIL(vc, rail);
 1029 
 1030         cq_overflow = check_cq_overflow(vc, rail);
 1031 
 1032         if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
 1033             --vc->mrail.rails[rail].send_wqes_avail;
 1034             IBV_POST_SR(v, vc, rail, "ibv_post_sr (post_send_desc)");
 1035         } else {
 1036             MRAILI_Ext_sendq_enqueue(vc, rail, v);
 1037             MPIDI_FUNC_EXIT(MPID_STATE_POST_SEND);
 1038             return MPI_MRAIL_MSG_QUEUED;
 1039         }
 1040     }
 1041     else
 1042     {
 1043         ibv_backlog_queue_t *q = &(vc->mrail.srp.credits[rail].backlog);
 1044         BACKLOG_ENQUEUE(q, v);
 1045         MPIDI_FUNC_EXIT(MPID_STATE_POST_SEND);
 1046         return MPI_MRAIL_MSG_QUEUED;
 1047     }
 1048 
 1049     MPIDI_FUNC_EXIT(MPID_STATE_POST_SEND);
 1050     return 0;
 1051 }
 1052 
 1053 #undef FUNCNAME
 1054 #define FUNCNAME MRAILI_Fill_start_buffer
 1055 #undef FCNAME
 1056 #define FCNAME MPL_QUOTE(FUNCNAME)
 1057 int MRAILI_Fill_start_buffer(vbuf * v,
 1058                              MPL_IOV * iov,
 1059                              int n_iov)
 1060 {
 1061     int i = 0;
 1062     int avail = 0;
 1063 #ifdef _ENABLE_CUDA_
 1064     if (rdma_enable_cuda) {
 1065         avail = ((vbuf_pool_t*)v->pool_index)->buf_size - v->content_size;
 1066     } else 
 1067 #endif
 1068     {
 1069         avail = VBUF_BUFFER_SIZE - v->content_size;
 1070     }
 1071     void *ptr = (v->buffer + v->content_size);
 1072     int len = 0;
 1073 #ifdef _ENABLE_UD_
 1074     if( rdma_enable_hybrid && v->transport == IB_TRANSPORT_UD) {
 1075         avail = MRAIL_MAX_UD_SIZE - v->content_size;
 1076     }
 1077 #endif
 1078 
 1079     MPIDI_STATE_DECL(MPID_STATE_MRAILI_FILL_START_BUFFER);
 1080     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_FILL_START_BUFFER);
 1081 
 1082     DEBUG_PRINT("buffer: %p, content size: %d\n", v->buffer, v->content_size);
 1083 
 1084 #ifdef _ENABLE_CUDA_
 1085     if (rdma_enable_cuda && n_iov > 1 && is_device_buffer(iov[1].MPL_IOV_BUF)) {
 1086         /* in the case of GPU buffers, there is only one data iov, if data is non-contiguous
 1087          * it should have been packed before this */
 1088         MPIU_Assert(n_iov == 2);
 1089 
 1090         MPIU_Memcpy(ptr, iov[0].MPL_IOV_BUF,
 1091                 (iov[0].MPL_IOV_LEN));
 1092         len += (iov[0].MPL_IOV_LEN);
 1093         avail -= (iov[0].MPL_IOV_LEN);
 1094         ptr = (void *) ((unsigned long) ptr + iov[0].MPL_IOV_LEN);
 1095 
 1096         if (avail >= iov[1].MPL_IOV_LEN) {
 1097             MPIU_Memcpy_CUDA(ptr,
 1098                     iov[1].MPL_IOV_BUF,
 1099                     iov[1].MPL_IOV_LEN,
 1100                     cudaMemcpyDeviceToHost);
 1101             len += iov[1].MPL_IOV_LEN;
 1102         } else {
 1103             MPIU_Memcpy_CUDA(ptr,
 1104                     iov[1].MPL_IOV_BUF,
 1105                     avail,
 1106                     cudaMemcpyDeviceToHost);
 1107             len += avail;
 1108             avail = 0;
 1109         }
 1110     } else 
 1111 #endif
 1112     {
 1113         for (i = 0; i < n_iov; i++) {
 1114             DEBUG_PRINT("[fill buf]avail %d, len %d\n", avail,
 1115                     iov[i].MPL_IOV_LEN);
 1116             if (avail >= iov[i].MPL_IOV_LEN) {
 1117                 DEBUG_PRINT("[fill buf] cpy ptr %p\n", ptr);
 1118                 MPIU_Memcpy(ptr, iov[i].MPL_IOV_BUF,
 1119                         (iov[i].MPL_IOV_LEN));
 1120                 len += (iov[i].MPL_IOV_LEN);
 1121                 avail -= (iov[i].MPL_IOV_LEN);
 1122                 ptr = (void *) ((unsigned long) ptr + iov[i].MPL_IOV_LEN);
 1123             } else {
 1124                 MPIU_Memcpy(ptr, iov[i].MPL_IOV_BUF, avail);
 1125                 len += avail;
 1126                 avail = 0;
 1127                 break;
 1128             }
 1129         }
 1130     }
 1131     v->content_size += len;
 1132 
 1133     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_FILL_START_BUFFER);
 1134     return len;
 1135 }
 1136 
 1137 #undef FUNCNAME
 1138 #define FUNCNAME MRAILI_Get_Vbuf
 1139 #undef FCNAME
 1140 #define FCNAME MPL_QUOTE(FUNCNAME)
 1141 static inline vbuf * MRAILI_Get_Vbuf(MPIDI_VC_t * vc, size_t pkt_len)
 1142 {
 1143     int rail = 0;
 1144     vbuf* temp_v = NULL;
 1145 
 1146     MPIDI_STATE_DECL(MPID_STATE_MRAILI_GET_VBUF);
 1147     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_GET_VBUF);
 1148 
 1149     if (unlikely(NULL != vc->mrail.coalesce_vbuf)) {
 1150         int coalesc_buf_size = 0;
 1151 #if defined(_ENABLE_UD_) || defined(_MCST_SUPPORT_)
 1152         if (!vc->mrail.coalesce_vbuf->pool_index) {
 1153             coalesc_buf_size = MRAIL_MAX_UD_SIZE;
 1154         } else
 1155 #endif
 1156         {
 1157             coalesc_buf_size = ((vbuf_pool_t*)vc->mrail.coalesce_vbuf->pool_index)->buf_size;
 1158         }
 1159 
 1160         if((coalesc_buf_size - vc->mrail.coalesce_vbuf->content_size) 
 1161                 >= pkt_len)
 1162         {
 1163             DEBUG_PRINT("returning back a coalesce buffer\n");
 1164             return vc->mrail.coalesce_vbuf;
 1165         } else {
 1166             FLUSH_SQUEUE(vc);
 1167             vc->mrail.coalesce_vbuf = NULL;
 1168             DEBUG_PRINT("Send out the coalesce vbuf\n");
 1169         }
 1170     }
 1171 
 1172     rail = MRAILI_Send_select_rail(vc);
  1173     /* if there wasn't already a vbuf that could
  1174      * hold our packet, we need to allocate a
  1175      * new one
  1176      */
 1177     if (likely(NULL == temp_v)) {
 1178         /* are we trying to coalesce? If so, place
 1179          * it as the new coalesce vbuf and add it
 1180          * to the extended sendq
 1181          */
 1182 
 1183         if(unlikely(MRAILI_Coalesce_ok(vc, rail)) &&
 1184             (pkt_len*2 <= DEFAULT_MEDIUM_VBUF_SIZE)) {
 1185             MRAILI_Get_buffer(vc, temp_v, DEFAULT_MEDIUM_VBUF_SIZE);
 1186             vc->mrail.coalesce_vbuf = temp_v;
 1187 
 1188             temp_v->seqnum = vc->mrail.seqnum_next_tosend;
 1189             vc->mrail.seqnum_next_tosend++;
 1190 
 1191             temp_v->coalesce = 1;
 1192             temp_v->rail = rail;
 1193             MRAILI_Ext_sendq_enqueue(vc, temp_v->rail, temp_v); 
 1194             DEBUG_PRINT("coalesce is ok\n");
 1195 
 1196             if(!mv2_MPIDI_CH3I_RDMA_Process.has_srq) {
 1197                 --vc->mrail.srp.credits[temp_v->rail].remote_credit;
 1198             }
 1199 
 1200         } else {
 1201             MRAILI_Get_buffer(vc, temp_v, pkt_len);
 1202             DEBUG_PRINT("coalesce not ok\n");
 1203         }
 1204 
 1205         DEBUG_PRINT("buffer is %p\n", temp_v->buffer);
 1206         DEBUG_PRINT("pheader buffer is %p\n", temp_v->pheader);
 1207 
 1208         temp_v->rail = rail;
 1209         temp_v->eager = 1;
 1210         temp_v->content_size = 0;
 1211 
 1212         DEBUG_PRINT("incrementing the outstanding eager vbufs: eager %d\n",
 1213                 vc->mrail.outstanding_eager_vbufs);
 1214 
 1215         if (temp_v->transport == IB_TRANSPORT_RC)
 1216             ++vc->mrail.outstanding_eager_vbufs;
 1217     }
 1218 
 1219     MPIU_Assert(temp_v != NULL);
 1220 
 1221     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_GET_VBUF);
 1222     return temp_v;
 1223 }
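/* NOTE: summary sketch of MRAILI_Get_Vbuf above, not from the original
 * source:
 *
 *     if (coalesce_vbuf != NULL && it still has room for pkt_len)
 *         return coalesce_vbuf;                  // keep coalescing
 *     flush and clear coalesce_vbuf;
 *     if (MRAILI_Coalesce_ok(vc, rail) && pkt_len*2 <= MEDIUM vbuf size)
 *         start a new coalesce vbuf, already on the extended sendq;
 *     else
 *         take a plain vbuf sized for pkt_len;   // normal eager path
 */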
 1224 
 1225 #undef FUNCNAME
 1226 #define FUNCNAME MPIDI_CH3I_MRAILI_Eager_send
 1227 #undef FCNAME
 1228 #define FCNAME MPL_QUOTE(FUNCNAME)
 1229 int MPIDI_CH3I_MRAILI_Eager_send(MPIDI_VC_t * vc,
 1230                                  MPL_IOV * iov,
 1231                                  int n_iov,
 1232                                  size_t pkt_len,
 1233                                  int *num_bytes_ptr,
 1234                                  vbuf **buf_handle)
 1235 {
 1236     vbuf * v;
 1237 
 1238     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_EAGER_SEND);
 1239     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_EAGER_SEND);
 1240 
 1241     /* first we check if we can take the RDMA FP */
 1242     if(likely(MPIDI_CH3I_MRAILI_Fast_rdma_ok(vc, pkt_len))) {
 1243     
 1244         *num_bytes_ptr = pkt_len;
 1245         MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_EAGER_SEND);
 1246         return MPIDI_CH3I_MRAILI_Fast_rdma_send_complete(vc, iov,
 1247                 n_iov, num_bytes_ptr, buf_handle);
 1248     } 
 1249 
 1250     /* otherwise we can always take the send/recv path */
 1251     v = MRAILI_Get_Vbuf(vc, pkt_len);
 1252 
 1253     DEBUG_PRINT("[eager send]vbuf addr %p, buffer: %p\n", v, v->buffer);
 1254     *num_bytes_ptr = MRAILI_Fill_start_buffer(v, iov, n_iov);
 1255    
 1256 #ifdef CKPT
 1257     /* this won't work properly at the moment... 
 1258      *
 1259      * My guess is that if vc->ch.state != MPIDI_CH3I_VC_STATE_IDLE
 1260      * just have Coalesce_ok return 0 -- then you'll always get a new vbuf
 1261      * (actually there are a few other things to change as well...)
 1262      */
 1263 
 1264     if (vc->ch.state != MPIDI_CH3I_VC_STATE_IDLE) {
 1265         /*MPIDI_CH3I_MRAILI_Pkt_comm_header * p = (MPIDI_CH3I_MRAILI_Pkt_comm_header *) v->pheader;*/
 1266         MPIDI_CH3I_CR_msg_log_queue_entry_t *entry;
 1267         if (rdma_use_coalesce) {
 1268             entry = MSG_LOG_QUEUE_TAIL(vc);
 1269             if (entry->buf == v) /*since the vbuf is already filled, no need to queue it again*/
 1270             {
 1271                 PRINT_DEBUG(DEBUG_FT_verbose, "coalesced buffer\n");
 1272                 return MPI_MRAIL_MSG_QUEUED;
 1273             }
 1274         }
 1275         entry = (MPIDI_CH3I_CR_msg_log_queue_entry_t *) MPIU_Malloc(sizeof(MPIDI_CH3I_CR_msg_log_queue_entry_t));
 1276         entry->buf = v;
 1277         entry->len = *num_bytes_ptr;
 1278         MSG_LOG_ENQUEUE(vc, entry);
 1279         MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_EAGER_SEND);
 1280         return MPI_MRAIL_MSG_QUEUED;
 1281     }
 1282 #endif
 1283 
 1284     /* send the buffer if we aren't trying to coalesce it */
 1285     if(likely(vc->mrail.coalesce_vbuf != v))  {
 1286         DEBUG_PRINT("[eager send] len %d, selected rail hca %d, rail %d\n",
 1287                 *num_bytes_ptr, vc->mrail.rails[v->rail].hca_index, v->rail);
 1288         vbuf_init_send(v, *num_bytes_ptr, v->rail);
 1289         mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, v->rail);
 1290     } else {
 1291         MPIDI_CH3I_MRAILI_Pkt_comm_header *p = (MPIDI_CH3I_MRAILI_Pkt_comm_header *)
 1292             (v->buffer + v->content_size - *num_bytes_ptr);
 1293 
 1294         PACKET_SET_CREDIT(p, vc, v->rail);
 1295 #ifdef CRC_CHECK
 1296     p->crc = update_crc(1, (void *)((uintptr_t)p+sizeof *p),
 1297                                   v->desc.sg_entry.length - sizeof *p);
 1298 #endif
 1299         v->vc                = (void *) vc;
 1300         p->rail        = v->rail;
 1301 #ifdef _ENABLE_UD_
 1302     if(rdma_enable_hybrid) {
 1303             p->src.rank    = MPIDI_Process.my_pg_rank;
 1304     } else
 1305 #endif
 1306     {
 1307             p->src.vc_addr = vc->mrail.remote_vc_addr;
 1308     }
 1309     }
 1310 
 1311     *buf_handle = v;
 1312 
 1313     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_EAGER_SEND);
 1314     return 0;
 1315 }
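/* NOTE: summary sketch of the eager path selection above, not from the
 * original source:
 *
 *     if (MPIDI_CH3I_MRAILI_Fast_rdma_ok(vc, pkt_len))
 *         return MPIDI_CH3I_MRAILI_Fast_rdma_send_complete(...);  // RDMA FP
 *     v = MRAILI_Get_Vbuf(vc, pkt_len);          // send/recv path
 *     MRAILI_Fill_start_buffer(v, iov, n_iov);
 *     if (v != vc->mrail.coalesce_vbuf)
 *         mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, v->rail);
 *     // else: v is the coalesce vbuf, already queued on the ext sendq
 */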
 1316 
 1317 #undef FUNCNAME
 1318 #define FUNCNAME MPIDI_CH3I_MRAILI_rget_finish
 1319 #undef FCNAME
 1320 #define FCNAME MPL_QUOTE(FUNCNAME)
 1321 int MPIDI_CH3I_MRAILI_rget_finish(MPIDI_VC_t * vc,
 1322                                  MPL_IOV * iov,
 1323                                  int n_iov,
 1324                                  int *num_bytes_ptr, vbuf ** buf_handle, 
 1325                                  int rail)
 1326 {
 1327     vbuf *v;
 1328     int mpi_errno;
 1329     size_t nbytes = MAX(DEFAULT_MEDIUM_VBUF_SIZE, *num_bytes_ptr);
 1330 
 1331     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_RGET_FINISH);
 1332     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_RGET_FINISH);
 1333 
 1334     if (likely(nbytes <= DEFAULT_MEDIUM_VBUF_SIZE)) {
 1335         GET_VBUF_BY_OFFSET_WITHOUT_LOCK(v, MV2_MEDIUM_DATA_VBUF_POOL_OFFSET);
 1336     } else {
 1337         GET_VBUF_BY_OFFSET_WITHOUT_LOCK(v, MV2_LARGE_DATA_VBUF_POOL_OFFSET);
 1338     }
 1339     *buf_handle = v;
 1340     *num_bytes_ptr = MRAILI_Fill_start_buffer(v, iov, n_iov);
 1341 
 1342     vbuf_init_send(v, *num_bytes_ptr, rail);
 1343 
 1344     mpi_errno = mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, rail);
 1345     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_RGET_FINISH); 
 1346     return mpi_errno;
 1347 }
 1348 
 1349 #undef FUNCNAME
 1350 #define FUNCNAME MPIDI_CH3I_MRAILI_rput_complete
 1351 #undef FCNAME
 1352 #define FCNAME MPL_QUOTE(FUNCNAME)
 1353 int MPIDI_CH3I_MRAILI_rput_complete(MPIDI_VC_t * vc,
 1354                                  MPL_IOV * iov,
 1355                                  int n_iov,
 1356                                  int *num_bytes_ptr, vbuf ** buf_handle, 
 1357                                  int rail)
 1358 {
 1359     vbuf * v;
 1360     int mpi_errno;
 1361 
 1362     MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_MRAILI_RPUT_COMPLETE);
 1363     MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_MRAILI_RPUT_COMPLETE);
 1364 
 1365     MRAILI_Get_buffer(vc, v, iov->MPL_IOV_LEN);
 1366     *buf_handle = v;
 1367     DEBUG_PRINT("[eager send]vbuf addr %p\n", v);
 1368     *num_bytes_ptr = MRAILI_Fill_start_buffer(v, iov, n_iov);
 1369 
 1370     DEBUG_PRINT("[eager send] len %d, selected rail hca %d, rail %d\n",
 1371                 *num_bytes_ptr, vc->mrail.rails[rail].hca_index, rail);
 1372 
 1373     vbuf_init_send(v, *num_bytes_ptr, rail);
 1374 
 1375     mpi_errno = mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, rail);
 1376     MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_MRAILI_RPUT_COMPLETE);
 1377     return mpi_errno;
 1378 }
 1379 
 1380 #undef FUNCNAME
 1381 #define FUNCNAME MRAILI_Backlog_send
 1382 #undef FCNAME
 1383 #define FCNAME MPL_QUOTE(FUNCNAME)
 1384 int MRAILI_Backlog_send(MPIDI_VC_t * vc, int rail)
 1385 {
 1386     char cq_overflow = 0;
 1387     ibv_backlog_queue_t *q;
 1388 
 1389     MPIDI_STATE_DECL(MPID_STATE_MRAILI_BACKLOG_SEND);
 1390     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_BACKLOG_SEND);
 1391 
 1392     q = &vc->mrail.srp.credits[rail].backlog;
 1393 
 1394 #ifdef CKPT
 1395     if (mv2_MPIDI_CH3I_RDMA_Process.has_srq) {
 1396         PRINT_ERROR("[%s, %d] CKPT has_srq error\n", __FILE__, __LINE__  );
 1397         exit(EXIT_FAILURE);
 1398     }
 1399 #endif
 1400 
 1401     while ((q->len > 0)
 1402            && (vc->mrail.srp.credits[rail].remote_credit > 0)) {
 1403         vbuf *v = NULL;
 1404         MPIDI_CH3I_MRAILI_Pkt_comm_header *p;
 1405         MPIU_Assert(q->vbuf_head != NULL);
 1406         BACKLOG_DEQUEUE(q, v);
 1407 
 1408         /* Assumes packet header is at beginning of packet structure */
 1409         p = (MPIDI_CH3I_MRAILI_Pkt_comm_header *) v->pheader;
 1410 
 1411         PACKET_SET_CREDIT(p, vc, rail);
 1412 #ifdef CRC_CHECK
 1413     p->mrail.crc = update_crc(1, (void *)((uintptr_t)p+sizeof *p),
 1414                                   v->desc.sg_entry.length - sizeof *p);
 1415 #endif
 1416         --vc->mrail.srp.credits[rail].remote_credit;
 1417 
 1418         if (mv2_MPIDI_CH3I_RDMA_Process.has_srq) {
 1419 #ifdef _ENABLE_UD_
 1420             if (rdma_enable_hybrid) {
 1421                 p->src.rank    = MPIDI_Process.my_pg_rank;
 1422             } else
 1423 #endif
 1424             {
 1425                 p->src.vc_addr = vc->mrail.remote_vc_addr;
 1426             }
 1427             p->rail        = rail;
 1428         }
 1429 
 1430         v->vc = vc;
 1431         v->rail = rail;
 1432 
 1433         XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
 1434         FLUSH_RAIL(vc, rail);
 1435  
 1436         cq_overflow = check_cq_overflow(vc, rail);
 1437 
 1438         if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
 1439             --vc->mrail.rails[rail].send_wqes_avail;
 1440 
 1441             IBV_POST_SR(v, vc, rail,
 1442                         "ibv_post_sr (MRAILI_Backlog_send)");
 1443         } else {
 1444             MRAILI_Ext_sendq_enqueue(vc, rail, v);
 1445             continue;
 1446         }
 1447     }
 1448 
 1449     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_BACKLOG_SEND);
 1450     return 0;
 1451 }
 1452 
 1453 
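/*
 * MRAILI_Flush_wqe: flushes any coalesced sends on the rail, then either
 * lets the caller post v (return 0) or, when no send WQE is available,
 * parks it on the extended send queue and returns MPI_MRAIL_MSG_QUEUED.
 */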
 1454 #undef FUNCNAME
 1455 #define FUNCNAME MRAILI_Flush_wqe
 1456 #undef FCNAME
 1457 #define FCNAME MPL_QUOTE(FUNCNAME)
 1458 int MRAILI_Flush_wqe(MPIDI_VC_t *vc, vbuf *v , int rail)
 1459 {
 1460     MPIDI_STATE_DECL(MPID_STATE_MRAILI_FLUSH_WQE);
 1461     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_FLUSH_WQE);
 1462     FLUSH_RAIL(vc, rail);
 1463     if (!vc->mrail.rails[rail].send_wqes_avail)
 1464     {
 1465         MRAILI_Ext_sendq_enqueue(vc, rail, v);
 1466         MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_FLUSH_WQE);
 1467         return MPI_MRAIL_MSG_QUEUED;
 1468     }
 1469 
 1470     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_FLUSH_WQE);
 1471     return 0;
 1472 }
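/*
 * MRAILI_Process_send: send-completion handler invoked for every vbuf
 * whose work request has completed.  It returns the send WQE to the
 * rail (restarting the extended send queue if anything is waiting),
 * then dispatches on the packet type: eager and one-sided sends finish
 * their MPI requests, RPUT/RGET completions drive the rendezvous
 * protocol (including HSAM rail-weight adjustment), and control packets
 * simply release their vbufs.
 */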
 1473 #undef FUNCNAME
 1474 #define FUNCNAME MRAILI_Process_send
 1475 #undef FCNAME
 1476 #define FCNAME MPL_QUOTE(FUNCNAME)
 1477 int MRAILI_Process_send(void *vbuf_addr)
 1478 {
 1479     int mpi_errno = MPI_SUCCESS;
 1480 
 1481     vbuf            *v = vbuf_addr;
 1482     MPIDI_CH3I_MRAILI_Pkt_comm_header *p;
 1483     MPIDI_VC_t      *vc;
 1484     MPIDI_VC_t      *orig_vc;
 1485     MPID_Request    *req;
 1486     double          time_taken;
 1487     int             complete;
 1488 
 1489     MPIDI_STATE_DECL(MPID_STATE_MRAILI_PROCESS_SEND);
 1490     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_PROCESS_SEND);
 1491 
 1492     vc  = v->vc;
 1493     p = v->pheader;
 1494 #ifdef _ENABLE_XRC_
 1495     if (USE_XRC && VC_XST_ISSET (vc, XF_INDIRECT_CONN)) {
 1496         orig_vc = vc->ch.orig_vc;
 1497     }
 1498     else 
 1499 #endif
 1500     {
 1501         orig_vc = vc;
 1502     }
 1503     if (v->transport == IB_TRANSPORT_RC) {
 1504         if (v->padding == RDMA_ONE_SIDED) {
 1505             ++(orig_vc->mrail.rails[v->rail].send_wqes_avail);
 1506             if (orig_vc->mrail.rails[v->rail].ext_sendq_head) {
 1507                 MRAILI_Ext_sendq_send(orig_vc, v->rail);
 1508             }
 1509 
 1510             if ((mpi_errno = MRAILI_Handle_one_sided_completions(v)) != MPI_SUCCESS)
 1511             {
 1512                 MPIR_ERR_POP(mpi_errno);
 1513             }
 1514 
 1515             MRAILI_Release_vbuf(v);
 1516             goto fn_exit;
 1517         }
 1518 
 1519     
 1520         ++orig_vc->mrail.rails[v->rail].send_wqes_avail;
 1521 
 1522 
 1523         if(vc->free_vc) {
 1524             if(vc->mrail.rails[v->rail].send_wqes_avail == rdma_default_max_send_wqe)   {
 1525                 if (v->padding == NORMAL_VBUF_FLAG) {
 1526                     DEBUG_PRINT("[process send] normal flag, free vbuf\n");
 1527                     MRAILI_Release_vbuf(v);
 1528                 } else {
 1529                     v->padding = FREE_FLAG;
 1530                 }
 1531 
 1532                 MPIU_Memset(vc, 0, sizeof(MPIDI_VC_t));
 1533                 MPIU_Free(vc); 
 1534                 mpi_errno = MPI_SUCCESS;
 1535                 goto fn_exit;
 1536             }
 1537         }
 1538 
 1539         if(v->eager) {
 1540             --vc->mrail.outstanding_eager_vbufs;
 1541             DEBUG_PRINT("Eager, decrementing to: %d\n", vc->mrail.outstanding_eager_vbufs);
 1542 
 1543             if(vc->mrail.outstanding_eager_vbufs < 
 1544                     rdma_coalesce_threshold) {
 1545                 DEBUG_PRINT("Flushing coalesced, vbuf %p\n", v);
 1546                 FLUSH_SQUEUE(vc);
 1547             }
 1548             v->eager = 0;
 1549         }
 1550  
 1551         if (orig_vc->mrail.rails[v->rail].ext_sendq_head) {
 1552             MRAILI_Ext_sendq_send(orig_vc, v->rail);
 1553         }
 1554 
 1555         if(v->padding == COLL_VBUF_FLAG) { 
 1556             MRAILI_Release_vbuf(v);
 1557             goto fn_exit;
 1558         } 
 1559 
 1560         if (v->padding == RPUT_VBUF_FLAG) {
 1561 
 1562             req = (MPID_Request *)v->sreq;
 1563 
 1564             PRINT_DEBUG(DEBUG_RNDV_verbose, "Processing RPUT completion "
 1565                     "req: %p, protocol: %d, local: %d, remote: %d\n",
 1566                     req, req->mrail.protocol, req->mrail.local_complete, req->mrail.remote_complete);
 1567 
 1568             /* HSAM is Activated */
 1569             if (mv2_MPIDI_CH3I_RDMA_Process.has_hsam) {
 1570                 req = (MPID_Request *)v->sreq;
 1571                 MPIU_Assert(req != NULL);
 1572                 get_wall_time(&time_taken);
 1573                 req->mrail.stripe_finish_time[v->rail] = 
 1574                     time_taken;
 1575             }
 1576 
 1577 #ifdef _ENABLE_CUDA_
 1578             if (rdma_enable_cuda
 1579                 && v->orig_vbuf != NULL) {
 1580                 vbuf *orig_vbuf = (vbuf *) (v->orig_vbuf);
 1581                 orig_vbuf->finish_count++;
 1582                 if (orig_vbuf->finish_count == rdma_num_rails) {
 1583                     MRAILI_Release_vbuf(orig_vbuf);
 1584                 }
 1585             }
 1586 #endif
 1587             MRAILI_Release_vbuf(v);
 1588             goto fn_exit;
 1589         }
 1590         if (v->padding == RGET_VBUF_FLAG) {
 1591 
 1592             req = (MPID_Request *)v->sreq;
 1593 
 1594             /* HSAM is Activated */
 1595             if (mv2_MPIDI_CH3I_RDMA_Process.has_hsam) {
 1596                 MPIU_Assert(req != NULL);
 1597                 get_wall_time(&time_taken);
 1598                 /* Record the time only the first time a data transfer
 1599                  * is scheduled on this rail; several transfers may be
 1600                  * scheduled on the same rail for very large messages */
 1601 
 1602                 /* The measured time is a double; any value below
 1603                  * ERROR_EPSILON is treated as zero, i.e. no time has
 1604                  * been recorded on this rail yet */
 1605                 if(req->mrail.stripe_finish_time[v->rail] < ERROR_EPSILON) {
 1606                     req->mrail.stripe_finish_time[v->rail] = 
 1607                         time_taken;
 1608                 }
 1609             }
 1610 
 1611             ++req->mrail.local_complete;
 1612             PRINT_DEBUG(DEBUG_RNDV_verbose, "Processing RGET completion "
 1613                     "req: %p, protocol: %d, local: %d, remote: %d\n",
 1614                     req, req->mrail.protocol, req->mrail.local_complete, req->mrail.remote_complete);
 1615 
 1616             /* If the message size is less than the striping threshold, send a
 1617              * finish message immediately
 1618              *
 1619              * If HSAM is defined, wait for rdma_num_rails / stripe_factor
 1620              * number of completions before sending the finish message.
 1621              * After sending the finish message, adjust the weights of different
 1622              * paths
 1623              *
 1624              * If HSAM is not defined, wait for rdma_num_rails completions
 1625              * before sending the finish message
 1626              */
 1627 
 1628             if(req->mrail.rndv_buf_sz > rdma_large_msg_rail_sharing_threshold) {
 1629                 if(mv2_MPIDI_CH3I_RDMA_Process.has_hsam && 
 1630                         (req->mrail.local_complete == 
 1631                          req->mrail.num_rdma_read_completions )) { 
 1632 
 1633                     MRAILI_RDMA_Get_finish(vc, 
 1634                             (MPID_Request *) v->sreq, v->rail);
 1635 
 1636                     adjust_weights(v->vc, req->mrail.stripe_start_time,
 1637                             req->mrail.stripe_finish_time, 
 1638                             req->mrail.initial_weight);                       
 1639 
 1640                 } else if (!mv2_MPIDI_CH3I_RDMA_Process.has_hsam && 
 1641                         (req->mrail.local_complete == 
 1642                          req->mrail.num_rdma_read_completions)) {
 1643 
 1644                     MRAILI_RDMA_Get_finish(vc,
 1645                             (MPID_Request *) v->sreq, v->rail);
 1646                 }
 1647             } else {
 1648                 MRAILI_RDMA_Get_finish(vc,
 1649                         (MPID_Request *) v->sreq, v->rail);
 1650             }
 1651 
 1652             MRAILI_Release_vbuf(v);
 1653             goto fn_exit;
 1654         }
 1655         if (v->padding == CREDIT_VBUF_FLAG) {
 1656             PRINT_DEBUG(DEBUG_XRC_verbose>0, "CREDIT Vbuf\n");
 1657             --orig_vc->mrail.rails[v->rail].send_wqes_avail;
 1658             goto fn_exit;
 1659         }
 1660     }
 1661     
 1662     switch (p->type) {
 1663 #ifdef CKPT
 1664     case MPIDI_CH3_PKT_CM_SUSPEND:
 1665     case MPIDI_CH3_PKT_CM_REACTIVATION_DONE:
 1666         MPIDI_CH3I_CM_Handle_send_completion(vc, p->type,v);
 1667         if (v->padding == NORMAL_VBUF_FLAG) {
 1668             MRAILI_Release_vbuf(v);
 1669         }
 1670         break;
 1671     case MPIDI_CH3_PKT_CR_REMOTE_UPDATE:
 1672         MPIDI_CH3I_CR_Handle_send_completion(vc, p->type,v);
 1673         if (v->padding == NORMAL_VBUF_FLAG) {
 1674             MRAILI_Release_vbuf(v);
 1675         }
 1676         break;
 1677 #endif        
 1678 #ifndef MV2_DISABLE_HEADER_CACHING 
 1679     case MPIDI_CH3_PKT_FAST_EAGER_SEND:
 1680     case MPIDI_CH3_PKT_FAST_EAGER_SEND_WITH_REQ:
 1681 #endif
 1682 #if defined(USE_EAGER_SHORT)
 1683     case MPIDI_CH3_PKT_EAGERSHORT_SEND:
 1684 #endif /* defined(USE_EAGER_SHORT) */
 1685     case MPIDI_CH3_PKT_EAGER_SEND:
 1686     case MPIDI_CH3_PKT_EAGER_SYNC_SEND: 
 1687     case MPIDI_CH3_PKT_PACKETIZED_SEND_DATA:
 1688     case MPIDI_CH3_PKT_RNDV_R3_DATA:
 1689     case MPIDI_CH3_PKT_READY_SEND:
 1690     case MPIDI_CH3_PKT_PUT:
 1691     case MPIDI_CH3_PKT_PUT_IMMED:
 1692     case MPIDI_CH3_PKT_ACCUMULATE:
 1693     case MPIDI_CH3_PKT_ACCUMULATE_IMMED:
 1694         req = v->sreq;
 1695         v->sreq = NULL;
 1696         DEBUG_PRINT("[process send] complete for eager msg, req %p\n",
 1697                     req);
 1698         if (req != NULL) {
 1699             MPIDI_CH3U_Handle_send_req(vc, req, &complete);
 1700 
 1701             DEBUG_PRINT("[process send] req not null\n");
 1702             if (complete != TRUE) {
 1703                 ibv_error_abort(IBV_STATUS_ERR, "Got an incomplete eager send request\n");
 1704             }
 1705         }
 1706         if (v->padding == NORMAL_VBUF_FLAG) {
 1707             DEBUG_PRINT("[process send] normal flag, free vbuf\n");
 1708             MRAILI_Release_vbuf(v);
 1709         } else {
 1710             v->padding = FREE_FLAG;
 1711         }
 1712         break;
 1713     case MPIDI_CH3_PKT_RPUT_FINISH:
 1714         req = (MPID_Request *) (v->sreq);
 1715         if (req == NULL) {
 1716             ibv_va_error_abort(GEN_EXIT_ERR,
 1717                     "v->sreq == NULL in the RPUT finish "
 1718                     "completion handler, v is %p", v);
 1719         }
 1720 
 1721 #ifdef _ENABLE_CUDA_
 1722         int process_rput_finish = 0;
 1723         MPIDI_CH3_Pkt_rput_finish_t *rput_pkt = 
 1724                         (MPIDI_CH3_Pkt_rput_finish_t *) v->buffer;
 1725         if (rdma_enable_cuda) {
 1726             if (req->mrail.cuda_transfer_mode == NONE 
 1727                         || rput_pkt->cuda_pipeline_finish) {
 1728                 process_rput_finish = 1;
 1729             }
 1730         }
 1731         if (!rdma_enable_cuda || process_rput_finish)
 1732 #endif
 1733         {
 1734 
 1735         ++req->mrail.local_complete;
 1736         if (req->mrail.local_complete == rdma_num_rails) {
 1737             req->mrail.local_complete = UINT32_MAX;
 1738         }
 1739         PRINT_DEBUG(DEBUG_RNDV_verbose, "Processing RPUT FIN completion "
 1740                 "req: %p, protocol: %d, local: %d, remote: %d\n",
 1741                 req, req->mrail.protocol, req->mrail.local_complete, req->mrail.remote_complete);
 1742 
 1743         if(MPIDI_CH3I_MRAIL_Finish_request(req)) {
 1744 
 1745             if (req->mrail.d_entry != NULL) {
 1746                 dreg_unregister(req->mrail.d_entry);
 1747                 req->mrail.d_entry = NULL;
 1748             }
 1749 
 1750             if(mv2_MPIDI_CH3I_RDMA_Process.has_hsam && 
 1751                ((req->mrail.rndv_buf_sz > rdma_large_msg_rail_sharing_threshold))) {
 1752 
 1753                 /* Adjust the weights of different paths according to the
 1754                  * timings obtained for the stripes */
 1755 
 1756                 adjust_weights(v->vc, req->mrail.stripe_start_time,
 1757                         req->mrail.stripe_finish_time, 
 1758                         req->mrail.initial_weight);
 1759             }
 1760             
 1761             MPIDI_CH3I_MRAIL_FREE_RNDV_BUFFER(req);        
 1762             req->mrail.d_entry = NULL;
 1763             MPIDI_CH3U_Handle_send_req(vc, req, &complete);
 1764 
 1765             if (complete != TRUE) {
 1766                 ibv_error_abort(IBV_STATUS_ERR, 
 1767                         "Got an incomplete eager send request\n");
 1768             }
 1769         }
 1770         }
 1771 
 1772         if (v->padding == NORMAL_VBUF_FLAG) {
 1773             MRAILI_Release_vbuf(v);
 1774         } else {
 1775             v->padding = FREE_FLAG;
 1776         }
 1777         break;
 1778     case MPIDI_CH3_PKT_GET_RESP:
 1779     case MPIDI_CH3_PKT_GET_RESP_IMMED:
 1780         DEBUG_PRINT("[process send] GET response send finished\n");
 1781         req = (MPID_Request *) (v->sreq);
 1782         v->sreq = NULL;
 1783         if (NULL != req) {
 1784             if (MV2_RNDV_PROTOCOL_RPUT == req->mrail.protocol) {
 1785                 if (req->mrail.d_entry != NULL) {
 1786                     dreg_unregister(req->mrail.d_entry);
 1787                     req->mrail.d_entry = NULL;
 1788                 }
 1789                 MPIDI_CH3I_MRAIL_FREE_RNDV_BUFFER(req);
 1790                 req->mrail.d_entry = NULL;
 1791             }
 1792 
 1793             MPIDI_CH3U_Handle_send_req(vc, req, &complete);
 1794             if (complete != TRUE) {
 1795                 ibv_error_abort(IBV_STATUS_ERR, "Got an incomplete eager send request\n");
 1796             }
 1797         }
 1798 
 1799         if (v->padding == NORMAL_VBUF_FLAG) {
 1800             MRAILI_Release_vbuf(v);
 1801         } else {
 1802             v->padding = FREE_FLAG;
 1803         }
 1804         break;
 1805 
 1806     case MPIDI_CH3_PKT_RGET_FINISH:
 1807 
 1808         if (v->padding == NORMAL_VBUF_FLAG) {
 1809             MRAILI_Release_vbuf(v);
 1810         } else {
 1811             v->padding = FREE_FLAG;
 1812         }
 1813 
 1814         break;
 1815 #if defined(_MCST_SUPPORT_)
 1816     case MPIDI_CH3_PKT_MCST:
 1817     case MPIDI_CH3_PKT_MCST_INIT:
 1818         PRINT_DEBUG(DEBUG_MCST_verbose > 4, 
 1819                 "mcast send completion\n");
 1820         mcast_ctx->ud_ctx->send_wqes_avail++;
 1821         if (v->padding == NORMAL_VBUF_FLAG) {
 1822             MRAILI_Release_vbuf(v);
 1823         } else {
 1824             v->padding = FREE_FLAG;
 1825         }
 1826         break;
 1827     case MPIDI_CH3_PKT_MCST_NACK:
 1828         if (mcast_use_mcast_nack) {
 1829             mcast_ctx->ud_ctx->send_wqes_avail++;
 1830         }
 1831     case MPIDI_CH3_PKT_MCST_INIT_ACK:
 1832         if (v->padding == NORMAL_VBUF_FLAG) {
 1833             MRAILI_Release_vbuf(v);
 1834         } else {
 1835             v->padding = FREE_FLAG;
 1836         }
 1837         break;
 1838     
 1839 #endif
 1840     case MPIDI_CH3_PKT_NOOP:
 1841     case MPIDI_CH3_PKT_ADDRESS:
 1842     case MPIDI_CH3_PKT_ADDRESS_REPLY:
 1843     case MPIDI_CH3_PKT_CM_ESTABLISH:
 1844     case MPIDI_CH3_PKT_PACKETIZED_SEND_START:
 1845     case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
 1846     case MPIDI_CH3_PKT_RNDV_READY_REQ_TO_SEND:
 1847     case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
 1848     case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
 1849     case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
 1850     case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
 1851     case MPIDI_CH3_PKT_PUT_RNDV:
 1852     case MPIDI_CH3_PKT_RMA_RNDV_CLR_TO_SEND:
 1853     case MPIDI_CH3_PKT_CUDA_CTS_CONTI:
 1854     case MPIDI_CH3_PKT_GET:
 1855     case MPIDI_CH3_PKT_GET_RNDV:
 1856     case MPIDI_CH3_PKT_ACCUMULATE_RNDV:
 1857     case MPIDI_CH3_PKT_GET_ACCUM:
 1858     case MPIDI_CH3_PKT_LOCK:
 1859     case MPIDI_CH3_PKT_LOCK_ACK:
 1860     case MPIDI_CH3_PKT_LOCK_OP_ACK:
 1861     case MPIDI_CH3_PKT_UNLOCK:
 1862     case MPIDI_CH3_PKT_FLUSH:
 1863     case MPIDI_CH3_PKT_ACK:
 1864     case MPIDI_CH3_PKT_DECR_AT_COUNTER:
 1865     case MPIDI_CH3_PKT_FOP:
 1866     case MPIDI_CH3_PKT_FOP_RESP:
 1867     case MPIDI_CH3_PKT_FOP_RESP_IMMED:
 1868     case MPIDI_CH3_PKT_FOP_IMMED:
 1869     case MPIDI_CH3_PKT_CAS_IMMED:
 1870     case MPIDI_CH3_PKT_CAS_RESP_IMMED:
 1871     case MPIDI_CH3_PKT_GET_ACCUM_RNDV:
 1872     case MPIDI_CH3_PKT_GET_ACCUM_IMMED:
 1873     case MPIDI_CH3_PKT_GET_ACCUM_RESP_IMMED:
 1874     case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
 1875     case MPIDI_CH3_PKT_RNDV_R3_ACK:
 1876     case MPIDI_CH3_PKT_ZCOPY_FINISH:
 1877     case MPIDI_CH3_PKT_ZCOPY_ACK:
 1878         DEBUG_PRINT("[process send] get %d\n", p->type);
 1879         if (v->padding == NORMAL_VBUF_FLAG) {
 1880             MRAILI_Release_vbuf(v);
 1881         }
 1882         else v->padding = FREE_FLAG;
 1883         break;
 1884    case MPIDI_CH3_PKT_GET_ACCUM_RESP:
 1885         req = v->sreq;
 1886         v->sreq = NULL;
 1887         if (NULL != req) {
 1888             MPIDI_CH3I_MRAILI_RREQ_RNDV_FINISH(req);
 1889 
 1890             MPIDI_CH3U_Handle_send_req(vc, req, &complete);
 1891             if (complete != TRUE) {
 1892                 ibv_error_abort(IBV_STATUS_ERR, "Got an incomplete eager send request\n");
 1893             }
 1894         }
 1895 
 1896         if (v->padding == NORMAL_VBUF_FLAG) {
 1897             MRAILI_Release_vbuf(v);
 1898         } else {
 1899             v->padding = FREE_FLAG;
 1900         }
 1901         break;
 1902    case MPIDI_CH3_PKT_CLOSE:  /*24*/
 1903         DEBUG_PRINT("[process send] get %d\n", p->type);
 1904         vc->pending_close_ops -= 1;
 1905         if (vc->disconnect == 1 && vc->pending_close_ops == 0)
 1906         {
 1907             mpi_errno = MPIDI_CH3_Connection_terminate(vc);
 1908             if(mpi_errno)
 1909             {
 1910               MPIR_ERR_POP(mpi_errno);
 1911             }
 1912         }
 1913 
 1914         if (v->padding == NORMAL_VBUF_FLAG) {
 1915             MRAILI_Release_vbuf(v);
 1916         }
 1917         else {
 1918             v->padding = FREE_FLAG;
 1919         }
 1920         break;
 1921     default:
 1922         dump_vbuf("unknown packet (send finished)", v);
 1923         ibv_va_error_abort(IBV_STATUS_ERR,
 1924                          "Unknown packet type %d in MRAILI_Process_send "
 1925                          "(for reference, MPIDI_CH3_PKT_FOP is %d)", p->type, MPIDI_CH3_PKT_FOP);
 1926     }
 1927     DEBUG_PRINT("return from process send\n");
 1928 
 1929 fn_exit:
 1930     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_PROCESS_SEND);
 1931     return mpi_errno;
 1932 
 1933 fn_fail:
 1934     goto fn_exit;
 1935 }
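/*
 * MRAILI_Send_noop: posts a NOOP packet on the given rail.  NOOPs carry
 * only the piggybacked credit information; see the comment below on why
 * they bypass the backlog queue.
 */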
 1936 #undef FUNCNAME
 1937 #define FUNCNAME MRAILI_Send_noop
 1938 #undef FCNAME
 1939 #define FCNAME MPL_QUOTE(FUNCNAME)
 1940 void MRAILI_Send_noop(MPIDI_VC_t * c, int rail)
 1941 {
 1942     /* Always send a noop when it is needed, even if there is a backlog:
 1943      * noops do not consume credits, and sending one is necessary to
 1944      * avoid credit deadlock.  An RNR NAK will protect us if the receiver
 1945      * is low on buffers.  By doing this we can force a noop ahead of any
 1946      * other queued packets.
 1947      */
 1948 
 1949     vbuf* v = get_vbuf_by_offset(MV2_RECV_VBUF_POOL_OFFSET);
 1950 
 1951     MPIDI_CH3I_MRAILI_Pkt_noop* p = (MPIDI_CH3I_MRAILI_Pkt_noop *) v->pheader;
 1952 
 1953     MPIDI_STATE_DECL(MPID_STATE_MRAILI_SEND_NOOP);
 1954     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_SEND_NOOP);
 1955 
 1956     p->type = MPIDI_CH3_PKT_NOOP;
 1957     vbuf_init_send(v, sizeof(MPIDI_CH3I_MRAILI_Pkt_noop), rail);
 1958     mv2_MPIDI_CH3I_RDMA_Process.post_send(c, v, rail);
 1959     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_SEND_NOOP);
 1960 }
 1961 
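/*
 * MRAILI_Send_noop_if_needed: sends an explicit credit-update NOOP when
 * too many credits have accumulated locally to wait for piggybacking,
 * or when the peer is running low.  Not needed with SRQ, which does not
 * use per-VC credit flow control.  A rough sketch of the trigger (the
 * thresholds are tunable runtime parameters, field names abbreviated):
 *
 *     if (local_credit >= rdma_dynamic_credit_threshold
 *         || rdma_credit > num_rdma_buffer / 2
 *         || (remote_cc <= rdma_credit_preserve
 *             && local_credit >= rdma_credit_notify_threshold))
 *         MRAILI_Send_noop(vc, rail);
 */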
 1962 #undef FUNCNAME
 1963 #define FUNCNAME MRAILI_Send_noop_if_needed
 1964 #undef FCNAME
 1965 #define FCNAME MPL_QUOTE(FUNCNAME)
 1966 int MRAILI_Send_noop_if_needed(MPIDI_VC_t * vc, int rail)
 1967 {
 1968     MPIDI_STATE_DECL(MPID_STATE_MRAILI_SEND_NOOP_IF_NEEDED);
 1969     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_SEND_NOOP_IF_NEEDED);
 1970 
 1971     if (mv2_MPIDI_CH3I_RDMA_Process.has_srq
 1972      || vc->ch.state != MPIDI_CH3I_VC_STATE_IDLE)
 1973         return MPI_SUCCESS;
 1974 
 1975     DEBUG_PRINT("[ibv_send] local credit %d, rdma credit %d\n",
 1976         vc->mrail.srp.credits[rail].local_credit,
 1977         vc->mrail.rfp.rdma_credit);
 1978 
 1979     if (vc->mrail.srp.credits[rail].local_credit >=
 1980         rdma_dynamic_credit_threshold
 1981         || vc->mrail.rfp.rdma_credit > num_rdma_buffer / 2
 1982         || (vc->mrail.srp.credits[rail].remote_cc <=
 1983             rdma_credit_preserve
 1984             && vc->mrail.srp.credits[rail].local_credit >=
 1985             rdma_credit_notify_threshold)
 1986         ) {
 1987         MRAILI_Send_noop(vc, rail);
 1988     } 
 1989     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_SEND_NOOP_IF_NEEDED);
 1990     return MPI_SUCCESS;
 1991 }
 1992 
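/*
 * MRAILI_RDMA_Get: issues an RDMA read of nbytes from remote_addr/rkey
 * into local_addr/lkey on the chosen rail.  Like the other post paths,
 * it falls back to the extended send queue when no send WQE is
 * available or the CQ is close to overflow.
 */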
 1993 #undef FUNCNAME
 1994 #define FUNCNAME MRAILI_RDMA_Get
 1995 #undef FCNAME
 1996 #define FCNAME MPL_QUOTE(FUNCNAME)
 1997 void MRAILI_RDMA_Get(   MPIDI_VC_t * vc, vbuf *v,
 1998                         char * local_addr, uint32_t lkey,
 1999                         char * remote_addr, uint32_t rkey,
 2000                         int nbytes, int rail
 2001                     )
 2002 {
 2003     char cq_overflow = 0;
 2004 
 2005     MPIDI_STATE_DECL(MPID_STATE_MRAILI_RDMA_GET);
 2006     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_RDMA_GET);
 2007 
 2008     DEBUG_PRINT("MRAILI_RDMA_Get: RDMA Read, "
 2009             "remote addr %p, rkey 0x%x, nbytes %d, hca %d\n",
 2010             remote_addr, rkey, nbytes, vc->mrail.rails[rail].hca_index);
 2011 
 2012     vbuf_init_rget(v, (void *)local_addr, lkey,
 2013                    remote_addr, rkey, nbytes, rail);
 2014     
 2015     v->vc = (void *)vc;
 2016 
 2017     XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
 2018     
 2019     cq_overflow = check_cq_overflow(vc, rail);
 2020 
 2021     if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
 2022         --vc->mrail.rails[rail].send_wqes_avail;
 2023         IBV_POST_SR(v, vc, rail, "MRAILI_RDMA_Get");
 2024     } else {
 2025         MRAILI_Ext_sendq_enqueue(vc,rail, v);
 2026     }
 2027 
 2028     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_RDMA_GET);
 2029     return;
 2030 }
 2031 
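/*
 * MRAILI_RDMA_Put: issues an RDMA write of nbytes from local_addr/lkey
 * to remote_addr/rkey on the chosen rail, with the same extended-send-
 * queue fallback as MRAILI_RDMA_Get.
 */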
 2032 #undef FUNCNAME
 2033 #define FUNCNAME MRAILI_RDMA_Put
 2034 #undef FCNAME
 2035 #define FCNAME MPL_QUOTE(FUNCNAME)
 2036 void MRAILI_RDMA_Put(   MPIDI_VC_t * vc, vbuf *v,
 2037                         char * local_addr, uint32_t lkey,
 2038                         char * remote_addr, uint32_t rkey,
 2039                         int nbytes, int rail
 2040                     )
 2041 {
 2042     char cq_overflow = 0;
 2043 
 2044     MPIDI_STATE_DECL(MPID_STATE_MRAILI_RDMA_PUT);
 2045     MPIDI_FUNC_ENTER(MPID_STATE_MRAILI_RDMA_PUT);
 2046 
 2047     DEBUG_PRINT("MRAILI_RDMA_Put: RDMA write, "
 2048             "remote addr %p, rkey 0x%x, nbytes %d, hca %d\n",
 2049             remote_addr, rkey, nbytes, vc->mrail.rails[rail].hca_index);
 2050 
 2051     vbuf_init_rput(v, (void *)local_addr, lkey,
 2052                    remote_addr, rkey, nbytes, rail);
 2053     
 2054     v->vc = (void *)vc;
 2055     XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
 2056  
 2057     cq_overflow = check_cq_overflow(vc, rail);
 2058 
 2059     if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
 2060         --vc->mrail.rails[rail].send_wqes_avail;
 2061         IBV_POST_SR(v, vc, rail, "MRAILI_RDMA_Put");
 2062     } else {
 2063         MRAILI_Ext_sendq_enqueue(vc,rail, v);
 2064     }
 2065 
 2066     MPIDI_FUNC_EXIT(MPID_STATE_MRAILI_RDMA_PUT);
 2067     return;
 2068 }
 2069 
 2070 
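/*
 * vbuf_address_send: advertises this process's RDMA fast-path receive
 * buffer to the peer.  The packet carries the buffer's base address and
 * one rkey per HCA so the peer can target it with eager RDMA writes.
 */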
 2071 void vbuf_address_send(MPIDI_VC_t *vc)
 2072 {
 2073     int rail, i;
 2074 
 2075     vbuf* v = NULL;
 2076     GET_VBUF_BY_OFFSET_WITHOUT_LOCK(v, MV2_SMALL_DATA_VBUF_POOL_OFFSET);
 2077     MPIDI_CH3_Pkt_address_t* p = (MPIDI_CH3_Pkt_address_t *) v->pheader;
 2078 
 2079     rail = MRAILI_Send_select_rail(vc);
 2080     p->type = MPIDI_CH3_PKT_ADDRESS;
 2081     p->rdma_address = (unsigned long)vc->mrail.rfp.RDMA_recv_buf_DMA;
 2082 
 2083     for (i = 0; i < rdma_num_hcas; i++) {
 2084         DEBUG_PRINT("mr %p\n", vc->mrail.rfp.RDMA_recv_buf_mr[i]);
 2085         p->rdma_hndl[i] = vc->mrail.rfp.RDMA_recv_buf_mr[i]->rkey;
 2086     }
 2087     vbuf_init_send(v, sizeof(MPIDI_CH3_Pkt_address_t), rail);
 2088     mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, rail);
 2089 }
 2090 
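/*
 * vbuf_address_reply_send: returns a one-byte reply code for an earlier
 * MPIDI_CH3_PKT_ADDRESS advertisement.
 */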
 2091 void vbuf_address_reply_send(MPIDI_VC_t *vc, uint8_t data)
 2092 {
 2093     int rail;
 2094 
 2095     vbuf *v = NULL;
 2096     GET_VBUF_BY_OFFSET_WITHOUT_LOCK(v, MV2_SMALL_DATA_VBUF_POOL_OFFSET);
 2097     MPIDI_CH3_Pkt_address_reply_t *p = (MPIDI_CH3_Pkt_address_reply_t *) v->pheader;
 2098 
 2099     rail = MRAILI_Send_select_rail(vc);
 2100     p->type = MPIDI_CH3_PKT_ADDRESS_REPLY;
 2101     p->reply_data = data;
 2102     
 2103     vbuf_init_send(v, sizeof(MPIDI_CH3_Pkt_address_reply_t), rail);
 2104     mv2_MPIDI_CH3I_RDMA_Process.post_send(vc, v, rail);
 2105 }
 2106 
 2107 
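/*
 * mv2_shm_coll_post_send: posts an already-initialized collective vbuf
 * on the rail.  If no send WQE (or CQ slot) is available, the vbuf is
 * enqueued on the extended send queue and MPI_MRAIL_MSG_QUEUED is
 * returned; the send then completes later as WQEs are freed.
 */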
 2108 int mv2_shm_coll_post_send(vbuf *v, int rail, MPIDI_VC_t * vc)
 2109 { 
 2110     char cq_overflow = 0;
 2111     int mpi_errno = MPI_SUCCESS;
 2112 
 2113     v->rail = rail; 
 2114 
 2115     cq_overflow = check_cq_overflow(vc, rail);
 2116 
 2117     if (likely(vc->mrail.rails[rail].send_wqes_avail > 0 && !cq_overflow)) {
 2118         --vc->mrail.rails[rail].send_wqes_avail;
 2119 
 2120         IBV_POST_SR(v, vc, rail, "ibv_post_sr (post_fast_rdma)");
 2121         DEBUG_PRINT("[send:post rdma] desc posted\n");
 2122     } else {
 2123         DEBUG_PRINT("[send: rdma_send] Warning! no send wqe or send cq available\n");
 2124         MRAILI_Ext_sendq_enqueue(vc, rail, v);
 2125         mpi_errno = MPI_MRAIL_MSG_QUEUED;
 2126     }
 2127 
 2128     return mpi_errno; 
 2129 }
 2130 
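/*
 * mv2_shm_coll_prepare_post_send: hand-builds a raw RDMA-write work
 * request for a shared-memory collective (no MPI packet header).
 * Payloads that fit in rdma_max_inline_size are posted with
 * IBV_SEND_INLINE, so the data is copied into the work request at post
 * time and the HCA does not need to DMA it from local_rdma_addr.
 */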
 2131 void mv2_shm_coll_prepare_post_send(uint64_t local_rdma_addr, uint64_t remote_rdma_addr, 
 2132                       uint32_t local_rdma_key, uint32_t remote_rdma_key, 
 2133                       int len, int rail, MPIDI_VC_t * vc)
 2134 {
 2135     vbuf *v=NULL;
 2136     GET_VBUF_BY_OFFSET_WITHOUT_LOCK(v, MV2_SMALL_DATA_VBUF_POOL_OFFSET);
 2137     v->desc.u.sr.next = NULL;
 2138     v->desc.u.sr.opcode = IBV_WR_RDMA_WRITE;
 2139     if (likely(len <= rdma_max_inline_size)) {
 2140         v->desc.u.sr.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED;
 2141     } else {
 2142         v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
 2143     }
 2144     v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_rdma_addr;
 2145     v->desc.u.sr.wr.rdma.rkey = remote_rdma_key;
 2146     v->desc.u.sr.wr_id = (uintptr_t) v;
 2147     v->desc.u.sr.num_sge = 1;
 2148     v->desc.u.sr.sg_list = &(v->desc.sg_entry);
 2149     v->desc.sg_entry.length = len;
 2150 
 2151     v->desc.sg_entry.lkey = local_rdma_key;
 2152     v->desc.sg_entry.addr = (uintptr_t) local_rdma_addr;
 2153     v->padding = COLL_VBUF_FLAG;
 2154     v->vc   = vc;
 2155     XRC_FILL_SRQN_FIX_CONN (v, vc, rail);
 2156     mv2_shm_coll_post_send(v, rail, vc);
 2157 
 2158     return;
 2159 }
 2160 
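/*
 * mv2_shm_coll_reg_buffer: registers one buffer with every HCA in use,
 * rolling back the registrations made so far (and leaving
 * *buffer_registered at 0) if any HCA fails.  A minimal usage sketch
 * (hypothetical caller, not part of this file):
 *
 *     struct ibv_mr *mrs[MAX_NUM_HCAS];
 *     int registered = 0;
 *     mv2_shm_coll_reg_buffer(buf, size, mrs, &registered);
 *     if (registered) {
 *         ... use mrs[i]->lkey / mrs[i]->rkey for RDMA ...
 *     }
 */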
 2161 int mv2_shm_coll_reg_buffer(void *buffer, int size, struct ibv_mr *mem_handle[], 
 2162                            int *buffer_registered)
 2163 {
 2164     int i = 0, j = 0;
 2165     int mpi_errno = MPI_SUCCESS;
 2166 
 2167     for (i = 0; i < rdma_num_hcas; i++) {
 2168         mem_handle[i] = (struct ibv_mr *) register_memory(buffer, size, i);
 2169 
 2170         if (!mem_handle[i]) {
 2171             /* de-register what was already registered with other HCAs;
 2172              * use a separate index so the outer loop counter is not
 2173              * clobbered, and return early so the failure is not masked */
 2174             for (j = i - 1; j >= 0; --j) {
 2175                 if (mem_handle[j] != NULL) {
 2176                     deregister_memory(mem_handle[j]);
 2177                 }
 2178             }
 2179             *buffer_registered = 0;
 2180             return mpi_errno;
 2181         }
 2182     }
 2183     *buffer_registered = 1;
 2182 
 2183     return mpi_errno;
 2184 }
 2185 
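/*
 * mv2_shm_coll_dereg_buffer: counterpart of mv2_shm_coll_reg_buffer;
 * deregisters each non-NULL handle and clears it, aborting if the
 * verbs deregistration fails.
 */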
 2186 int mv2_shm_coll_dereg_buffer(struct ibv_mr *mem_handle[])
 2187 { 
 2188    int i=0, mpi_errno = MPI_SUCCESS;
 2189    for ( i = 0 ; i < rdma_num_hcas; i ++ ) {
 2190        if (mem_handle[i] != NULL) {
 2191            if (deregister_memory(mem_handle[i])) { 
 2192                ibv_error_abort(IBV_RETURN_ERR,
 2193                                         "deregistration failed\n");
 2194            }
 2195            mem_handle[i] = NULL;
 2196        }
 2197    }
 2198    return mpi_errno; 
 2199 }