"Fossies" - the Fresh Open Source Software Archive

Member "hpcc-1.5.0/PTRANS/cblacslt.c" (18 Mar 2016, 22238 Bytes) of package /linux/privat/hpcc-1.5.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "cblacslt.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.5.0b_vs_1.5.0.

/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */
/*

cblacslt.c

-- V0.0 Stripped-down BLACS routines --
University of Tennessee, October, 2003
Written by Piotr Luszczek.

*/

#include <hpcc.h>

#include <mpi.h>

#include "cblacslt.h"

#define DPRN(i,v) do{printf(__FILE__ "(%d)@%d:" #v "=%g\n",__LINE__,i,(double)(v));fflush(stdout);}while(0)

/* ---------------------------------------------------------------------- */

/*FIXME: what about parameter checking: context, etc? have a macro too? */
#define CBLACS_INIT if (! CblacsInitialized) CblacsInit(); else if (CblacsFinalized) do{CblacsWarn();return;}while(0)
#define CBLACS_INIT1(v) if (! CblacsInitialized) CblacsInit();else if (CblacsFinalized)do{CblacsWarn();return(v);}while(0)
#define CblacsWarn() CblacsWarnImp( __FILE__, __LINE__ )

static int CblacsInitialized = 0, CblacsFinalized;

double
dcputime00(void) {return HPL_ptimer_cputime();}
double
dwalltime00(void) {return MPI_Wtime();}

static void
CblacsWarnImp(char *file, int line) {
  int rank;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  printf( "%s(%d)@%d: CBLACS warning.\n", file, line, rank );
  fflush(stdout);
}

static struct {MPI_Comm comm, rowComm, colComm; unsigned int taken;} CblacsComms[10];
static int CblacsNComms;

#define CBLACS_CHK_CTXT(v) if (ctxt < 1 || ctxt > CblacsNComms) return v
static MPI_Comm
CblacsGetComm(int ctxt) {
  CBLACS_CHK_CTXT(MPI_COMM_NULL);
  return CblacsComms[ctxt - 1].comm;
}
static MPI_Comm
CblacsGetRowComm(int ctxt) {
  CBLACS_CHK_CTXT(MPI_COMM_NULL);
  return CblacsComms[ctxt - 1].rowComm;
}
static MPI_Comm
CblacsGetColComm(int ctxt) {
  CBLACS_CHK_CTXT(MPI_COMM_NULL);
  return CblacsComms[ctxt - 1].colComm;
}

static int
CblacsSetComm(int ctxt, MPI_Comm comm) {
  CBLACS_CHK_CTXT(-1);
  CblacsComms[ctxt - 1].comm = comm;
  return 0;
}
static int
CblacsSetRowComm(int ctxt, MPI_Comm comm) {
  CBLACS_CHK_CTXT(-1);
  CblacsComms[ctxt - 1].rowComm = comm;
  return 0;
}
static int
CblacsSetColComm(int ctxt, MPI_Comm comm) {
  CBLACS_CHK_CTXT(-1);
  CblacsComms[ctxt - 1].colComm = comm;
  return 0;
}

static int
CblacsNewCtxt() {
  int i;

  /* slot 0 is reserved for the system context, so the search starts at 1;
   * context handles are 1-based and a return value of 0 means no free slot */
  for (i = 1; i < CblacsNComms; i++)
    if (! CblacsComms[i].taken) {
      CblacsComms[i].taken = 1;
      return i + 1;
    }

  return 0;
}

static int
CblacsDeleteCtxt(int *ctxtP) {
  int idx = *ctxtP - 1;

  if (idx < 1 || idx >= CblacsNComms) {
    CblacsWarn();
    return -1;
  }
  if (0 == CblacsComms[idx].taken) {
    CblacsWarn();
    return -1;
  }

  if (MPI_COMM_NULL != CblacsComms[idx].colComm) MPI_Comm_free( &(CblacsComms[idx].colComm) );
  if (MPI_COMM_NULL != CblacsComms[idx].rowComm) MPI_Comm_free( &(CblacsComms[idx].rowComm) );
  if (MPI_COMM_NULL != CblacsComms[idx].comm)    MPI_Comm_free( &(CblacsComms[idx].comm) );

  CblacsComms[idx].taken = 0;

  *ctxtP = 0; /* deleted contexts are 0 */

  return 0;
}

/*
static void *
CblacsNewBuf(int count, int esize) {
  return malloc( count * esize );
}
*/
#define CblacsNewBuf(c,s) malloc((c)*(s))
#define CblacsDeleteBuf(b) free(b)

static int
CblacsInit() {
  int i, flag;

  if (MPI_SUCCESS != MPI_Initialized( &flag ) || ! flag) {CblacsWarn();return 1;}

  CblacsInitialized = 1;
  CblacsFinalized = 0;

  CblacsNComms = 10;
  for (i = 0; i < CblacsNComms; i++) {
    CblacsComms[i].comm    = MPI_COMM_NULL;
    CblacsComms[i].rowComm = MPI_COMM_NULL;
    CblacsComms[i].colComm = MPI_COMM_NULL;
    CblacsComms[i].taken   = 0;
  }
  /* FIXME: setup system context to be a cartesian grid with row and column comm's*/
  CblacsComms[0].comm    = MPI_COMM_WORLD;
  CblacsComms[0].rowComm = MPI_COMM_NULL;
  CblacsComms[0].colComm = MPI_COMM_NULL;
  CblacsComms[0].taken   = 1;

  return 0;
}

void
Cblacs_pinfo(int *mypnum, int *nprocs) {
  CBLACS_INIT;
  MPI_Comm_rank( MPI_COMM_WORLD, mypnum );
  MPI_Comm_size( MPI_COMM_WORLD, nprocs );
}

void
Cblacs_exit(int NotDone) {
  CBLACS_INIT;
  CblacsFinalized = 1; /* mark CBLACS finalized so later calls only warn (was 0, which defeated the guard) */
  if (! NotDone) MPI_Finalize();
}

void
Cblacs_abort(int ConTxt, int ErrNo) {
  int nprow, npcol, myrow, mycol, rank;

  CBLACS_INIT;

  MPI_Comm_rank( MPI_COMM_WORLD, &rank );

  Cblacs_gridinfo(ConTxt, &nprow, &npcol, &myrow, &mycol);
  fprintf(stderr, "{%d,%d}, pnum=%d, Contxt=%d, killed other procs, exiting with error #%d.\n\n",
          myrow, mycol, rank, ConTxt, ErrNo);

  fflush(stderr);
  fflush(stdout);
  MPI_Abort( MPI_COMM_WORLD, ErrNo );
}

void
Cblacs_get(int ConTxt, int what, int *val) {
  CBLACS_INIT;

  switch (what) {
    case SGET_SYSCONTXT:
      *val = 1;
      break;
    default:
      *val = -1;
      CblacsWarn();
      break;
  }
}

static int
CblacsGridNew(int nprow, int npcol, int *ConTxt, MPI_Comm *comm) {
  int size;

  CBLACS_INIT1(-1);

  *comm = CblacsGetComm(*ConTxt);
  if (MPI_COMM_NULL == *comm) return -1;

  MPI_Comm_size( *comm, &size );
  if (nprow < 1 || nprow > size) return -1;
  if (npcol < 1 || npcol > size) return -1;
  if (nprow * npcol > size) return -1;

  *ConTxt = CblacsNewCtxt();

  return 0;
}

void
Cblacs_gridmap(int *ConTxt, int *umap, int ldumap, int nprow, int npcol) {
  int i, j, np_me, npall, npwho, myrow, mycol, color, key, rv;
  MPI_Comm comm, newComm, rowComm, colComm;

  if (CblacsGridNew( nprow, npcol, ConTxt, &comm )) {
    CblacsWarn();
    goto gmapErr;
  }

  Cblacs_pinfo( &np_me, &npall );

  myrow = mycol = -1;
  color = MPI_UNDEFINED;
  key = 0;
  for (i = 0; i < nprow; ++i)
    for (j = 0; j < npcol; ++j) {
      npwho = umap[j + i * ldumap];
      if (np_me == npwho) {
        color = 0;
        key = j + i * npcol;
        myrow = i;
        mycol = j;
        goto gmapFound;
      }
    }

  gmapFound:

  /* communicator of all grid processes */
  rv = MPI_Comm_split( comm, color, key, &newComm );
  if (MPI_SUCCESS != rv) {
    /* make contexts for non-participating processes a 0 value so gridinfo() works correctly */
    CblacsDeleteCtxt( ConTxt );
    goto gmapErr;
  }
  CblacsSetComm( *ConTxt, newComm );
  if (MPI_COMM_NULL == newComm) { /* this process does not participate in this grid */
    CblacsDeleteCtxt( ConTxt );
    return;
  }

  /* row communicator */
  rv = MPI_Comm_split( newComm, myrow, mycol, &rowComm );
  if (MPI_SUCCESS != rv) {
    CblacsDeleteCtxt( ConTxt );
    goto gmapErr;
  }
  CblacsSetRowComm( *ConTxt, rowComm );

  /* column communicator */
  rv = MPI_Comm_split( newComm, mycol, myrow, &colComm );
  if (MPI_SUCCESS != rv) {
    CblacsDeleteCtxt( ConTxt );
    goto gmapErr;
  }
  CblacsSetColComm( *ConTxt, colComm );

  return;

  gmapErr:

  *ConTxt = 0;
  CblacsWarn();
  return;
}

void
Cblacs_gridexit(int ConTxt) {
  CBLACS_INIT;
  CblacsDeleteCtxt( &ConTxt );
}

void
Cblacs_gridinfo(int ConTxt, int *nprow, int *npcol, int *myrow, int *mycol) {
  MPI_Comm comm;

  CBLACS_INIT;

  comm = CblacsGetComm( ConTxt );

  /* deleted contexts (or the contexts for non-participating processes) are 0 */
  if (MPI_COMM_NULL == comm) {
    *nprow = *npcol = *myrow = *mycol = -1;
  } else {
    MPI_Comm_size( CblacsGetRowComm(ConTxt), npcol );
    MPI_Comm_rank( CblacsGetRowComm(ConTxt), mycol );
    MPI_Comm_size( CblacsGetColComm(ConTxt), nprow );
    MPI_Comm_rank( CblacsGetColComm(ConTxt), myrow );
  }
}
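
/*
 * Illustrative sketch only (not part of the original source): how the grid
 * management routines above are typically combined.  The 2 x 2 grid shape and
 * the name `exampleGridSetup' are assumptions made for this sketch.
 */
#if 0
static void
exampleGridSetup(void) {
  int ctxt, me, np, i, nprow = 2, npcol = 2, umap[4];
  int gnprow, gnpcol, myrow, mycol;

  Cblacs_pinfo( &me, &np );               /* my rank and the process count */
  Cblacs_get( 0, SGET_SYSCONTXT, &ctxt ); /* system context (1 in this implementation) */

  /* row-major map of MPI ranks onto the nprow x npcol grid; ldumap == npcol */
  for (i = 0; i < nprow * npcol; i++) umap[i] = i;

  Cblacs_gridmap( &ctxt, umap, npcol, nprow, npcol );
  Cblacs_gridinfo( ctxt, &gnprow, &gnpcol, &myrow, &mycol );
  /* non-participating processes get ctxt == 0 and coordinates of -1 */

  if (ctxt) Cblacs_gridexit( ctxt );
}
#endif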

/* ---------------------------------------------------------------------- */
/* Communication routines */

void
Cblacs_barrier(int ConTxt, char *scope) {
  MPI_Comm comm;
  CBLACS_INIT;

  switch (*scope) {
    case 'A': case 'a': comm = CblacsGetComm( ConTxt );    break;
    case 'C': case 'c': comm = CblacsGetColComm( ConTxt ); break;
    case 'R': case 'r': comm = CblacsGetRowComm( ConTxt ); break;
    default: comm = MPI_COMM_NULL; CblacsWarn(); break;
  }
  if (MPI_COMM_NULL == comm) {
    CblacsWarn();
    return;
  }

  MPI_Barrier( comm );
}

static void
Cvgred2d(int ConTxt, char *scope, int m, int n, void *A, int lda, int rowRank,
  int colRank, MPI_Datatype dtype, int dsize, MPI_Op op) {
  int j, rank, root, count, coords[2], dest_rank, npcol;
  void *sbuf, *rbuf;
  MPI_Comm comm;

  /* if the answer should be left on all processes */
  if (-1 == rowRank || -1 == colRank) root = 0;
  else root = 1;

  switch (*scope) {
    case 'A': case 'a':
      comm = CblacsGetComm( ConTxt );
      coords[0] = rowRank;
      coords[1] = colRank;
      MPI_Comm_size( CblacsGetRowComm( ConTxt ), &npcol );
      dest_rank = colRank + rowRank * npcol;
      break;
    case 'C': case 'c':
      comm = CblacsGetColComm( ConTxt );
      coords[0] = rowRank;
      dest_rank = rowRank;
      break;
    case 'R': case 'r':
      comm = CblacsGetRowComm( ConTxt );
      coords[0] = colRank;
      dest_rank = colRank;
      break;
    default: comm = MPI_COMM_NULL; CblacsWarn(); break;
  }
  if (MPI_COMM_NULL == comm) {
    CblacsWarn();
    return;
  }
  /* if not leave-on-all then get rank of the destination */
  if (root) root = dest_rank; /* MPI_Cart_rank( comm, coords, &root ); */
  else root = MPI_PROC_NULL;

  /* FIXME: what if contiguous buffer cannot be allocated */
  count = m * n;
  if (m == lda || n == 1) sbuf = A; /* A is contiguous, reuse it */
  else {
    /* a new data type could be created to reflect layout of `A' but then the
     * receiving buffer would have to be the same, and if `lda' is large in
     * comparison to `m' then it might be unfeasible */
    sbuf = CblacsNewBuf( count, dsize );
    for (j = 0; j < n; j++)
      memcpy( (char *)sbuf + j * m * dsize, (char *)A + j * lda * dsize, m * dsize );
  }
  rbuf = CblacsNewBuf( count, dsize );

  if (MPI_PROC_NULL == root) {
    MPI_Allreduce( sbuf, rbuf, count, dtype, op, comm );
  } else {
    MPI_Reduce( sbuf, rbuf, count, dtype, op, root, comm );
    MPI_Comm_rank( comm, &rank );
  }
  if (MPI_PROC_NULL == root || root == rank) {
    if (A == sbuf) memcpy( A, rbuf, count * dsize ); /* A is contiguous */
    else {
      for (j = 0; j < n; j++)
        memcpy( (char *)A + j * lda * dsize, (char *)rbuf + j * m * dsize, m * dsize );
    }
  }

  CblacsDeleteBuf( rbuf );
  if (sbuf != A) CblacsDeleteBuf( sbuf );
}

/*
 *  Purpose
 *
 *  Combine sum operation for double precision rectangular matrices.
 *
 *  Arguments
 *
 *  ConTxt  (input) int
 *          Index into MyConTxts00 (my contexts array).
 *
 *  scope   (input) Ptr to char
 *          Limit the scope of the operation.
 *          = 'R' :   Operation is performed by a process row.
 *          = 'C' :   Operation is performed by a process column.
 *          = 'A' :   Operation is performed by all processes in grid.
 *          If both `rdest' and `cdest' are not -1, then for 'R' scope `rdest'
 *          is ignored and for 'C' scope `cdest' is ignored (the calling
 *          process's own row or column is used, respectively).
 *
 *  top     (input) Ptr to char
 *          Controls fashion in which messages flow within the operation.
 *
 *  m       (input) int
 *          The number of rows of the matrix A.  m >= 0.
 *
 *  n       (input) int
 *          The number of columns of the matrix A.  n >= 0.
 *
 *  A       (output) Ptr to double precision two dimensional array
 *          The m by n matrix A.  Fortran 77 (column-major) storage
 *          assumed.
 *
 *  lda     (input) int
 *          The leading dimension of the array A.  lda >= m.
 *
 *  rdest   (input) int
 *          The process row of the destination of the sum.
 *          If rdest == -1, then result is left on all processes in scope.
 *
 *  cdest   (input) int
 *          The process column of the destination of the sum.
 *          If cdest == -1, then result is left on all processes in scope.
 */
void
Cdgsum2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rdest, int cdest){
  CBLACS_INIT;
  Cvgred2d( ConTxt, scope, m, n, A, lda, rdest, cdest, MPI_DOUBLE, sizeof(double), MPI_SUM );
}
void
Cigsum2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int rdest, int cdest){
  CBLACS_INIT;
  Cvgred2d( ConTxt, scope, m, n, A, lda, rdest, cdest, MPI_INT, sizeof(int), MPI_SUM );
}
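
/*
 * Illustrative sketch only (not part of the original source): calling Cdgsum2d
 * to sum a 2 x 2 local block across a process row, leaving the result on every
 * process in the row (rdest = cdest = -1).  `ctxt' is assumed to be a grid
 * context obtained from Cblacs_gridmap.
 */
#if 0
static void
exampleRowSum(int ctxt) {
  double a[4] = { 1.0, 2.0, 3.0, 4.0 }; /* 2 x 2 column-major block, lda == 2 */

  Cdgsum2d( ctxt, "R", " ", 2, 2, a, 2, -1, -1 );
  /* each a[i] now holds the sum of that entry over the process row */
}
#endif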

void
CblacsAbsMax(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
  int i, n = *len; double *dinvec, *dinoutvec;
  if (MPI_DOUBLE == *datatype) {
    dinvec = (double *)invec;
    dinoutvec = (double *)inoutvec;
    for (i = n; i; i--, dinvec++, dinoutvec++)
      if (fabs(*dinvec) > fabs(*dinoutvec)) *dinoutvec = *dinvec;
  } else
    CblacsWarn();
}
void
CblacsAbsMin(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
  int i, n = *len; double *dinvec, *dinoutvec;
  if (MPI_DOUBLE == *datatype) {
    dinvec = (double *)invec;
    dinoutvec = (double *)inoutvec;
    for (i = n; i; i--, dinvec++, dinoutvec++)
      if (fabs(*dinvec) < fabs(*dinoutvec)) *dinoutvec = *dinvec;
  } else
    CblacsWarn();
}

/*
 *  Purpose
 *
 *  Combine amx operation for double precision rectangular matrices.
 *
 *  Arguments
 *
 *  ConTxt  (input) int
 *          Index into MyConTxts00 (my contexts array).
 *
 *  SCOPE   (input) Ptr to char
 *          Limit the scope of the operation.
 *          = 'R' :   Operation is performed by a process row.
 *          = 'C' :   Operation is performed by a process column.
 *          = 'A' :   Operation is performed by all processes in grid.
 *
 *  TOP     (input) Ptr to char
 *          Controls fashion in which messages flow within the operation.
 *
 *  M       (input) int
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) int
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (output) Ptr to double precision two dimensional array
 *          The m by n matrix A.  Fortran 77 (column-major) storage
 *          assumed.
 *
 *  LDA     (input) int
 *          The leading dimension of the array A.  LDA >= M.
 *
 *  RA      (output) Integer Array, dimension (LDIA, N)
 *          Contains process row that the amx of each element
 *          of A was found on: i.e., rA(1,2) contains the process
 *          row that the amx of A(1,2) was found on.
 *          Values are left on process {rdest, cdest} only, others
 *          may be modified, but not left with interesting data.
 *          If rdest == -1, then result is left on all processes in scope.
 *          If LDIA == -1, this array is not accessed, and need not exist.
 *
 *  CA      (output) Integer Array, dimension (LDIA, N)
 *          Contains process column that the amx of each element
 *          of A was found on: i.e., cA(1,2) contains the process
 *          column that the max/min of A(1,2) was found on.
 *          Values are left on process {rdest, cdest} only, others
 *          may be modified, but not left with interesting data.
 *          If rdest == -1, then result is left on all processes in scope.
 *          If LDIA == -1, this array is not accessed, and need not exist.
 *
 *  LDIA    (input) int
 *          If (LDIA == -1), then the arrays RA and CA are not accessed.
 *          ELSE leading dimension of the arrays RA and CA.  LDIA >= M.
 *
 *  RDEST   (input) int
 *          The process row of the destination of the amx.
 *          If rdest == -1, then result is left on all processes in scope.
 *
 *  CDEST   (input) int
 *          The process column of the destination of the amx.
 *          If rdest == -1, then CDEST is ignored.
 */
void
Cdgamx2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA,
  int ldia, int rdest, int cdest) {
  MPI_Op op;
  CBLACS_INIT;
  if (ldia > 0) {CblacsWarn(); rA = cA; return;} /* no AMAX_LOC yet: the rA/cA location arrays are not supported */
  MPI_Op_create( CblacsAbsMax, 1, &op );
  Cvgred2d( ConTxt, scope, m, n, A, lda, rdest, cdest, MPI_DOUBLE, sizeof(double), op );
  MPI_Op_free( &op );
}
void
Cdgamn2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int *rA, int *cA,
  int ldia, int rdest, int cdest) {
  MPI_Op op;
  CBLACS_INIT;
  if (ldia > 0) {CblacsWarn(); rA = cA; return;} /* no AMIN_LOC yet: the rA/cA location arrays are not supported */
  MPI_Op_create( CblacsAbsMin, 1, &op );
  Cvgred2d( ConTxt, scope, m, n, A, lda, rdest, cdest, MPI_DOUBLE, sizeof(double), op );
  MPI_Op_free( &op );
}
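
/*
 * Illustrative sketch only (not part of the original source): calling Cdgamx2d
 * to compute the element-wise absolute maximum over the whole grid.  ldia is
 * -1 because the rA/cA location arrays are not supported here; `ctxt' is
 * assumed to be a grid context from Cblacs_gridmap.
 */
#if 0
static void
exampleAbsMax(int ctxt) {
  double a[2] = { -5.0, 3.0 }; /* 2 x 1 block, lda == 2 */

  Cdgamx2d( ctxt, "A", " ", 2, 1, a, 2, NULL, NULL, -1, -1, -1 );
  /* each a[i] now holds the entry with the largest |value| across all processes */
}
#endif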

void
Cblacs_dSendrecv(int ctxt, int mSrc, int nSrc, double *Asrc, int ldaSrc, int rdest, int cdest,
  int mDest, int nDest, double *Adest, int ldaDest, int rsrc, int csrc) {
  MPI_Comm comm, rowComm;
  MPI_Datatype typeSrc, typeDest;
  MPI_Status stat;
  int src, dest, dataIsContiguousSrc, dataIsContiguousDest, countSrc, countDest, npcol;
  long int lSrc = mSrc * (long int)nSrc, lDest = mDest * (long int)nDest, li;

  CBLACS_INIT;

  comm = CblacsGetComm( ctxt );
  if (MPI_COMM_NULL == comm) {CblacsWarn(); return;}

  if (mSrc == ldaSrc || 1 == nSrc) {
    dataIsContiguousSrc = 1;
    countSrc = mSrc * nSrc;
    typeSrc = MPI_DOUBLE;
  } else {
    dataIsContiguousSrc = 0;
    countSrc = 1;
    MPI_Type_vector( nSrc, mSrc, ldaSrc, MPI_DOUBLE, &typeSrc );
    MPI_Type_commit( &typeSrc );
  }
  if (mDest == ldaDest || 1 == nDest) {
    dataIsContiguousDest = 1;
    countDest = mDest * nDest;
    typeDest = MPI_DOUBLE;
  } else {
    dataIsContiguousDest = 0;
    countDest = 1;
    MPI_Type_vector( nDest, mDest, ldaDest, MPI_DOUBLE, &typeDest );
    MPI_Type_commit( &typeDest );
  }

  rowComm = CblacsGetRowComm( ctxt );
  MPI_Comm_size( rowComm, &npcol );
  dest = cdest + rdest * npcol;
  src  = csrc + rsrc * npcol;

  if (dataIsContiguousSrc && dataIsContiguousDest && lSrc == lDest) {
    int mlength, maxpayload = 12500000; /* 100 MB at a time */

    for (li = 0; li < lSrc; li += maxpayload) {
      mlength = maxpayload;
      if (lSrc - li < maxpayload)
        mlength = lSrc - li;
      MPI_Sendrecv( Asrc + li, mlength, typeSrc, dest, 0, Adest + li, mlength, typeDest, src, 0, comm, &stat );
    }
  } else {
    MPI_Sendrecv( Asrc, countSrc, typeSrc, dest, 0, Adest, countDest, typeDest, src, 0, comm,
                &stat ); /* IBM's (old ?) MPI doesn't have: MPI_STATUS_IGNORE */
  }

  if (! dataIsContiguousSrc) MPI_Type_free( &typeSrc );
  if (! dataIsContiguousDest) MPI_Type_free( &typeDest );
}
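
/*
 * Illustrative sketch only (not part of the original source): using
 * Cblacs_dSendrecv to swap a 2 x 2 block with the mirror process in the same
 * process row.  `ctxt' is assumed to be a grid context from Cblacs_gridmap.
 */
#if 0
static void
exampleSwapWithMirror(int ctxt) {
  int nprow, npcol, myrow, mycol, pcol;
  double mine[4] = { 1.0, 2.0, 3.0, 4.0 }, theirs[4];

  Cblacs_gridinfo( ctxt, &nprow, &npcol, &myrow, &mycol );
  pcol = (npcol - 1) - mycol; /* partner column in the same row */

  /* send `mine' to (myrow, pcol) and receive `theirs' from the same process */
  Cblacs_dSendrecv( ctxt, 2, 2, mine, 2, myrow, pcol,
                    2, 2, theirs, 2, myrow, pcol );
}
#endif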

static void
CblacsBcast(int ConTxt, char *scope, int m, int n, void *A, int lda, int rowRank, int colRank,
  MPI_Datatype baseType){
  MPI_Comm comm;
  MPI_Datatype type;
  int root, coords[2], dest_rank, npcol;

  /* if this process is the root of broadcast */
  if (-1 == rowRank || -1 == colRank) root = 0;
  else root = 1;

  switch (*scope) {
    case 'A': case 'a':
      comm = CblacsGetComm( ConTxt );
      coords[0] = rowRank;
      coords[1] = colRank;
      MPI_Comm_size( CblacsGetRowComm( ConTxt ), &npcol );
      dest_rank = colRank + rowRank * npcol;
      break;
    case 'C': case 'c':
      comm = CblacsGetColComm( ConTxt );
      coords[0] = rowRank;
      dest_rank = rowRank;
      break;
    case 'R': case 'r':
      comm = CblacsGetRowComm( ConTxt );
      coords[0] = colRank;
      dest_rank = colRank;
      break;
    default: comm = MPI_COMM_NULL; CblacsWarn(); break;
  }
  if (MPI_COMM_NULL == comm) {
    CblacsWarn();
    return;
  }

  /* if broadcast/receive */
  if (root) root = dest_rank; /* MPI_Cart_rank( comm, coords, &root ); */
  else MPI_Comm_rank( comm, &root ); /* else broadcast/send - I'm the root */

  MPI_Type_vector( n, m, lda, baseType, &type );
  MPI_Type_commit( &type );

  MPI_Bcast( A, 1, type, root, comm );

  MPI_Type_free( &type );
}

/*
 *  Purpose
 *
 *  Broadcast/send for general double precision arrays.
 *
 *  Arguments
 *
 *  ConTxt  (input) int
 *          Index into MyConTxts00 (my contexts array).
 *
 *  SCOPE   (input) Ptr to char
 *          Limit the scope of the operation.
 *          = 'R' :   Operation is performed by a process row.
 *          = 'C' :   Operation is performed by a process column.
 *          = 'A' :   Operation is performed by all processes in grid.
 *
 *  TOP     (input) Ptr to char
 *          Controls fashion in which messages flow within the operation.
 *
 *  M       (input) int
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) int
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (input) Ptr to double precision two dimensional array
 *          The m by n matrix A.  Fortran 77 (column-major) storage
 *          assumed.
 *
 *  LDA     (input) int
 *          The leading dimension of the array A.  LDA >= M.
 */
void
Cdgebs2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda) {
  CBLACS_INIT;
  CblacsBcast( ConTxt, scope, m, n, A, lda, -1, -1, MPI_DOUBLE );
}

/*
 *  Purpose
 *
 *  Broadcast/receive for general double precision arrays.
 *
 *  Arguments
 *
 *  ConTxt  (input) int
 *          Index into MyConTxts00 (my contexts array).
 *
 *  SCOPE   (input) Ptr to char
 *          Limit the scope of the operation.
 *          = 'R' :   Operation is performed by a process row.
 *          = 'C' :   Operation is performed by a process column.
 *          = 'A' :   Operation is performed by all processes in grid.
 *
 *  TOP     (input) Ptr to char
 *          Controls fashion in which messages flow within the operation.
 *
 *  M       (input) int
 *          The number of rows of the matrix A.  M >= 0.
 *
 *  N       (input) int
 *          The number of columns of the matrix A.  N >= 0.
 *
 *  A       (output) Ptr to double precision two dimensional array
 *          The m by n matrix A.  Fortran 77 (column-major) storage
 *          assumed.
 *
 *  LDA     (input) int
 *          The leading dimension of the array A.  LDA >= M.
 *
 *  RSRC    (input) int
 *          The process row of the source of the matrix.
 *
 *  CSRC    (input) int
 *          The process column of the source of the matrix.
 */
void
Cdgebr2d(int ConTxt, char *scope, char *top, int m, int n, double *A, int lda, int rsrc, int csrc) {
  CBLACS_INIT;
  CblacsBcast( ConTxt, scope, m, n, A, lda, rsrc, csrc, MPI_DOUBLE );
}
void
Cigebs2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda) {
  CBLACS_INIT;
  CblacsBcast( ConTxt, scope, m, n, A, lda, -1, -1, MPI_INT );
}
void
Cigebr2d(int ConTxt, char *scope, char *top, int m, int n, int *A, int lda, int rsrc, int csrc) {
  CBLACS_INIT;
  CblacsBcast( ConTxt, scope, m, n, A, lda, rsrc, csrc, MPI_INT );
}
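
/*
 * Illustrative sketch only (not part of the original source): the broadcast
 * pair above.  The process in column 0 of each row broadcasts a 2 x 2 block to
 * the rest of its row; the other processes receive it.  `ctxt' is assumed to
 * be a grid context from Cblacs_gridmap.
 */
#if 0
static void
exampleRowBroadcast(int ctxt) {
  int nprow, npcol, myrow, mycol;
  double a[4] = { 1.0, 2.0, 3.0, 4.0 }; /* 2 x 2 column-major block, lda == 2 */

  Cblacs_gridinfo( ctxt, &nprow, &npcol, &myrow, &mycol );

  if (0 == mycol)
    Cdgebs2d( ctxt, "R", " ", 2, 2, a, 2 );           /* broadcast/send within my row */
  else
    Cdgebr2d( ctxt, "R", " ", 2, 2, a, 2, myrow, 0 ); /* broadcast/receive from column 0 */
}
#endif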