"Fossies" - the Fresh Open Source Software Archive

Member "openmpi-4.0.4/ompi/mca/io/romio321/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c" (10 Jun 2020, 24748 Bytes) of package /linux/misc/openmpi-4.0.4.tar.bz2:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "ad_bg_aggrs.c" see the Fossies "Dox" file reference documentation.

    1 /* ---------------------------------------------------------------- */
    2 /* (C)Copyright IBM Corp.  2007, 2008                               */
    3 /* ---------------------------------------------------------------- */
    4 /**
    5  * \file ad_bg_aggrs.c
    6  * \brief The externally used function from this file is declared in ad_bg_aggrs.h
    7  */
    8 
    9 /* -*- Mode: C; c-basic-offset:4 ; -*- */
   10 /* 
   11  *   Copyright (C) 1997-2001 University of Chicago. 
   12  *   See COPYRIGHT notice in top-level directory.
   13  */
   14 
   15 /*#define TRACE_ON */
   16 
   17 // Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
   18 // #define balancecontigtrace 1
   19 // #define bridgeringaggtrace 1
   20 
   21 #include "adio.h"
   22 #include "adio_cb_config_list.h"
   23 #include "../ad_gpfs.h"
   24 #include "ad_bg_pset.h"
   25 #include "ad_bg_aggrs.h"
   26 #ifdef AGGREGATION_PROFILE
   27 #include "mpe.h"
   28 #endif
   29 
   30 
   31 #ifdef USE_DBG_LOGGING
   32   #define AGG_DEBUG 1
   33 #endif
   34 
   35 #ifndef TRACE_ERR
   36 #  define TRACE_ERR(format...)
   37 #endif
   38 
   39 /* Comments copied from common:
   40  * This file contains four functions:
   41  *
   42  * ADIOI_Calc_aggregator()
   43  * ADIOI_Calc_file_domains()
   44  * ADIOI_Calc_my_req()
   45  * ADIOI_Calc_others_req()
   46  *
   47  * The last three of these were originally in ad_read_coll.c, but they are
   48  * also shared with ad_write_coll.c.  I felt that they were better kept with
   49  * the rest of the shared aggregation code.  
   50  */
   51 
   52 /* Discussion of values available from above:
   53  *
   54  * ADIO_Offset st_offsets[0..nprocs-1]
   55  * ADIO_Offset end_offsets[0..nprocs-1]
   56  *    These contain a list of start and end offsets for each process in 
   57  *    the communicator.  For example, an access at loc 10, size 10 would
   58  *    have a start offset of 10 and end offset of 19.
   59  * int nprocs
   60  *    number of processors in the collective I/O communicator
   61  * ADIO_Offset min_st_offset
   62  * ADIO_Offset fd_start[0..nprocs_for_coll-1]
   63  *    starting location of "file domain"; region that a given process will
   64  *    perform aggregation for (i.e. actually do I/O)
   65  * ADIO_Offset fd_end[0..nprocs_for_coll-1]
   66  *    start + size - 1 roughly, but it can be less, or 0, in the case of 
   67  *    uneven distributions
   68  */
   69 
   70 /* forward declaration */
   71 static void 
   72 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd, 
   73                     const ADIOI_BG_ConfInfo_t *confInfo, 
   74                     ADIOI_BG_ProcInfo_t *all_procInfo);
   75 
   76 /*
   77  * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
   78  * The parameters are 
   79  *  . the number of aggregators (proxies) : fd->hints->cb_nodes
   80  *  . the ranks of the aggregators :        fd->hints->ranklist
   81  * By compute these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of 
   82  *  ADIO can work more efficiently.
   83  */
   84 int 
   85 ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset) 
   86 {
   87     int r, s;
   88     ADIOI_BG_ProcInfo_t  *procInfo, *all_procInfo;
   89     ADIOI_BG_ConfInfo_t  *confInfo;
   90     TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
   91 
   92     MPI_Comm_size( fd->comm, &s );
   93     MPI_Comm_rank( fd->comm, &r );
   94 
   95   /* Collect individual BG personality information */
   96     confInfo = ADIOI_BG_ConfInfo_new ();
   97     procInfo = ADIOI_BG_ProcInfo_new ();
   98     ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
   99 
  100   /* Gather BG personality infomation onto process 0 */
  101     /* if (r == 0) */
  102     all_procInfo  = ADIOI_BG_ProcInfo_new_n  (s);
  103 
  104     MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE, 
  105         (void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE, 
  106         0, 
  107         fd->comm );
  108 
  109   /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
  110     if (r == 0) { 
  111     ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
  112     /* ADIOI_BG_ProcInfo_free (all_procInfo);*/
  113     }
  114     ADIOI_BG_ProcInfo_free (all_procInfo);
  115 
  116   /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
  117      Declared in adio_cb_config_list.h */
  118     ADIOI_cb_bcast_rank_map(fd);
  119     if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
  120                     bridgelistnum and numbridges to all
  121                     ranks */
  122     if (r != 0) {
  123         fd->hints->fs_hints.bg.bridgelist =
  124         ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
  125         if (fd->hints->fs_hints.bg.bridgelist == NULL) {
  126         /* NEED TO HANDLE ENOMEM */
  127         }
  128     }
  129     MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
  130         fd->comm);
  131 
  132     if (r != 0) {
  133         fd->hints->fs_hints.bg.bridgelistnum =
  134         ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
  135         if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
  136         /* NEED TO HANDLE ENOMEM */
  137         }
  138     }
  139     MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
  140         MPI_INT, 0, fd->comm);
  141 
  142     MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
  143         fd->comm);
  144 
  145     }
  146 
  147 
  148     ADIOI_BG_persInfo_free( confInfo, procInfo );
  149     TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
  150     return 0;
  151 }
  152 
  153 
  154 /* There are some number of bridge nodes (randomly) distributed through the job
  155  * We need to split the nodes among the bridge nodes */
  156 /* Maybe find which bridge node is closer (manhattan distance) and try to
  157  * distribute evenly.
  158  */
  159 /* 
  160  * Picks IO aggregators based on the underlying PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
  161  * The first order of tmp_ranklist is : PSET number
  162  * The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
  163  */
  /* Pairs an MPI rank with the rank of the bridge node it is associated with.
   * Arrays of these are ordered by `bridge` using intsort(). */
  typedef struct
  {
     int rank;
     int bridge;
  } sortstruct;

  /* Running count of how many aggregators have been picked for one bridge. */
  typedef struct
  {
     int bridgeRank;
     int numAggsAssigned;
  } bridgeAggAssignment;

  /*
   * qsort() comparator ordering sortstruct entries by ascending `bridge`.
   *
   * Returns a negative, zero or positive value.  The result is formed from two
   * comparisons instead of a subtraction so that it cannot overflow for
   * operands that are far apart (e.g. INT_MAX and a negative value).
   */
  static int intsort(const void *p1, const void *p2)
  {
     const sortstruct *i1 = p1;
     const sortstruct *i2 = p2;
     return (i1->bridge > i2->bridge) - (i1->bridge < i2->bridge);
  }
  183 
  184 static int 
  185 ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo, 
  186                       ADIOI_BG_ProcInfo_t       *all_procInfo, 
  187                       int *tmp_ranklist)
  188 {
  189     TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
  190    /* BES: This should be done in the init routines probably. */
  191     int i, j;
  192     int aggTotal;
  193     int *aggList;
  194 
  195     if (gpfsmpio_bridgeringagg > 0) {
  196 
  197       int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
  198         /* the number of aggregators is (numAggs per bridgenode) */
  199       if(numAggs == 1)
  200         aggTotal = 1;
  201       else
  202         aggTotal = confInfo->numBridgeRanks * numAggs;
  203 
  204       aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
  205       if(aggTotal == 1) { /* special case when we only have one bridge node */
  206 
  207         sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
  208         for(i=0; i < confInfo->nProcs; i++)
  209         {
  210           bridgelist[i].bridge = all_procInfo[i].bridgeRank;
  211           bridgelist[i].rank = i;
  212           TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
  213         }
  214 
  215         /* This list contains rank->bridge info. Now, we need to sort this list. */
  216         qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
  217 
  218         aggList[0] = bridgelist[0].bridge;
  219         ADIOI_Free(bridgelist);
  220 
  221       }
  222       else { // aggTotal > 1
  223 
  224         int currentAggListSize = 0;
  225         int numBridgesWithAggAssignments = 0;
  226         bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
  227 
  228         int partitionSize = all_procInfo[0].numNodesInPartition;
  229         int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
  230         for (i=0;i<partitionSize;i++)
  231           nodesAssigned[i] = 0;
  232 
  233         int currentNumHops = gpfsmpio_bridgeringagg;
  234         int allAggsAssigned = 0;
  235 
  236         /* Iterate thru the process infos and select aggregators starting at currentNumHops
  237            away.  Increase the currentNumHops until all bridges have numAggs assigned to them.
  238         */
  239         while (!allAggsAssigned) {
  240           /* track whether any aggs are selected durng this round */
  241           int startingCurrentAggListSize = currentAggListSize;
  242           int numIterForHopsWithNoAggs = 0;
  243           for (i=0;i<confInfo->nProcs;i++) {
  244           if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
  245             if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
  246               int foundBridge = 0;
  247               for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
  248                 if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
  249                   foundBridge = 1;
  250                   if (aggAssignments[j].numAggsAssigned < numAggs) {
  251                     aggAssignments[j].numAggsAssigned++;
  252                     nodesAssigned[all_procInfo[i].nodeRank] = 1;
  253                     aggList[currentAggListSize] = all_procInfo[i].rank;
  254                     currentAggListSize++;
  255 #ifdef bridgeringaggtrace
  256                 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
  257 #endif
  258                   }
  259                 }
  260               }
  261               if (!foundBridge) {
  262                 aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
  263                 aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
  264                 numBridgesWithAggAssignments++;
  265                 nodesAssigned[all_procInfo[i].nodeRank] = 1;
  266                 aggList[currentAggListSize] = all_procInfo[i].rank;
  267                 currentAggListSize++;
  268 #ifdef bridgeringaggtrace
  269                 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
  270 #endif
  271               }
  272             }
  273           }
  274         }
  275 
  276         if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
  277           allAggsAssigned = 1;
  278           for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
  279             if (aggAssignments[i].numAggsAssigned < numAggs)
  280               allAggsAssigned = 0;
  281           }
  282         }
  283         currentNumHops++;
  284         /* Handle the case where the numAggs is more than exists starting
  285          * at gpfsmpio_bridgeringagg hops, wrap back and restart at 0 to
  286          * assign the overrun - it is up to the user to realize this
  287          * situation and adjust numAggs and gpfsmpio_bridgeringagg
  288          * accordingly.
  289          */
  290         if (currentNumHops > 16)
  291           currentNumHops = 0;
  292         /* If 3 rounds go by without selecting an agg abort to avoid
  293            infinite loop.
  294         */
  295         if (startingCurrentAggListSize == currentAggListSize)
  296           numIterForHopsWithNoAggs++;
  297         else
  298           numIterForHopsWithNoAggs = 0;
  299         ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
  300         }
  301 
  302         ADIOI_Free(aggAssignments);
  303         ADIOI_Free(nodesAssigned);
  304 
  305       } // else aggTotal  > 1
  306 
  307        memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
  308     } // gpfsmpio_bridgeringagg > 0
  309 
  310     else { // gpfsmpio_bridgeringagg unset - default code
  311 
  312     int distance, numAggs;
  313 
  314     /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
  315      * bridge node */
  316 
  317    sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
  318    for(i=0; i < confInfo->nProcs; i++)
  319    {
  320       bridgelist[i].bridge = all_procInfo[i].bridgeRank;
  321       bridgelist[i].rank = i;
  322       TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
  323    }
  324    
  325    /* This list contains rank->bridge info. Now, we need to sort this list. */
  326    qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
  327 
  328    /* In this array, we can pick an appropriate number of midpoints based on
  329     * our bridgenode index and the number of aggregators */
  330 
  331    numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
  332    if(numAggs == 1)
  333       aggTotal = 1;
  334    else
  335    /* the number of aggregators is (numAggs per bridgenode) plus each 
  336     * bridge node is an aggregator */
  337       aggTotal = confInfo->numBridgeRanks * (numAggs+1);
  338 
  339    if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
  340 
  341    TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks,  confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
  342    aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
  343 
  344 
  345    /* For each bridge node, determine who the aggregators will be */
  346    /* basically, the n*distance and bridge node */
  347    if(aggTotal == 1) /* special case when we only have one bridge node */
  348       aggList[0] = bridgelist[0].bridge;
  349    else
  350    {
  351      int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
  352      int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
  353      int psetSize = 0;
  354      int procIndex;
  355      for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
  356      {
  357        TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex,  bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
  358        if(lastBridge == bridgelist[procIndex].bridge)
  359        {
  360          psetSize++;
  361          if(procIndex) continue; 
  362          else procIndex--;/* procIndex == 0 */
  363        }
  364        /* Sets up a list of nodes which will act as aggregators. numAggs
  365         * per bridge node total. The list of aggregators is
  366         * bridgeNode 0
  367         * bridgeNode 1
  368         * bridgeNode ...
  369         * bridgeNode N
  370         * bridgeNode[0]aggr[0]
  371         * bridgeNode[0]aggr[1]...
  372         * bridgeNode[0]aggr[N]...
  373         * ...
  374         * bridgeNode[N]aggr[0]..
  375         * bridgeNode[N]aggr[N]
  376         */
  377        aggList[nextBridge]=lastBridge;
  378        distance = psetSize/numAggs;
  379        TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
  380        if(numAggs>1)
  381        {
  382          for(j = 0; j < numAggs; j++)
  383          {
  384            ADIOI_Assert(nextAggr<aggTotal);
  385            aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
  386            TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
  387            if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
  388            {  
  389              aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
  390              TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
  391            }
  392            nextAggr++;
  393          }
  394        }
  395        if(procIndex<0) break;
  396        lastBridge = bridgelist[procIndex].bridge;
  397        psetSize = 1;
  398        nextBridge++;
  399      }
  400    }
  401 
  402    TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
  403    memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
  404    for(i=0;i<aggTotal;i++)
  405    {
  406       TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
  407    }
  408 
  409 
  410    ADIOI_Free (bridgelist);
  411 
  412    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
  413    }
  414 
  415    ADIOI_Free (aggList);
  416    return aggTotal;
  417 
  418 }
  419 
  420 /* 
  421  * compute aggregators ranklist and put it into fd->hints struct
  422  */ 
  423 static void 
  424 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd, 
  425                     const ADIOI_BG_ConfInfo_t *confInfo, 
  426                     ADIOI_BG_ProcInfo_t *all_procInfo)
  427 {
  428     TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
  429     int i; 
  430     int naggs; 
  431     int size;
  432     int *tmp_ranklist;
  433 
  434   /* compute the ranklist of IO aggregators and put into tmp_ranklist */
  435     tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
  436 
  437 #   if AGG_DEBUG
  438     for (i=0; i<confInfo->nProcs; i++) {
  439       DBG_FPRINTF(stderr, "\trank = %6d\n", all_procInfo[i].rank );
  440     }
  441 #   endif
  442 
  443     naggs= 
  444     ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
  445 
  446 #   define VERIFY 1
  447 #   if VERIFY
  448     DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, ratio: %.4f; naggs = %d\n",
  449         confInfo->ioMinSize        ,
  450         confInfo->ioMaxSize        ,
  451         confInfo->nAggrs           ,
  452         confInfo->numBridgeRanks ,
  453         confInfo->nProcs          ,
  454         confInfo->ioMaxSize /*virtualPsetSize*/          ,
  455         confInfo->aggRatio        ,
  456         naggs );
  457 #   endif
  458     MPI_Comm_size( fd->comm, &size );
  459     /* This fix is for when the bridgenode rnk is not part of the particular
  460      * subcomm associated with this MPI File operation. I don't know if
  461      * this is the best/right answer but it passes the test cases at least.
  462      * I don't know how common file IO in subcomms is anyway... */
  463     for(i=0;i<naggs;i++)
  464     {
  465       if(tmp_ranklist[i] > size)
  466       {
  467          TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
  468                i, tmp_ranklist[i], fd->comm);
  469          tmp_ranklist[i] = 0;
  470       }
  471    }
  472          
  473 #   if AGG_DEBUG
  474     for (i=0; i<naggs; i++) {
  475       DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
  476     }
  477 #   endif
  478     if (gpfsmpio_balancecontig == 1) {
  479     /* what comes out of this code block is the agg ranklist sorted by
  480      * bridge set and ion id with associated bridge info stored in the
  481      * hints structure for later access during file domain assignment */
  482 
  483     // sort the agg ranklist by ions and bridges
  484 
  485     int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
  486     /* list of all bridge ranks */
  487     int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
  488 
  489     /* each entry here is the number of aggregators associated with the
  490      * bridge rank of the same index in bridgelist */
  491     int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
  492     /* list of all ion IDs corresponding with bridgelist entries of same index */
  493     int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
  494 
  495     int numbridges = 0;
  496 
  497     for (i=0;i<naggs;i++)
  498         bridgelistnum[i] = 0;
  499 
  500     /* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
  501      * agg rank on that ion. */
  502     int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
  503     for (i=0;i<naggs;i++)
  504         summarybridgeminionaggrank[i] = -1;
  505 
  506     /* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
  507      * entry and find the associated bridge list index - at the end we will
  508      * know how many aggs belong to each bridge in each ion */
  509     for (i=0;i<naggs;i++) {
  510         int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
  511         int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
  512         int foundrank = 0;
  513         int summaryranklistbridgeindex = 0;
  514         int j;
  515         for (j=0;(j<numbridges && !foundrank);j++) {
  516         if (bridgelist[j] == aggbridgerank) {
  517             foundrank = 1;
  518             summaryranklistbridgeindex = j;
  519         }
  520         else
  521             summaryranklistbridgeindex++;
  522         }
  523         if (!foundrank) {
  524         bridgelist[summaryranklistbridgeindex] = aggbridgerank;
  525         ionlist[summaryranklistbridgeindex] = aggionid;
  526 
  527         if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
  528             summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
  529         else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
  530             summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
  531         numbridges++;
  532         }
  533 
  534         bridgelistnum[summaryranklistbridgeindex]++;
  535     }
  536 
  537     /* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
  538      * need to make each entry the minimum bridge rank for the entire ion. */
  539     for (i=0;i<numbridges;i++) {
  540         int aggIonId = ionlist[i];
  541         int j;
  542         for (j=0;j<numbridges;j++) {
  543           if (ionlist[j] == aggIonId) {
  544             if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
  545               summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
  546           }
  547         }
  548     }
  549 
  550     // resort by io node minimum bridge rank
  551     int x;
  552     for (x=0;x<numbridges;x++) {
  553         for (i=0;i<(numbridges-1);i++) {
  554         if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
  555             int tmpminionaggrank = summarybridgeminionaggrank[i];
  556             summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
  557             summarybridgeminionaggrank[i+1] = tmpminionaggrank;
  558             int tmpionid = ionlist[i];
  559             ionlist[i] = ionlist[i+1];
  560             ionlist[i+1] = tmpionid;
  561             int tmpbridgerank = bridgelist[i];
  562             bridgelist[i] = bridgelist[i+1];
  563             bridgelist[i+1] = tmpbridgerank;
  564             int tmpbridgeranknum = bridgelistnum[i];
  565             bridgelistnum[i] = bridgelistnum[i+1];
  566             bridgelistnum[i+1] = tmpbridgeranknum;
  567           }
  568         }
  569     }
  570 
  571     // for each io node make sure bridgelist is in rank order
  572     int startSortIndex = -1;
  573     int endSortIndex = -1;
  574     int currentBridgeIndex = 0;
  575 
  576     while (currentBridgeIndex < numbridges) {
  577         int currentIonId = ionlist[currentBridgeIndex];
  578         startSortIndex = currentBridgeIndex;
  579         while (ionlist[currentBridgeIndex] == currentIonId)
  580           currentBridgeIndex++;
  581         endSortIndex = currentBridgeIndex-1;
  582         for (x=startSortIndex;x<=endSortIndex;x++) {
  583           for (i=startSortIndex;i<endSortIndex;i++) {
  584             if (bridgelist[i] > bridgelist[i+1]) {
  585               int tmpbridgerank = bridgelist[i];
  586               bridgelist[i] = bridgelist[i+1];
  587               bridgelist[i+1] = tmpbridgerank;
  588               int tmpbridgeranknum = bridgelistnum[i];
  589               bridgelistnum[i] = bridgelistnum[i+1];
  590               bridgelistnum[i+1] = tmpbridgeranknum;
  591             }
  592           }
  593         }
  594     }
  595 
  596 
  597     /* populate interleavedbridgeranklist - essentially the agg rank list
  598      * is now sorted by the ion minimum bridge rank and bridge node */
  599     int currentrankoffset = 0;
  600     for (i=0;i<numbridges;i++) {
  601         int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
  602         int numAggsForThisBridge = 0;
  603 
  604         int k;
  605         for (k=0;k<naggs;k++) {
  606         int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
  607         if (aggbridgerank == bridgelist[i]) {
  608             thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
  609             numAggsForThisBridge++;
  610         }
  611         }
  612 
  613         // sort thisBridgeAggList
  614         for (x=0;x<numAggsForThisBridge;x++) {
  615         int n;
  616         for (n=0;n<(numAggsForThisBridge-1);n++) {
  617             if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
  618             int tmpthisBridgeAggList = thisBridgeAggList[n];
  619             thisBridgeAggList[n] = thisBridgeAggList[n+1];
  620             thisBridgeAggList[n+1] = tmpthisBridgeAggList;
  621             }
  622         }
  623         }
  624         int n;
  625         for (n=0;n<numAggsForThisBridge;n++) {
  626         interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
  627         currentrankoffset++;
  628         }
  629         ADIOI_Free(thisBridgeAggList);
  630     }
  631 
  632 #ifdef balancecontigtrace
  633     fprintf(stderr,"Interleaved aggregator list:\n");
  634     for (i=0;i<naggs;i++) {
  635         fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
  636     }
  637     fprintf(stderr,"Bridges list:\n");
  638     for (i=0;i<numbridges;i++) {
  639         fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
  640     }
  641 
  642 #endif
  643     /* copy the ranklist of IO aggregators to fd->hints */
  644     if(fd->hints->ranklist != NULL)
  645         ADIOI_Free (fd->hints->ranklist);
  646     if(fd->hints->fs_hints.bg.bridgelist != NULL)
  647         ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
  648     if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
  649         ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
  650 
  651     fd->hints->cb_nodes = naggs;
  652     fd->hints->fs_hints.bg.numbridges = numbridges;
  653     fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
  654     memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
  655 
  656     fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
  657     memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
  658 
  659     fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
  660     memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
  661 
  662     ADIOI_Free(summarybridgeminionaggrank);
  663     ADIOI_Free( tmp_ranklist );
  664     ADIOI_Free( bridgelistnum );
  665     ADIOI_Free( bridgelist );
  666     ADIOI_Free( interleavedbridgeranklist );
  667     ADIOI_Free(ionlist);
  668 
  669     }  else {
  670     /* classic topology-agnostic copy of the ranklist of IO aggregators to
  671      * fd->hints */
  672     if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
  673 
  674     fd->hints->cb_nodes = naggs;
  675     fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
  676     memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
  677 
  678     ADIOI_Free( tmp_ranklist );
  679     }
  680     TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
  681     return;
  682 }