"Fossies" - the Fresh Open Source Software Archive

Member "hpcc-1.5.0/STREAM/stream.c" (18 Mar 2016, 24548 Bytes) of package /linux/privat/hpcc-1.5.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "stream.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.5.0b_vs_1.5.0.

/*-----------------------------------------------------------------------*/
/* Program: STREAM                                                       */
/* Revision: $Id: stream_mpi.c,v 1.7 2014/10/22 00:13:21 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2013: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*           "tuned STREAM benchmark results"                            */
/*           "based on a variant of the STREAM benchmark code"           */
/*         Other comparable, clear, and reasonable labelling is          */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/

#include <hpcc.h>

#include <float.h>
#include <limits.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#define TUNED 1
#define VERBOSE 1

/* INSTRUCTIONS:
 *
 * 1) STREAM requires different amounts of memory to run on different
 *           systems, depending on both the system cache size(s) and the
 *           granularity of the system timer.
 *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
 *           to meet *both* of the following criteria:
 *       (a) Each array must be at least 4 times the size of the
 *           available cache memory. I don't worry about the difference
 *           between 10^6 and 2^20, so in practice the minimum array size
 *           is about 3.8 times the cache size.
 *           Example 1: One Xeon E3 with 8 MB L3 cache
 *               STREAM_ARRAY_SIZE should be >= 4 million, giving
 *               an array size of 30.5 MB and a total memory requirement
 *               of 91.5 MB.
 *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
 *               STREAM_ARRAY_SIZE should be >= 20 million, giving
 *               an array size of 153 MB and a total memory requirement
 *               of 458 MB.
 *       (b) The size should be large enough so that the 'timing calibration'
 *           output by the program is at least 20 clock-ticks.
 *           Example: most versions of Windows have a 10 millisecond timer
 *               granularity.  20 "ticks" at 10 ms/tick is 200 milliseconds.
 *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
 *               This means that each array must be at least 1 GB, or 128M elements.
 *
 *      Version 5.10 increases the default array size from 2 million
 *          elements to 10 million elements in response to the increasing
 *          size of L3 caches.  The new default size is large enough for caches
 *          up to 20 MB.
 *      Version 5.10 changes the loop index variables from "register int"
 *          to "ssize_t", which allows array indices >2^32 (4 billion)
 *          on properly configured 64-bit systems.  Additional compiler options
 *          (such as "-mcmodel=medium") may be required for large memory runs.
 *
 *      Array size can be set at compile time without modifying the source
 *          code for the (many) compilers that support preprocessor definitions
 *          on the compile line.  E.g.,
 *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
 *          will override the default size of 10M with a new size of 100M elements
 *          per array.
 */
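
/*
 * A quick sizing sketch (an illustrative aside, not original commentary):
 * rule (1a) above reduces to "elements per array >= 4 * total_cache_bytes / sizeof(double)".
 * For Example 1 (8 MB L3): 4 * 8e6 / 8 = 4 million elements; for Example 2
 * (2 x 20 MB L3): 4 * 40e6 / 8 = 20 million elements, matching the values
 * quoted above.  In this HPCC port the per-rank array size is chosen at run
 * time by HPCC_LocalVectorSize() rather than by STREAM_ARRAY_SIZE, but the
 * same sizing guidance applies to the memory available to each MPI rank.
 */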

/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
 *         for any iteration after the first, therefore the minimum value
 *         for NTIMES is 2.
 *      There are no rules on maximum allowable values for NTIMES, but
 *         values larger than the default are unlikely to noticeably
 *         increase the reported performance.
 *      NTIMES can also be set on the compile line without changing the source
 *         code using, for example, "-DNTIMES=7".
 */

static int array_elements;
# define N 2000000
# define NTIMES 10

/*
// Make the scalar coefficient modifiable at compile time.
// The old value of 3.0 caused floating-point overflows after a relatively small
// number of iterations.  The new default of 0.42 allows over 2000 iterations for
// 32-bit IEEE arithmetic and over 18000 iterations for 64-bit IEEE arithmetic.
// The growth in the solution can be eliminated (almost) completely by setting
// the scalar value to 0.41421445, but this also means that the error checking
// code no longer triggers an error if the code does not actually execute the
// correct number of iterations!
*/
#ifndef SCALAR
#define SCALAR 0.42
#endif
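
/*
 * Worked check of the claim above (an illustrative aside, not original
 * commentary): one pass of the four kernels maps a -> (2*SCALAR + SCALAR^2)*a.
 * With SCALAR = 0.42 the per-iteration growth factor is 0.84 + 0.1764 = 1.0164,
 * so the values grow slowly enough for thousands of iterations to stay in IEEE
 * range; with SCALAR = 0.41421445 (approximately sqrt(2)-1) the factor is
 * essentially 1.0, which is why that choice defeats the iteration-count check
 * mentioned above.
 */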

/*
// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------
// The OFFSET preprocessor variable is not used in this version of the benchmark.
// The user must change the code at or after the "posix_memalign" array allocations
//    to change the relative alignment of the pointers.
// ----------------------- !!! NOTE CHANGE IN DEFINITION !!! ------------------
*/
# define OFFSET 0

/*
 * 3) Compile the code with optimization.  Many compilers generate
 *    unreasonably bad code before the optimizer tightens things up.
 *    If the results are unreasonably good, on the other hand, the
 *    optimizer might be too smart for me!
 *
 *    For a simple single-core version, try compiling with:
 *            cc -O stream.c -o stream
 *    This is known to work on many, many systems....
 *
 *    To use multiple cores, you need to tell the compiler to obey the OpenMP
 *    directives in the code.  This varies by compiler, but a common example is
 *            gcc -O -fopenmp stream.c -o stream_omp
 *    The environment variable OMP_NUM_THREADS allows runtime control of the
 *    number of threads/cores used when the resulting "stream_omp" program
 *    is executed.
 *
 *     To run with single-precision variables and arithmetic, simply add
 *         -DSTREAM_TYPE=float
 *     to the compile line.
 *     Note that this changes the minimum array sizes required --- see (1) above.
 *
 *     The preprocessor directive "TUNED" does not do much -- it simply causes the
 *     code to call separate functions to execute each kernel.  Trivial versions
 *     of these functions are provided, but they are *not* tuned -- they just
 *     provide predefined interfaces to be replaced with tuned code.
 *
 *
 * 4) Optional: Mail the results to mccalpin@cs.virginia.edu
 *    Be sure to include info that will help me understand:
 *  a) the computer hardware configuration (e.g., processor model, memory type)
 *  b) the compiler name/version and compilation flags
 *  c) any run-time information (such as OMP_NUM_THREADS)
 *  d) all of the output from the test case.
 *
 * Thanks!
 *
 *-----------------------------------------------------------------------*/
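
/*
 * Illustrative build/run sketch (assumptions: a gcc-style compiler and a
 * POSIX shell; the flags and names below are examples, not requirements).
 * Combining the options described in (1)-(3) above might look like:
 *
 *     gcc -O -fopenmp -DNTIMES=20 stream.c -o stream_omp
 *     OMP_NUM_THREADS=8 ./stream_omp
 *
 * In the HPCC harness this file is instead built into the hpcc binary and
 * run under MPI, so the lines above apply only to the standalone stream.c.
 */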

# define HLINE "-------------------------------------------------------------\n"

/* Some compilers require an extra keyword to recognize the "restrict" qualifier. */
static double * restrict a, * restrict b, * restrict c;

static double avgtime[4] = {0}, maxtime[4] = {0},
  mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static char *label[4] = {"Copy:      ", "Scale:     ",
    "Add:       ", "Triad:     "};

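/* Bytes moved per array element by each kernel: Copy and Scale read one word
   and write one word (2 * sizeof(double)); Add and Triad read two and write
   one (3 * sizeof(double)).  These values are later multiplied by
   array_elements to obtain the per-rank traffic. */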
static double bytes[4] = {
    2 * sizeof(double),
    2 * sizeof(double),
    3 * sizeof(double),
    3 * sizeof(double)
    };

#ifdef TUNED
extern void tuned_STREAM_Copy(void);
extern void tuned_STREAM_Scale(double scalar);
extern void tuned_STREAM_Add(void);
extern void tuned_STREAM_Triad(double scalar);
#endif

static void
checkSTREAMresults(FILE *outFile, int doIO, double *AvgErrByRank, int numranks, int *failure) {
  double aj,bj,cj,scalar;
  double aSumErr,bSumErr,cSumErr;
  double aAvgErr,bAvgErr,cAvgErr;
  double epsilon;
  int j, k, ierr, err;

  /* Repeat the computation of aj, bj, cj */
  /* reproduce initialization */
  aj = 1.0;
  bj = 2.0;
  cj = 0.0;
  /* a[] is modified during timing check */
  aj = 2.0E0 * aj;
  /* now execute timing loop */
  scalar = SCALAR;
  for (k=0; k<NTIMES; k++) {
    cj = aj;
    bj = scalar*cj;
    cj = aj+bj;
    aj = bj+scalar*cj;
  }

  /* Compute the average of the average errors contributed by each MPI rank */
  aSumErr = 0.0;
  bSumErr = 0.0;
  cSumErr = 0.0;
  for (k=0; k<numranks; k++) {
          aSumErr += AvgErrByRank[3*k + 0];
          bSumErr += AvgErrByRank[3*k + 1];
          cSumErr += AvgErrByRank[3*k + 2];
  }
  aAvgErr = aSumErr / (double) numranks;
  bAvgErr = bSumErr / (double) numranks;
  cAvgErr = cSumErr / (double) numranks;

  if (sizeof(double) == 4) {
          epsilon = 1.e-6;
  }
  else if (sizeof(double) == 8) {
          epsilon = 1.e-13;
  }
  else if (sizeof(double) == 10) {
          epsilon = 1.e-23;
  }
  else {
          if (doIO) fprintf( outFile, "WEIRD: sizeof(double) = %lu\n",sizeof(double));
          epsilon = 1.e-6;
  }

  *failure = 1;

  err = 0;

  if (fabs(aAvgErr/aj) > epsilon) {
    err++;
    if (doIO) {
      fprintf( outFile, "Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
      fprintf( outFile, "     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,fabs(aAvgErr)/aj);
    }
    ierr = 0;
    for (j=0; j<array_elements; j++) {
      if (fabs(a[j]/aj-1.0) > epsilon) {
        ierr++;
      }
    }
    if (ierr > 0)
      if (doIO)
        fprintf( outFile, "     For array a[], %d errors were found.\n",ierr);
  }
  if (fabs(bAvgErr/bj) > epsilon) {
    err++;
    if (doIO) {
      fprintf( outFile, "Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
      fprintf( outFile, "     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,fabs(bAvgErr)/bj);
      fprintf( outFile, "     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
    }
    ierr = 0;
    for (j=0; j<array_elements; j++) {
      if (fabs(b[j]/bj-1.0) > epsilon) {
        ierr++;
      }
    }
    if (ierr > 0)
      if (doIO)
        fprintf( outFile, "     For array b[], %d errors were found.\n",ierr);
  }
  if (fabs(cAvgErr/cj) > epsilon) {
    err++;
    if (doIO) {
      fprintf( outFile, "Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
      fprintf( outFile, "     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,fabs(cAvgErr)/cj);
      fprintf( outFile, "     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
    }
    ierr = 0;
    for (j=0; j<array_elements; j++) {
      if (fabs(c[j]/cj-1.0) > epsilon) {
        ierr++;
      }
    }
    if (ierr > 0)
      if (doIO)
        fprintf( outFile, "     For array c[], %d errors were found.\n",ierr);
  }
  if (err == 0) {
    *failure = 0;
    if (doIO)
      fprintf( outFile, "Solution Validates: avg error less than %e on all three arrays\n",epsilon);
  }
}

# define M 20

static int
checktick() {
    int  i, minDelta, Delta;
    double t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
      t1 = MPI_Wtime();
      while( ((t2=MPI_Wtime()) - t1) < 1.0E-6 )
        ;
      timesfound[i] = t1 = t2;
    }

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
      Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
      minDelta = Mmin(minDelta, Mmax(Delta,0));
    }

   return(minDelta);
}
#undef M


/*
For the MPI code I separate the computation of errors from the error
reporting output functions (which are handled by MPI rank 0).
*/
void computeSTREAMerrors(double *aAvgErr, double *bAvgErr, double *cAvgErr)
{
  double aj,bj,cj,scalar;
  double aSumErr,bSumErr,cSumErr;
  int j;
  int k;

  /* reproduce initialization */
  aj = 1.0;
  bj = 2.0;
  cj = 0.0;
  /* a[] is modified during timing check */
  aj = 2.0E0 * aj;
  /* now execute timing loop */
  scalar = SCALAR;
  for (k=0; k<NTIMES; k++)
  {
      cj = aj;
      bj = scalar*cj;
      cj = aj+bj;
      aj = bj+scalar*cj;
  }

  /* accumulate deltas between observed and expected results */
  aSumErr = 0.0;
  bSumErr = 0.0;
  cSumErr = 0.0;
  for (j=0; j<array_elements; j++) {
    aSumErr += fabs(a[j] - aj);
    bSumErr += fabs(b[j] - bj);
    cSumErr += fabs(c[j] - cj);
  }
  *aAvgErr = aSumErr / (double) array_elements;
  *bAvgErr = bSumErr / (double) array_elements;
  *cAvgErr = cSumErr / (double) array_elements;
}


int
HPCC_Stream(HPCC_Params *params, int doIO, MPI_Comm comm, int world_rank,
  double *copyGBs, double *scaleGBs, double *addGBs, double *triadGBs, int *failure) {
    int quantum,  BytesPerWord, numranks, myrank;
    int j, k;
    double  scalar, t, t0, t1, times[4][NTIMES], times_copy[4][NTIMES];
    FILE *outFile;
    double GiBs = 1024.0 * 1024.0 * 1024.0, curGBs;
    double AvgError[3] = {0.0,0.0,0.0};
    double *AvgErrByRank;


    if (doIO) {
      outFile = fopen( params->outFname, "a" );
      if (! outFile) {
        outFile = stderr;
        fprintf( outFile, "Cannot open output file.\n" );
        return 1;
      }
    }

    t0 = MPI_Wtime();

    MPI_Comm_size( comm, &numranks );
    MPI_Comm_rank( comm, &myrank );

    array_elements = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
    params->StreamVectorSize = array_elements;

    a = HPCC_XMALLOC( double, array_elements );
    b = HPCC_XMALLOC( double, array_elements );
    c = HPCC_XMALLOC( double, array_elements );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%d).\n", array_elements );
        fflush( outFile );
        fclose( outFile );
      }
      /* FIXME: must be made global */
      return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %d, Offset = %d\n" , array_elements, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) array_elements / GiBs));
    fprintf( outFile, "Each test is run %d times.\n", NTIMES );
    fprintf( outFile, " The *best* time for each kernel (excluding the first iteration)\n" );
    fprintf( outFile, " will be used to compute the reported bandwidth.\n");
    fprintf( outFile, "The SCALAR value used for this run is %f\n", SCALAR );

    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
      {
        k = omp_get_num_threads();
        if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
        params->StreamThreads = k;
      }
    }
#endif

    /* --- SETUP --- initialize arrays and estimate precision of timer --- */

#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<array_elements; j++) {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

    /* Rank 0 needs to allocate arrays to hold error data and timing data from
       all ranks for analysis and output.
       Allocate and instantiate the arrays here -- after the primary arrays
       have been instantiated -- so there is no possibility of having these
       auxiliary arrays mess up the NUMA placement of the primary arrays. */

    /* There are 3 average error values for each rank (using double). */
    AvgErrByRank = HPCC_XMALLOC( double, 3 * numranks );

    /* There are 4*NTIMES timing values for each rank (always doubles) */
    if (AvgErrByRank == NULL) {
      if (doIO)
        fprintf( outFile, "Ooops -- allocation of arrays to collect timing data on MPI rank %d failed\n", world_rank);
      MPI_Abort(comm, 3); /* FIXME: handle failure more gracefully */
    }

    /* FIXME: replace with loop to use floating-point data */
    memset(AvgErrByRank,0,3*sizeof(double)*numranks);

    if (doIO) fprintf( outFile, HLINE);

    if  ( (quantum = checktick()) >= 1) {
      if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                         "%d microseconds.\n", quantum);
    } else {
      if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                         "less than one microsecond.\n");
    }

    /* Get initial timing estimate to compare to timer granularity.
       All ranks need to run this code since it changes the values in array `a' */
    t = MPI_Wtime();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < array_elements; j++)
      a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (MPI_Wtime() - t);

    if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t  );
    fprintf( outFile, "   (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);

    t1 = MPI_Wtime();
    fprintf( outFile, "VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
    fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* This code has more barriers and timing calls than are actually needed, but
       this should not cause a problem for arrays that are large enough to satisfy
       the STREAM run rules. */

    scalar = SCALAR;
    for (k=0; k<NTIMES; k++) {
        /* kernel 1: Copy */
        MPI_Barrier( comm );
        times[0][k] = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<array_elements; j++)
          c[j] = a[j];
#endif
        MPI_Barrier( comm );
        times[0][k] = MPI_Wtime() - times[0][k];

        /* kernel 2: Scale */
        MPI_Barrier( comm );
        times[1][k] = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<array_elements; j++)
          b[j] = scalar*c[j];
#endif
        MPI_Barrier( comm );
        times[1][k] = MPI_Wtime() - times[1][k];

        /* kernel 3: Add */
        MPI_Barrier( comm );
        times[2][k] = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<array_elements; j++)
          c[j] = a[j]+b[j];
#endif
        MPI_Barrier( comm );
        times[2][k] = MPI_Wtime() - times[2][k];

        /* kernel 4: Triad */
        MPI_Barrier( comm );
        times[3][k] = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<array_elements; j++)
          a[j] = b[j]+scalar*c[j];
#endif
        MPI_Barrier( comm );
        times[3][k] = MPI_Wtime() - times[3][k];
    }

    t0 = MPI_Wtime();

    /* --- SUMMARY --- */

    /* Because of the MPI_Barrier() calls, the timings from any thread are equally valid.
       The best estimate of the maximum performance is the minimum of the "outside the barrier"
       timings across all the MPI ranks. */

    memcpy(times_copy, times, sizeof times_copy );

    /* for each iteration and each kernel, collect the minimum time across all MPI ranks */
    MPI_Allreduce( times_copy, times, 4*NTIMES, MPI_DOUBLE, MPI_MIN, comm );

    /* Back to the original code, but now using the minimum global timing across all ranks */
    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
      for (j=0; j<4; j++)
      {
        avgtime[j] = avgtime[j] + times[j][k];
        mintime[j] = Mmin(mintime[j], times[j][k]);
        maxtime[j] = Mmax(maxtime[j], times[j][k]);
      }
    }

    if (doIO)
      fprintf( outFile, "Function      Rate (GB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
      avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

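      /* Bandwidth is computed from the best (minimum) time: bytes[j] gives
         bytes moved per element for kernel j, so bytes[j] * array_elements
         is the per-rank traffic in bytes; the 1e-9 factor converts to GB/s. */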
      /* make sure no division by zero */
      curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
      curGBs *= 1e-9 * bytes[j] * array_elements;
      if (doIO)
        fprintf( outFile, "%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
                 curGBs,
                 avgtime[j],
                 mintime[j],
                 maxtime[j]);
      switch (j) {
        case 0: *copyGBs = curGBs; break;
        case 1: *scaleGBs = curGBs; break;
        case 2: *addGBs = curGBs; break;
        case 3: *triadGBs = curGBs; break;
      }
    }
    if (doIO)
      fprintf( outFile, HLINE);

    /* --- Every Rank Checks its Results --- */
    computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);
    /* --- Collect the Average Errors for Each Array on Rank 0 --- */
    MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, comm);

    /* -- Combine averaged errors and report on Rank 0 only --- */
    if (myrank == 0) {
      checkSTREAMresults( outFile, doIO, AvgErrByRank, numranks, failure );
      if (doIO) fprintf( outFile, HLINE);
    }

    HPCC_free(AvgErrByRank);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
      fflush( outFile );
      fclose( outFile );
    }

    return 0;
}

void tuned_STREAM_Copy()
{
  int j;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j=0; j<array_elements; j++)
    c[j] = a[j];
}

void tuned_STREAM_Scale(double scalar)
{
  int j;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j=0; j<array_elements; j++)
    b[j] = scalar*c[j];
}

void tuned_STREAM_Add()
{
  int j;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j=0; j<array_elements; j++)
    c[j] = a[j]+b[j];
}

void tuned_STREAM_Triad(double scalar)
{
  int j;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (j=0; j<array_elements; j++)
    a[j] = b[j]+scalar*c[j];
}
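
/*
 * Illustrative sketch only (not part of HPCC or the STREAM distribution):
 * the trivial TUNED hooks above are intended to be replaced with
 * platform-specific code.  The hypothetical variant below shows the kind of
 * change a tuned Triad might make -- here simply an explicit static OpenMP
 * schedule -- while operating on the same global arrays and array_elements
 * count.  Any real tuning must still follow the STREAM Run Rules if the
 * results are to be labelled as STREAM results.
 */
void example_tuned_STREAM_Triad(double scalar)
{
  int j;
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (j=0; j<array_elements; j++)
    a[j] = b[j]+scalar*c[j];
}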