"Fossies" - the Fresh Open Source Software Archive

Member "stress-ng-0.09.56/stress-cpu.c" (15 Mar 2019, 58429 Bytes) of package /linux/privat/stress-ng-0.09.56.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "stress-cpu.c", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 0.09.54_vs_0.09.55.

    1 /*
    2  * Copyright (C) 2013-2019 Canonical, Ltd.
    3  *
    4  * This program is free software; you can redistribute it and/or
    5  * modify it under the terms of the GNU General Public License
    6  * as published by the Free Software Foundation; either version 2
    7  * of the License, or (at your option) any later version.
    8  *
    9  * This program is distributed in the hope that it will be useful,
   10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   12  * GNU General Public License for more details.
   13  *
   14  * You should have received a copy of the GNU General Public License
   15  * along with this program; if not, write to the Free Software
   16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
   17  *
   18  * This code is a complete clean re-write of the stress tool by
   19  * Colin Ian King <colin.king@canonical.com> and attempts to be
   20  * backwardly compatible with the stress tool by Amos Waterland
   21  * <apw@rossby.metr.ou.edu> but has more stress tests and more
   22  * functionality.
   23  *
   24  */
   25 #include "stress-ng.h"
   26 
   /* Mathematical constants, given to long double precision */
   #define GAMMA       (0.57721566490153286060651209008240243104215933593992L)    /* Euler-Mascheroni constant */
   #define OMEGA       (0.56714329040978387299996866221035554975381578718651L)    /* Omega constant */
   #define PSI         (3.359885666243177553172011302918927179688905133732L)      /* reciprocal Fibonacci constant */
   /* NOTE(review): not referenced in the visible part of the file; presumably a statistics table size - confirm */
   #define STATS_MAX   (250)
   31 
   /*
    *  Fallbacks for math libraries that lack some long double or
    *  complex functions: each missing function name is mapped onto
    *  a lower precision or real-valued substitute
    */
   #ifndef HAVE_CABSL
   #define cabsl   cabs
   #endif

   #ifndef HAVE_LGAMMAL
   #define lgammal lgamma
   #endif

   #ifndef HAVE_CCOSL
   #define ccosl   ccos
   #endif

   #ifndef HAVE_CSINL
   #define csinl   csin
   #endif

   #ifndef HAVE_CPOW
   #define cpow    pow
   #endif

   #ifndef HAVE_POWL
   #define powl    pow
   #endif

   #ifndef HAVE_RINTL
   #define rintl   rint
   #endif

   #ifndef HAVE_LOGL
   #define logl    log
   #endif

   /* Haiku always takes the double precision exp, whatever was detected */
   #if !defined(HAVE_EXPL) || defined(__HAIKU__)
   #define expl    exp
   #endif

   #ifndef HAVE_COSL
   #define cosl    cos
   #endif

   #ifndef HAVE_SINL
   #define sinl    sin
   #endif

   #ifndef HAVE_COSHL
   #define coshl   cosh
   #endif

   #ifndef HAVE_SINHL
   #define sinhl   sinh
   #endif

   #ifndef HAVE_SQRTL
   #define sqrtl   sqrt
   #endif
   91 
   /*
    *  The CPU stressor offers several classes of cpu method; each one
    *  is a function that is handed the stressor name, which the
    *  methods use when reporting failures
    */
   typedef void (*stress_cpu_func)(const char *name);

   /*
    *  Pairs the human readable name of a cpu method with the
    *  function that implements it
    */
   typedef struct {
       const char              *name;  /* human readable form of stressor */
       const stress_cpu_func   func;   /* the cpu method function */
   } stress_cpu_method_info_t;
  101 
  102 static const stress_cpu_method_info_t cpu_methods[];
  103 
  104 /* Don't make this static to ensure dithering does not get optimised out */
  105 uint8_t pixels[STRESS_CPU_DITHER_X][STRESS_CPU_DITHER_Y];
  106 
  107 int stress_set_cpu_load(const char *opt) {
  108     int32_t cpu_load;
  109 
  110     cpu_load = get_int32(opt);
  111     check_range("cpu-load", cpu_load, 0, 100);
  112     return set_setting("cpu-load", TYPE_ID_INT32, &cpu_load);
  113 }
  114 
  115 /*
  116  *  stress_set_cpu_load_slice()
  117  *  < 0   - number of iterations per busy slice
  118  *  = 0   - random duration between 0..0.5 seconds
  119  *  > 0   - milliseconds per busy slice
  120  */
  121 int stress_set_cpu_load_slice(const char *opt)
  122 {
  123     int32_t cpu_load_slice;
  124 
  125     cpu_load_slice = get_int32(opt);
  126     if ((cpu_load_slice < -5000) || (cpu_load_slice > 5000)) {
  127         (void)fprintf(stderr, "cpu-load-slice must in the range -5000 to 5000.\n");
  128         _exit(EXIT_FAILURE);
  129     }
  130     return set_setting("cpu-load-slice", TYPE_ID_INT32, &cpu_load_slice);
  131 }
  132 
  133 /*
  134  *  stress_cpu_sqrt()
  135  *  stress CPU on square roots
  136  */
  137 static void HOT TARGET_CLONES stress_cpu_sqrt(const char *name)
  138 {
  139     int i;
  140 
  141     for (i = 0; i < 16384; i++) {
  142         uint64_t rnd = mwc32();
  143         double r = sqrt((double)rnd) * sqrt((double)rnd);
  144         if (UNLIKELY((g_opt_flags & OPT_FLAGS_VERIFY) &&
  145             (uint64_t)rint(r) != rnd)) {
  146             pr_fail("%s: sqrt error detected on "
  147                 "sqrt(%" PRIu64 ")\n", name, rnd);
  148             if (!g_keep_stressing_flag)
  149                 break;
  150         }
  151     }
  152 }
  153 
  154 /*
  155  *  stress_cpu_loop()
  156  *  simple CPU busy loop
  157  */
  158 static void OPTIMIZE0 stress_cpu_loop(const char *name)
  159 {
  160     uint32_t i, i_sum = 0;
  161     const uint32_t sum = 134209536UL;
  162 
  163     for (i = 0; i < 16384; i++) {
  164         i_sum += i;
  165         FORCE_DO_NOTHING();
  166     }
  167     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
  168         pr_fail("%s: cpu loop 0..16383 sum was %" PRIu32 " and "
  169             "did not match the expected value of %" PRIu32 "\n",
  170             name, i_sum, sum);
  171 }
  172 
  173 /*
  174  *  stress_cpu_gcd()
  175  *  compute Greatest Common Divisor
  176  */
  177 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_gcd(const char *name)
  178 {
  179     uint32_t i, i_sum = 0;
  180     const uint32_t sum = 63000868UL;
  181 
  182     for (i = 0; i < 16384; i++) {
  183         register uint32_t a = i, b = i % (3 + (1997 ^ i));
  184 
  185         while (b != 0) {
  186             register uint32_t r = b;
  187             b = a % b;
  188             a = r;
  189         }
  190         i_sum += a;
  191         FORCE_DO_NOTHING();
  192     }
  193     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
  194         pr_fail("%s: gcd error detected, failed modulo "
  195             "or assignment operations\n", name);
  196 }
  197 
  198 /*
  199  *  stress_cpu_bitops()
  200  *  various bit manipulation hacks from bithacks
  201  *  https://graphics.stanford.edu/~seander/bithacks.html
  202  */
  203 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_bitops(const char *name)
  204 {
  205     uint32_t i, i_sum = 0;
  206     const uint32_t sum = 0x8aac0aab;
  207 
  208     for (i = 0; i < 16384; i++) {
  209         {
  210             register uint32_t r, v, s = (sizeof(v) * 8) - 1;
  211 
  212             /* Reverse bits */
  213             r = v = i;
  214             for (v >>= 1; v; v >>= 1, s--) {
  215                 r <<= 1;
  216                 r |= v & 1;
  217             }
  218             r <<= s;
  219             i_sum += r;
  220         }
  221         {
  222             /* parity check */
  223             register uint32_t v = i;
  224 
  225             v ^= v >> 16;
  226             v ^= v >> 8;
  227             v ^= v >> 4;
  228             v &= 0xf;
  229             i_sum += (0x6996 >> v) & 1;
  230         }
  231         {
  232             /* Brian Kernighan count bits */
  233             register uint32_t j, v = i;
  234 
  235             for (j = 0; v; j++)
  236                 v &= v - 1;
  237             i_sum += j;
  238         }
  239         {
  240             /* round up to nearest highest power of 2 */
  241             register uint32_t v = i - 1;
  242 
  243             v |= v >> 1;
  244             v |= v >> 2;
  245             v |= v >> 4;
  246             v |= v >> 8;
  247             v |= v >> 16;
  248             i_sum += v;
  249         }
  250     }
  251     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
  252         pr_fail("%s: bitops error detected, failed "
  253             "bitops operations\n", name);
  254 }
  255 
  256 /*
  257  *  stress_cpu_trig()
  258  *  simple sin, cos trig functions
  259  */
  260 static void HOT stress_cpu_trig(const char *name)
  261 {
  262     int i;
  263     long double d_sum = 0.0L;
  264 
  265     (void)name;
  266 
  267     for (i = 0; i < 1500; i++) {
  268         long double theta = (2.0L * M_PI * (double)i)/1500.0L;
  269         {
  270             d_sum += (cosl(theta) * sinl(theta));
  271             d_sum += (cos(theta) * sin(theta));
  272             d_sum += (cosf(theta) * sinf(theta));
  273         }
  274         {
  275             long double theta2 = theta * 2.0L;
  276 
  277             d_sum += cosl(theta2);
  278             d_sum += cos(theta2);
  279             d_sum += cosf(theta2);
  280         }
  281         {
  282             long double theta3 = theta * 3.0L;
  283 
  284             d_sum += sinl(theta3);
  285             d_sum += sin(theta3);
  286             d_sum += sinf(theta3);
  287         }
  288     }
  289     double_put(d_sum);
  290 }
  291 
  292 /*
  293  *  stress_cpu_hyperbolic()
  294  *  simple hyperbolic sinh, cosh functions
  295  */
  296 static void HOT stress_cpu_hyperbolic(const char *name)
  297 {
  298     int i;
  299     double d_sum = 0.0;
  300 
  301     (void)name;
  302 
  303     for (i = 0; i < 1500; i++) {
  304         long double theta = (2.0L * M_PI * (double)i)/1500.0L;
  305         {
  306             d_sum += (coshl(theta) * sinhl(theta));
  307             d_sum += (cosh(theta) * sinh(theta));
  308             d_sum += (double)(coshf(theta) * sinhf(theta));
  309         }
  310         {
  311             long double theta2 = theta * 2.0L;
  312 
  313             d_sum += coshl(theta2);
  314             d_sum += cosh(theta2);
  315             d_sum += (double)coshf(theta2);
  316         }
  317         {
  318             long double theta3 = theta * 3.0L;
  319 
  320             d_sum += sinhl(theta3);
  321             d_sum += sinh(theta3);
  322             d_sum += (double)sinhf(theta3);
  323         }
  324     }
  325     double_put(d_sum);
  326 }
  327 
  328 /*
  329  *  stress_cpu_rand()
  330  *  generate lots of pseudo-random integers
  331  */
  332 static void HOT OPTIMIZE3 stress_cpu_rand(const char *name)
  333 {
  334     int i;
  335     uint32_t i_sum = 0;
  336     const uint32_t sum = 0xc253698c;
  337 
  338     MWC_SEED();
  339     for (i = 0; i < 16384; i++)
  340         i_sum += mwc32();
  341 
  342     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
  343         pr_fail("%s: rand error detected, failed sum of "
  344             "pseudo-random values\n", name);
  345 }
  346 
  347 /*
  348  *  stress_cpu_rand48()
  349  *  generate random values using rand48 family of functions
  350  */
  351 static void HOT OPTIMIZE3 stress_cpu_rand48(const char *name)
  352 {
  353     int i;
  354     double d = 0;
  355     long int l = 0;
  356 
  357     (void)name;
  358 
  359     srand48(0x0defaced);
  360     for (i = 0; i < 16384; i++) {
  361         d += drand48();
  362         l += lrand48();
  363     }
  364     double_put(d);
  365     uint64_put(l);
  366 }
  367 
  368 /*
  369  *  stress_cpu_nsqrt()
  370  *  iterative Newton–Raphson square root
  371  */
  372 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_nsqrt(const char *name)
  373 {
  374     int i;
  375     const long double precision = 1.0e-12L;
  376     const int max_iter = 56;
  377 
  378     for (i = 0; i < 16384; i++) {
  379         long double n = (double)i;
  380         long double lo = (n < 1.0L) ? n : 1.0L;
  381         long double hi = (n < 1.0L) ? 1.0L : n;
  382         long double rt;
  383         int j = 0;
  384 
  385         while ((j++ < max_iter) && ((hi - lo) > precision)) {
  386             long double g = (lo + hi) / 2.0L;
  387             if ((g * g) > n)
  388                 hi = g;
  389             else
  390                 lo = g;
  391         }
  392         rt = (lo + hi) / 2.0L;
  393 
  394         if (g_opt_flags & OPT_FLAGS_VERIFY) {
  395             if (j >= max_iter)
  396                 pr_fail("%s: Newton-Raphson sqrt "
  397                     "computation took more iterations "
  398                     "than expected\n", name);
  399             if ((int)rintl(rt * rt) != i)
  400                 pr_fail("%s: Newton-Rapshon sqrt not "
  401                     "accurate enough\n", name);
  402         }
  403     }
  404 }
  405 
  406 /*
  407  *  stress_cpu_phi()
  408  *  compute the Golden Ratio
  409  */
  410 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_phi(const char *name)
  411 {
  412     long double phi; /* Golden ratio */
  413     const long double precision = 1.0e-15L;
  414     const long double phi_ = (1.0L + sqrtl(5.0L)) / 2.0L;
  415     register uint64_t a, b;
  416     const uint64_t mask = 1ULL << 63;
  417     int i;
  418 
  419     /* Pick any two starting points */
  420     a = mwc64() % 99;
  421     b = mwc64() % 99;
  422 
  423     /* Iterate until we approach overflow */
  424     for (i = 0; (i < 64) && !((a | b) & mask); i++) {
  425         /* Find nth term */
  426         register uint64_t c = a + b;
  427 
  428         a = b;
  429         b = c;
  430     }
  431     /* And we have the golden ratio */
  432     phi = (long double)b / (long double)a;
  433 
  434     if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
  435         (fabsl(phi - phi_) > precision))
  436         pr_fail("%s: Golden Ratio phi not accurate enough\n",
  437             name);
  438 }
  439 
  440 #if defined(HAVE_COMPLEX_H) &&      \
  441     defined(HAVE_COMPLEX) &&        \
  442     defined(__STDC_IEC_559_COMPLEX__) &&\
  443     !defined(__UCLIBC__)
  444 /*
  445  *  fft_partial()
  446  *      partial Fast Fourier Transform
  447  */
  448 static void HOT OPTIMIZE3 fft_partial(
  449     double complex *data,
  450     double complex *tmp,
  451     const int n,
  452     const int m)
  453 {
  454     if (m < n) {
  455         const int m2 = m * 2;
  456         int i;
  457 
  458         fft_partial(tmp, data, n, m2);
  459         fft_partial(tmp + m, data + m, n, m2);
  460         for (i = 0; i < n; i += m2) {
  461             const double complex negI = -I;
  462             double complex v = tmp[i];
  463             double complex t =
  464                 cexp((negI * M_PI * (double)i) /
  465                      (double)n) * tmp[i + m];
  466             data[i / 2] = v + t;
  467             data[(i + n) / 2] = v - t;
  468         }
  469     }
  470 }
  471 
  472 /*
  473  *  stress_cpu_fft()
  474  *  Fast Fourier Transform
  475  */
  476 static void HOT TARGET_CLONES stress_cpu_fft(const char *name)
  477 {
  478     double complex buf[FFT_SIZE], tmp[FFT_SIZE];
  479     int i;
  480 
  481     (void)name;
  482 
  483     for (i = 0; i < FFT_SIZE; i++)
  484         buf[i] = (double complex)(i % 63);
  485 
  486     (void)memcpy(tmp, buf, sizeof(*tmp) * FFT_SIZE);
  487     fft_partial(buf, tmp, FFT_SIZE, 1);
  488 }
  489 #endif
  490 
  491 /*
  492  *   stress_cpu_euler()
  493  *  compute e using series
  494  */
  495 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_euler(const char *name)
  496 {
  497     long double e = 1.0L, last_e;
  498     long double fact = 1.0L;
  499     long double precision = 1.0e-20L;
  500     int n = 1;
  501 
  502     do {
  503         last_e = e;
  504         fact *= n;
  505         n++;
  506         e += (1.0L / fact);
  507     } while ((n < 25) && (fabsl(e - last_e) > precision));
  508 
  509     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n >= 25))
  510         pr_fail("%s: Euler computation took more iterations "
  511             "than expected\n", name);
  512 }
  513 
  /*
   *  random_buffer()
   *  fill a uint8_t buffer with random data, 4 bytes per random
   *  32 bit value, least significant byte first; the buffer size
   *  *must* be a multiple of 4 bytes as any remainder is left
   *  untouched
   */
  static void random_buffer(uint8_t *data, const size_t len)
  {
      const size_t words = len / 4;
      size_t w;

      for (w = 0; w < words; w++, data += 4) {
          const uint32_t rnd = mwc32();

          data[0] = (uint8_t)rnd;
          data[1] = (uint8_t)(rnd >> 8);
          data[2] = (uint8_t)(rnd >> 16);
          data[3] = (uint8_t)(rnd >> 24);
      }
  }
  535 
  536 /*
  537  *  stress_cpu_hash_generic()
  538  *  stress test generic string hash function
  539  */
  540 static void stress_cpu_hash_generic(
  541     const char *name,
  542     const char *hash_name,
  543     uint32_t (*hash_func)(const char *str),
  544     const uint32_t result)
  545 {
  546     char buffer[128];
  547     size_t i;
  548     uint32_t i_sum = 0;
  549 
  550     MWC_SEED();
  551     random_buffer((uint8_t *)buffer, sizeof(buffer));
  552     /* Make it ASCII range ' '..'_' */
  553     for (i = 0; i < sizeof(buffer); i++)
  554         buffer[i] = (buffer[i] & 0x3f) + ' ';
  555 
  556     for (i = sizeof(buffer) - 1; i; i--) {
  557         buffer[i] = '\0';
  558         i_sum += hash_func(buffer);
  559     }
  560     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != result))
  561         pr_fail("%s: %s error detected, failed hash %s sum\n",
  562             name, hash_name, hash_name);
  563 }
  564 
  565 
  566 /*
  567  *  jenkin()
  568  *  Jenkin's hash on random data
  569  *  http://www.burtleburtle.net/bob/hash/doobs.html
  570  */
  571 static uint32_t HOT OPTIMIZE3 jenkin(const uint8_t *data, const size_t len)
  572 {
  573     register size_t i;
  574     register uint32_t h = 0;
  575 
  576     for (i = 0; i < len; i++) {
  577         h += *data++;
  578         h += h << 10;
  579         h ^= h >> 6;
  580     }
  581     h += h << 3;
  582     h ^= h >> 11;
  583     h += h << 15;
  584 
  585     return h;
  586 }
  587 
  588 /*
  589  *  stress_cpu_jenkin()
  590  *  multiple iterations on jenkin hash
  591  */
  592 static void stress_cpu_jenkin(const char *name)
  593 {
  594     uint8_t buffer[128];
  595     size_t i;
  596     uint32_t i_sum = 0;
  597     const uint32_t sum = 0x96673680;
  598 
  599     MWC_SEED();
  600     random_buffer(buffer, sizeof(buffer));
  601     for (i = 0; i < sizeof(buffer); i++)
  602         i_sum += jenkin(buffer, sizeof(buffer));
  603 
  604     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
  605         pr_fail("%s: jenkin error detected, failed hash "
  606             "jenkin sum\n", name);
  607 }
  608 
  609 /*
  610  *  pjw()
  611  *  Hash a string, from Aho, Sethi, Ullman, Compiling Techniques.
  612  */
  613 static uint32_t HOT OPTIMIZE3 pjw(const char *str)
  614 {
  615     register uint32_t h = 0;
  616 
  617     while (*str) {
  618         register uint32_t g;
  619 
  620         h = (h << 4) + (*str);
  621         if (0 != (g = h & 0xf0000000)) {
  622             h = h ^ (g >> 24);
  623             h = h ^ g;
  624         }
  625         str++;
  626     }
  627     return h;
  628 }
  629 
  630 /*
  631  *  stress_cpu_pjw()
  632  *  stress test hash pjw
  633  */
  634 static void stress_cpu_pjw(const char *name)
  635 {
  636     stress_cpu_hash_generic(name, "pjw", pjw, 0xa89a91c0);
  637 }
  638 
  639 /*
  640  *  djb2a()
  641  *  Hash a string, from Dan Bernstein comp.lang.c (xor version)
  642  */
  643 static uint32_t HOT OPTIMIZE3 djb2a(const char *str)
  644 {
  645     register uint32_t hash = 5381;
  646     register int c;
  647 
  648     while ((c = *str++)) {
  649         /* (hash * 33) ^ c */
  650         hash = ((hash << 5) + hash) ^ c;
  651     }
  652     return hash;
  653 }
  654 
  655 /*
  656  *  stress_cpu_djb2a()
  657  *  stress test hash djb2a
  658  */
  659 static void stress_cpu_djb2a(const char *name)
  660 {
  661     stress_cpu_hash_generic(name, "djb2a", djb2a, 0x6a60cb5a);
  662 }
  663 
  664 /*
  665  *  fnv1a()
  666  *  Hash a string, using the improved 32 bit FNV-1a hash
  667  */
  668 static uint32_t HOT OPTIMIZE3 fnv1a(const char *str)
  669 {
  670     register uint32_t hash = 5381;
  671     const uint32_t fnv_prime = 16777619; /* 2^24 + 2^9 + 0x93 */
  672     register int c;
  673 
  674     while ((c = *str++)) {
  675         hash ^= c;
  676         hash *= fnv_prime;
  677     }
  678     return hash;
  679 }
  680 
  681 /*
  682  *  stress_cpu_fnv1a()
  683  *  stress test hash fnv1a
  684  */
  685 static void HOT stress_cpu_fnv1a(const char *name)
  686 {
  687     stress_cpu_hash_generic(name, "fnv1a", fnv1a, 0x8ef17e80);
  688 }
  689 
  690 /*
  691  *  sdbm()
  692  *  Hash a string, using the sdbm data base hash and also
  693  *  apparently used in GNU awk.
  694  */
  695 static uint32_t OPTIMIZE3 sdbm(const char *str)
  696 {
  697     register uint32_t hash = 0;
  698     register int c;
  699 
  700     while ((c = *str++))
  701         hash = c + (hash << 6) + (hash << 16) - hash;
  702     return hash;
  703 }
  704 
  705 /*
  706  *  stress_cpu_sdbm()
  707  *  stress test hash sdbm
  708  */
  709 static void stress_cpu_sdbm(const char *name)
  710 {
  711     stress_cpu_hash_generic(name, "sdbm", sdbm, 0x46357819);
  712 }
  713 
  714 /*
  715  *  stress_cpu_idct()
  716  *  compute 8x8 Inverse Discrete Cosine Transform
  717  */
  718 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_idct(const char *name)
  719 {
  720     const double invsqrt2 = 1.0 / sqrt(2.0);
  721     const double pi_over_16 = M_PI / 16.0;
  722     const int sz = 8;
  723     int i, j, u, v;
  724     float data[sz][sz], idct[sz][sz];
  725 
  726     /*
  727      *  Set up DCT
  728      */
  729     for (i = 0; i < sz; i++) {
  730         for (j = 0; j < sz; j++) {
  731             data[i][j] = (i + j == 0) ? 2040: 0;
  732         }
  733     }
  734     for (i = 0; i < sz; i++) {
  735         const double pi_i = (i + i + 1) * pi_over_16;
  736 
  737         for (j = 0; j < sz; j++) {
  738             const double pi_j = (j + j + 1) * pi_over_16;
  739             double sum = 0.0;
  740 
  741             for (u = 0; u < sz; u++) {
  742                 const double cos_pi_i_u = cos(pi_i * u);
  743 
  744                 for (v = 0; v < sz; v++) {
  745                     const double cos_pi_j_v =
  746                         cos(pi_j * v);
  747 
  748                     sum += (data[u][v] *
  749                         (u ? 1.0 : invsqrt2) *
  750                         (v ? 1.0 : invsqrt2) *
  751                         cos_pi_i_u * cos_pi_j_v);
  752                 }
  753             }
  754             idct[i][j] = 0.25 * sum;
  755         }
  756     }
  757     /* Final output should be a 8x8 matrix of values 255 */
  758     if (g_opt_flags & OPT_FLAGS_VERIFY) {
  759         for (i = 0; i < sz; i++) {
  760             for (j = 0; j < sz; j++) {
  761                 if ((int)idct[i][j] != 255) {
  762                     pr_fail("%s: IDCT error detected, "
  763                         "IDCT[%d][%d] was %d, "
  764                         "expecting 255\n",
  765                         name, i, j, (int)idct[i][j]);
  766                 }
  767             }
  768             if (!g_keep_stressing_flag)
  769                 return;
  770         }
  771     }
  772 }
  773 
  /*
   *  int_ops()
   *  one round of mixed integer operations on the lvalues ia and
   *  ib, using the constants k1, k2 and k3 and four mwc32() values.
   *  The expansion ends with its own ';' and so is used without a
   *  trailing semicolon.
   */
  #define int_ops(ia, ib, k1, k2, k3)     \
      do {                                \
          ia += ib;                       \
          ib ^= ia;                       \
          ia >>= 1;                       \
          ib <<= 2;                       \
          ib -= ia;                       \
          ia ^= ~0;                       \
          ib ^= ~(k1);                    \
          ia *= 3;                        \
          ib *= 7;                        \
          ia += 2;                        \
          ib -= 3;                        \
          ia /= 77;                       \
          ib /= 3;                        \
          ia <<= 1;                       \
          ib <<= 2;                       \
          ia |= 1;                        \
          ib |= 3;                        \
          ia *= mwc32();                  \
          ib ^= mwc32();                  \
          ia += mwc32();                  \
          ib -= mwc32();                  \
          ia /= 7;                        \
          ib /= 9;                        \
          ia |= (k2);                     \
          ib &= (k3);                     \
      } while (0);
  802 
  /* 64 bit constants fed to the integer stressors, masked to the type width */
  #define C1  (0xf0f0f0f0f0f0f0f0ULL)
  #define C2  (0x1000100010001000ULL)
  #define C3  (0xffeffffefebefffeULL)

  /*
   *  stress_cpu_int()
   *  generates stress_cpu_int<_sz>(): 1000 rounds of int_ops() on
   *  two values of type _type taken from mwc32() after MWC_SEED();
   *  when verifying, the results must equal _a and _b
   */
  #define stress_cpu_int(_type, _sz, _a, _b, _c1, _c2, _c3)               \
  static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_int ## _sz(const char *name)\
  {                                                                       \
      const _type all_ones = ~0;                                          \
      const _type want_a = _a;                                            \
      const _type want_b = _b;                                            \
      const _type k1 = _c1 & all_ones;                                    \
      const _type k2 = _c2 & all_ones;                                    \
      const _type k3 = _c3 & all_ones;                                    \
      register _type va, vb;                                              \
      int pass;                                                           \
                                                                          \
      MWC_SEED();                                                         \
      va = mwc32();                                                       \
      vb = mwc32();                                                       \
                                                                          \
      for (pass = 0; pass < 1000; pass++) {                               \
          int_ops(va, vb, k1, k2, k3)                                     \
      }                                                                   \
                                                                          \
      if ((g_opt_flags & OPT_FLAGS_VERIFY) &&                             \
          ((va != want_a) || (vb != want_b)))                             \
          pr_fail("%s: int" # _sz " error detected, failed int" # _sz     \
              " math operations\n", name);                                \
  }
  836 
  837 /* For compilers that support int128 .. */
  838 #if defined(HAVE_INT128_T)
  839 
  840 #define _UINT128(hi, lo)    ((((__uint128_t)hi << 64) | (__uint128_t)lo))
  841 
  842 stress_cpu_int(__uint128_t, 128,
  843     _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
  844     _UINT128(0x62f086e6160e4e,0xd84c9f800365858),
  845     _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3))
  846 #endif
  847 
  848 stress_cpu_int(uint64_t, 64, \
  849     0x013f7f6dc1d79197cULL, 0x01863d2c6969a51ceULL,
  850     C1, C2, C3)
  851 
  852 stress_cpu_int(uint32_t, 32, \
  853     0x1ce9b547UL, 0xa24b33aUL,
  854     C1, C2, C3)
  855 
  856 stress_cpu_int(uint16_t, 16, \
  857     0x1871, 0x07f0,
  858     C1, C2, C3)
  859 
  860 stress_cpu_int(uint8_t, 8, \
  861     0x12, 0x1a,
  862     C1, C2, C3)
  863 
  /*
   *  float_ops()
   *  one round of mixed floating point operations on the lvalues
   *  fa, fb, fc and fd of type _type, using the supplied sine and
   *  cosine functions; fd is assigned before it is first read
   */
  #define float_ops(_type, fa, fb, fc, fd, _sinfn, _cosfn)    \
      do {                                                    \
          fa = fa + fb;                                       \
          fb = fa * fc;                                       \
          fc = fa - fb;                                       \
          fd = fa / fb;                                       \
          fa = fc / (_type)0.1923L;                           \
          fb = fc + fa;                                       \
          fc = fb * (_type)3.12L;                             \
          fd = fd + fb + (_type)_sinfn(fa);                   \
          fa = (fb + fc) / fc;                                \
          fb = fb * fc;                                       \
          fc = fc + (_type)1.0L;                              \
          fd = fd - (_type)_sinfn(fc);                        \
          fa = fa * (_type)_cosfn(fb);                        \
          fb = fb + (_type)_cosfn(fc);                        \
          fc = (_type)_sinfn(fa + fb) / (_type)2.344L;        \
          fb = fd - (_type)1.0L;                              \
      } while (0)
  883 
  884 /*
  885  *  Generic floating point stressor macro
  886  */
  887 #define stress_cpu_fp(_type, _name, _sin, _cos)     \
  888 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ ## _name(const char *name)\
  889 {                           \
  890     int i;                      \
  891     _type a = 0.18728L, b = mwc32(), c = mwc32(), d;\
  892                             \
  893     (void)name;                 \
  894                             \
  895     for (i = 0; i < 1000; i++) {            \
  896         float_ops(_type, a, b, c, d,        \
  897             _sin, _cos);            \
  898     }                       \
  899     double_put(a + b + c + d);          \
  900 }
  901 
  902 stress_cpu_fp(float, float, sinf, cosf)
  903 stress_cpu_fp(double, double, sin, cos)
  904 stress_cpu_fp(long double, longdouble, sinl, cosl)
  905 #if defined(HAVE_FLOAT_DECIMAL32) && !defined(__clang__)
  906 stress_cpu_fp(_Decimal32, decimal32, sinf, cosf)
  907 #endif
  908 #if defined(HAVE_FLOAT_DECIMAL64) && !defined(__clang__)
  909 stress_cpu_fp(_Decimal64, decimal64, sin, cos)
  910 #endif
  911 #if defined(HAVE_FLOAT_DECIMAL128) && !defined(__clang__)
  912 stress_cpu_fp(_Decimal128, decimal128, sinl, cosl)
  913 #endif
  914 #if defined(HAVE_FLOAT16) && !defined(__clang__)
  915 stress_cpu_fp(__fp16, float16, sin, cos)
  916 #endif
  917 #if defined(HAVE_FLOAT32) && !defined(__clang__)
  918 stress_cpu_fp(_Float32, float32, sin, cos)
  919 #endif
  920 #if defined(HAVE_FLOAT80) && !defined(__clang__)
  921 stress_cpu_fp(__float80, float80, sinl, cosl)
  922 #endif
  923 #if defined(HAVE_FLOAT128) && !defined(__clang__)
  924 stress_cpu_fp(__float128, float128, sinl, cosl)
  925 #endif
  926 
  /* Paste a floating point literal suffix (f, l or nothing) onto a literal value */
  #define FP(literal, suffix)  literal ## suffix
  929 
  930 #if defined(HAVE_COMPLEX_H) &&      \
  931     defined(HAVE_COMPLEX) &&        \
  932     defined(__STDC_IEC_559_COMPLEX__) &&\
  933     !defined(__UCLIBC__)
  934 /*
  935  *  Generic complex stressor macro
  936  */
  937 #define stress_cpu_complex(_type, _ltype, _name, _csin, _ccos)  \
  938 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ ## _name(const char *name)\
  939 {                           \
  940     int i;                      \
  941     _type cI = I;                   \
  942     _type a = FP(0.18728, _ltype) +         \
  943         cI * FP(0.2762, _ltype),        \
  944         b = mwc32() - cI * FP(0.11121, _ltype),\
  945         c = mwc32() + cI * mwc32(), d;      \
  946                             \
  947     (void)name;                 \
  948                             \
  949     for (i = 0; i < 1000; i++) {            \
  950         float_ops(_type, a, b, c, d,        \
  951             _csin, _ccos);          \
  952     }                       \
  953     double_put(a + b + c + d);          \
  954 }
  955 
  956 stress_cpu_complex(complex float, f, complex_float, csinf, ccosf)
  957 stress_cpu_complex(complex double, , complex_double, csin, ccos)
  958 stress_cpu_complex(complex long double, l, complex_long_double, csinl, ccosl)
  959 #endif
  960 
  /*
   *  int_float_ops()
   *  one round of interleaved integer and floating point work: the
   *  integer statements match int_ops() on ia and ib, the floating
   *  point statements match float_ops() on fa, fb, fc and fd
   */
  #define int_float_ops(_ftype, fa, fb, fc, fd,               \
      _sinfn, _cosfn, ia, ib, _k1, _k2, _k3)                  \
      do {                                                    \
          ia += ib;                                           \
          ib ^= ia;                                           \
          fa = fa + fb;                                       \
          ia >>= 1;                                           \
          ib <<= 2;                                           \
          fb = fa * fc;                                       \
          ib -= ia;                                           \
          ia ^= ~0;                                           \
          fc = fa - fb;                                       \
          ib ^= ~(_k1);                                       \
          ia *= 3;                                            \
          fd = fa / fb;                                       \
          ib *= 7;                                            \
          ia += 2;                                            \
          fa = fc / (_ftype)0.1923L;                          \
          ib -= 3;                                            \
          ia /= 77;                                           \
          fb = fc + fa;                                       \
          ib /= 3;                                            \
          ia <<= 1;                                           \
          fc = fb * (_ftype)3.12L;                            \
          ib <<= 2;                                           \
          ia |= 1;                                            \
          fd = fd + fb + (_ftype)_sinfn(fa);                  \
          ib |= 3;                                            \
          ia *= mwc32();                                      \
          fa = (fb + fc) / fc;                                \
          ib ^= mwc32();                                      \
          ia += mwc32();                                      \
          fb = fb * fc;                                       \
          ib -= mwc32();                                      \
          ia /= 7;                                            \
          fc = fc + (_ftype)1.0L;                             \
          ib /= 9;                                            \
          fd = fd - (_ftype)_sinfn(fc);                       \
          ia |= (_k2);                                        \
          fa = fa * (_ftype)_cosfn(fb);                       \
          fb = fb + (_ftype)_cosfn(fc);                       \
          ib &= (_k3);                                        \
          fc = (_ftype)_sinfn(fa + fb) / (_ftype)2.344L;      \
          fb = fd - (_ftype)1.0L;                             \
      } while (0)
 1006 
 1007 
 1008 /*
 1009  *  Generic integer and floating point stressor macro
 1010  */
 1011 #define stress_cpu_int_fp(_inttype, _sz, _ftype, _name, _a, _b, \
 1012     _c1, _c2, _c3, _sinf, _cosf)                \
 1013 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_int ## _sz ## _ ## _name(const char *name)\
 1014 {                               \
 1015     int i;                          \
 1016     _inttype int_a, int_b;                  \
 1017     const _inttype mask = ~0;               \
 1018     const _inttype a_final = _a;                \
 1019     const _inttype b_final = _b;                \
 1020     const _inttype c1 = _c1 & mask;             \
 1021     const _inttype c2 = _c2 & mask;             \
 1022     const _inttype c3 = _c3 & mask;             \
 1023     _ftype flt_a = 0.18728L, flt_b = mwc32(),       \
 1024         flt_c = mwc32(), flt_d;             \
 1025                                 \
 1026     MWC_SEED();                     \
 1027     int_a = mwc32();                    \
 1028     int_b = mwc32();                    \
 1029                                 \
 1030     for (i = 0; i < 1000; i++) {                \
 1031         int_float_ops(_ftype, flt_a, flt_b, flt_c, flt_d,\
 1032             _sinf, _cosf, int_a, int_b, c1, c2, c3);\
 1033     }                           \
 1034     if ((g_opt_flags & OPT_FLAGS_VERIFY) &&         \
 1035         ((int_a != a_final) || (int_b != b_final)))     \
 1036         pr_fail("%s: int" # _sz " error detected, " \
 1037             "failed int" # _sz "" # _ftype      \
 1038             " math operations\n", name);        \
 1039                                 \
 1040     double_put(flt_a + flt_b + flt_c + flt_d);      \
 1041 }
 1042 
/*
 *  Instantiate the mixed integer/floating point stressors. For each
 *  integer width the two large constants are the expected final
 *  int_a/int_b values that the generated function checks when
 *  verification is enabled; they are the same for every floating
 *  point type of a given width.
 */

/* 32 bit integer with float, double and long double */
stress_cpu_int_fp(uint32_t, 32, float, float,
    0x1ce9b547UL, 0xa24b33aUL,
    C1, C2, C3, sinf, cosf)
stress_cpu_int_fp(uint32_t, 32, double, double,
    0x1ce9b547UL, 0xa24b33aUL,
    C1, C2, C3, sin, cos)
stress_cpu_int_fp(uint32_t, 32, long double, longdouble,
    0x1ce9b547UL, 0xa24b33aUL,
    C1, C2, C3, sinl, cosl)
/* 64 bit integer with float, double and long double */
stress_cpu_int_fp(uint64_t, 64, float, float,
    0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
    C1, C2, C3, sinf, cosf)
stress_cpu_int_fp(uint64_t, 64, double, double,
    0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
    C1, C2, C3, sin, cos)
stress_cpu_int_fp(uint64_t, 64, long double, longdouble,
    0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
    C1, C2, C3, sinl, cosl)

/* 128 bit integer variants, only when the compiler offers __uint128_t */
#if defined(HAVE_INT128_T)
stress_cpu_int_fp(__uint128_t, 128, float, float,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    sinf, cosf)
stress_cpu_int_fp(__uint128_t, 128, double, double,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    sin, cos)
stress_cpu_int_fp(__uint128_t, 128, long double, longdouble,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    sinl, cosl)
/*
 *  Decimal floating point variants (not built with clang). The
 *  sine/cosine arguments carry a cast so the result of the binary
 *  floating point function is converted to the decimal type.
 */
#if defined(HAVE_FLOAT_DECIMAL32) && !defined(__clang__)
stress_cpu_int_fp(__uint128_t, 128, _Decimal32, decimal32,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    (_Decimal32)sinf, (_Decimal32)cosf)
#endif
#if defined(HAVE_FLOAT_DECIMAL64) && !defined(__clang__)
stress_cpu_int_fp(__uint128_t, 128, _Decimal64, decimal64,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    (_Decimal64)sin, (_Decimal64)cos)
#endif
#if defined(HAVE_FLOAT_DECIMAL128) && !defined(__clang__)
stress_cpu_int_fp(__uint128_t, 128, _Decimal128, decimal128,
    _UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
    _UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
    _UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
    (_Decimal128)sinl, (_Decimal128)cosl)
#endif
#endif
 1100 
 1101 /*
 1102  *  stress_cpu_rgb()
 1103  *  CCIR 601 RGB to YUV to RGB conversion
 1104  */
 1105 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_rgb(const char *name)
 1106 {
 1107     int i;
 1108     uint32_t rgb = mwc32() & 0xffffff;
 1109     uint8_t r = rgb >> 16;
 1110     uint8_t g = rgb >> 8;
 1111     uint8_t b = rgb;
 1112 
 1113     (void)name;
 1114 
 1115     /* Do a 1000 colours starting from the rgb seed */
 1116     for (i = 0; i < 1000; i++) {
 1117         float y, u, v;
 1118 
 1119         /* RGB to CCIR 601 YUV */
 1120         y = (0.299f * r) + (0.587f * g) + (0.114f * b);
 1121         u = (b - y) * 0.565f;
 1122         v = (r - y) * 0.713f;
 1123 
 1124         /* YUV back to RGB */
 1125         r = y + (1.403f * v);
 1126         g = y - (0.344f * u) - (0.714f * v);
 1127         b = y + (1.770f * u);
 1128 
 1129         /* And bump each colour to make next round */
 1130         r += 1;
 1131         g += 2;
 1132         b += 3;
 1133         uint64_put(r + g + b);
 1134     }
 1135 }
 1136 
 1137 /*
 1138  *  stress_cpu_matrix_prod(void)
 1139  *  matrix product
 1140  */
 1141 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_matrix_prod(const char *name)
 1142 {
 1143     int i, j, k;
 1144     const int n = 128;
 1145 
 1146     long double a[n][n], b[n][n], r[n][n];
 1147     long double v = 1 / (long double)((uint32_t)~0);
 1148     long double sum = 0.0L;
 1149 
 1150     (void)name;
 1151 
 1152     for (i = 0; i < n; i++) {
 1153         for (j = 0; j < n; j++) {
 1154             a[i][j] = (long double)mwc32() * v;
 1155             b[i][j] = (long double)mwc32() * v;
 1156             r[i][j] = 0.0L;
 1157         }
 1158     }
 1159 
 1160     for (i = 0; i < n; i++) {
 1161         for (j = 0; j < n; j++) {
 1162             for (k = 0; k < n; k++) {
 1163                 r[i][j] += a[i][k] * b[k][j];
 1164             }
 1165         }
 1166     }
 1167 
 1168     for (i = 0; i < n; i++)
 1169         for (j = 0; j < n; j++)
 1170             sum += r[i][j];
 1171     double_put(sum);
 1172 }
 1173 
 1174 /*
 1175  *   stress_cpu_fibonacci()
 1176  *  compute fibonacci series
 1177  */
 1178 static void HOT OPTIMIZE3 stress_cpu_fibonacci(const char *name)
 1179 {
 1180     const uint64_t fn_res = 0xa94fad42221f2702ULL;
 1181     register uint64_t f1 = 0, f2 = 1, fn;
 1182 
 1183     do {
 1184         fn = f1 + f2;
 1185         f1 = f2;
 1186         f2 = fn;
 1187     } while (!(fn & 0x8000000000000000ULL));
 1188 
 1189     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (fn_res != fn))
 1190         pr_fail("%s: fibonacci error detected, summation "
 1191             "or assignment failure\n", name);
 1192 }
 1193 
 1194 /*
 1195  *  stress_cpu_psi
 1196  *  compute the constant psi,
 1197  *  the reciprocal Fibonacci constant
 1198  */
 1199 static void HOT OPTIMIZE3 stress_cpu_psi(const char *name)
 1200 {
 1201     long double f1 = 0.0L, f2 = 1.0L;
 1202     long double psi = 0.0L, last_psi;
 1203     long double precision = 1.0e-20L;
 1204     int i = 0;
 1205     const int max_iter = 100;
 1206 
 1207     do {
 1208         long double fn = f1 + f2;
 1209         f1 = f2;
 1210         f2 = fn;
 1211         last_psi = psi;
 1212         psi += 1.0L / f1;
 1213         i++;
 1214     } while ((i < max_iter) && (fabsl(psi - last_psi) > precision));
 1215 
 1216     if (g_opt_flags & OPT_FLAGS_VERIFY) {
 1217         if (fabsl(psi - PSI) > 1.0e-15L)
 1218             pr_fail("%s: calculation of reciprocal "
 1219                 "Fibonacci constant phi not as accurate "
 1220                 "as expected\n", name);
 1221         if (i >= max_iter)
 1222             pr_fail("%s: calculation of reciprocal "
 1223                 "Fibonacci constant took more iterations "
 1224                 "than expected\n", name);
 1225     }
 1226 
 1227     double_put(psi);
 1228 }
 1229 
 1230 /*
 1231  *   stress_cpu_ln2
 1232  *  compute ln(2) using series
 1233  */
 1234 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ln2(const char *name)
 1235 {
 1236     long double ln2 = 0.0L, last_ln2 = 0.0L;
 1237     long double precision = 1.0e-7L;
 1238     register int n = 1;
 1239     const int max_iter = 10000;
 1240 
 1241     /* Not the fastest converging series */
 1242     do {
 1243         last_ln2 = ln2;
 1244         /* Unroll, do several ops */
 1245         ln2 += (long double)1.0L / (long double)n++;
 1246         ln2 -= (long double)1.0L / (long double)n++;
 1247         ln2 += (long double)1.0L / (long double)n++;
 1248         ln2 -= (long double)1.0L / (long double)n++;
 1249         ln2 += (long double)1.0L / (long double)n++;
 1250         ln2 -= (long double)1.0L / (long double)n++;
 1251         ln2 += (long double)1.0L / (long double)n++;
 1252         ln2 -= (long double)1.0L / (long double)n++;
 1253     } while ((n < max_iter) && (fabsl(ln2 - last_ln2) > precision));
 1254 
 1255     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n >= max_iter))
 1256         pr_fail("%s: calculation of ln(2) took more "
 1257             "iterations than expected\n", name);
 1258 
 1259     double_put(ln2);
 1260 }
 1261 
 1262 /*
 1263  *  ackermann()
 1264  *  a naive/simple implementation of the ackermann function
 1265  */
 1266 static uint32_t HOT ackermann(const uint32_t m, const uint32_t n)
 1267 {
 1268     if (m == 0)
 1269         return n + 1;
 1270     else if (n == 0)
 1271         return ackermann(m - 1, 1);
 1272     else
 1273         return ackermann(m - 1, ackermann(m, n - 1));
 1274 }
 1275 
 1276 /*
 1277  *   stress_cpu_ackermann
 1278  *  compute ackermann function
 1279  */
 1280 static void stress_cpu_ackermann(const char *name)
 1281 {
 1282     uint32_t a = ackermann(3, 10);
 1283 
 1284     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (a != 0x1ffd))
 1285         pr_fail("%s: ackermann error detected, "
 1286             "ackermann(3,10) miscalculated\n", name);
 1287 }
 1288 
 1289 /*
 1290  *   stress_cpu_explog
 1291  *  compute exp(log(n))
 1292  */
 1293 static void HOT stress_cpu_explog(const char *name)
 1294 {
 1295     uint32_t i;
 1296     double n = 1e6;
 1297 
 1298     (void)name;
 1299 
 1300     for (i = 1; i < 100000; i++)
 1301         n = exp(log(n) / 1.00002);
 1302 }
 1303 
 1304 /*
 1305  *  This could be a ternary operator, v = (v op val) ? a : b
 1306  *  but it may be optimised down, so force a compare and jmp
 1307  *  with -O0 and a if/else construct
 1308  */
 1309 #define JMP(v, op, val, a, b)       \
 1310     if (v op val)           \
 1311         v = a;          \
 1312     else                \
 1313         v = b;          \
 1314     uint64_put(next + i);       \
 1315 
/*
 *   stress_cpu_jmp
 *  jmp conditionals
 *
 *  Runs 999 rounds of twelve JMP() compare-and-assign steps on
 *  next. The function is built with OPTIMIZE0 and the steps are
 *  written out rather than looped. name is unused.
 *
 *  The locals must be called next and i: the JMP() macro refers
 *  to both names directly.
 */
static void HOT OPTIMIZE0 stress_cpu_jmp(const char *name)
{
    register int i, next = 0;

    (void)name;

    for (i = 1; i < 1000; i++) {
        /* Force lots of compare jmps */
        JMP(next, ==, 1, 2, 3);
        JMP(next, >, 2, 0, 1);
        JMP(next, <, 1, 1, 0);
        JMP(next, ==, 1, 2, 3);
        JMP(next, >, 2, 0, 1);
        JMP(next, <, 1, 1, 0);
        JMP(next, ==, 1, 2, 3);
        JMP(next, >, 2, 0, 1);
        JMP(next, <, 1, 1, 0);
        JMP(next, ==, 1, 2, 3);
        JMP(next, >, 2, 0, 1);
        JMP(next, <, 1, 1, 0);
    }
}
 1342 
 1343 /*
 1344  *  ccitt_crc16()
 1345  *  perform naive CCITT CRC16
 1346  */
 1347 static uint16_t HOT OPTIMIZE3 ccitt_crc16(const uint8_t *data, size_t n)
 1348 {
 1349     /*
 1350      *  The CCITT CRC16 polynomial is
 1351      *     16    12    5
 1352      *    x   + x   + x  + 1
 1353      *
 1354      *  which is 0x11021, but to make the computation
 1355      *  simpler, this has been reversed to 0x8408 and
 1356      *  the top bit ignored..
 1357      *  We can get away with a 17 bit polynomial
 1358      *  being represented by a 16 bit value because
 1359      *  we are assuming the top bit is always set.
 1360      */
 1361     const uint16_t polynomial = 0x8408;
 1362     register uint16_t crc = ~0;
 1363 
 1364     if (!n)
 1365         return 0;
 1366 
 1367     for (; n; n--) {
 1368         uint8_t i;
 1369         uint8_t val = (uint16_t)0xff & *data++;
 1370 
 1371         for (i = 8; i; --i, val >>= 1) {
 1372             bool do_xor = 1 & (val ^ crc);
 1373             crc >>= 1;
 1374             crc ^= do_xor ? polynomial : 0;
 1375         }
 1376     }
 1377 
 1378     crc = ~crc;
 1379     return (crc << 8) | (crc >> 8);
 1380 }
 1381 
 1382 /*
 1383  *   stress_cpu_crc16
 1384  *  compute 1024 rounds of CCITT CRC16
 1385  */
 1386 static void stress_cpu_crc16(const char *name)
 1387 {
 1388     uint8_t buffer[1024];
 1389     size_t i;
 1390 
 1391     (void)name;
 1392 
 1393     random_buffer(buffer, sizeof(buffer));
 1394     for (i = 0; i < sizeof(buffer); i++)
 1395         uint64_put(ccitt_crc16(buffer, i));
 1396 }
 1397 
 1398 #if defined(HAVE_COMPLEX_H) &&      \
 1399     defined(HAVE_COMPLEX) &&        \
 1400     defined(__STDC_IEC_559_COMPLEX__) &&\
 1401     !defined(__UCLIBC__)
 1402 /*
 1403  *  zeta()
 1404  *  Riemann zeta function
 1405  */
 1406 static inline long double complex HOT OPTIMIZE3 zeta(
 1407     const long double complex s,
 1408     long double precision)
 1409 {
 1410     int i = 1;
 1411     long double complex z = 0.0L, zold = 0.0L;
 1412 
 1413     do {
 1414         zold = z;
 1415         z += 1 / cpow(i++, s);
 1416     } while (cabsl(z - zold) > precision);
 1417 
 1418     return z;
 1419 }
 1420 
 1421 /*
 1422  * stress_cpu_zeta()
 1423  *  stress test Zeta(2.0)..Zeta(10.0)
 1424  */
 1425 static void stress_cpu_zeta(const char *name)
 1426 {
 1427     long double precision = 0.00000001L;
 1428     int i;
 1429 
 1430     (void)name;
 1431 
 1432     for (i = 2; i < 11; i++)
 1433         double_put(zeta((double complex)i, precision));
 1434 }
 1435 #endif
 1436 
 1437 /*
 1438  * stress_cpu_gamma()
 1439  *  stress Euler–Mascheroni constant gamma
 1440  */
 1441 static void HOT OPTIMIZE3 stress_cpu_gamma(const char *name)
 1442 {
 1443     long double precision = 1.0e-10L;
 1444     long double sum = 0.0L, k = 1.0L, _gamma = 0.0L, gammaold;
 1445 
 1446     do {
 1447         gammaold = _gamma;
 1448         sum += 1.0L / k;
 1449         _gamma = sum - logl(k);
 1450         k += 1.0L;
 1451     } while (k < 1e6 && fabsl(_gamma - gammaold) > precision);
 1452 
 1453     double_put(_gamma);
 1454 
 1455     if (g_opt_flags & OPT_FLAGS_VERIFY) {
 1456         if (fabsl(_gamma - GAMMA) > 1.0e-5L)
 1457             pr_fail("%s: calculation of Euler-Mascheroni "
 1458                 "constant not as accurate as expected\n", name);
 1459         if (k > 80000.0L)
 1460             pr_fail("%s: calculation of Euler-Mascheroni "
 1461                 "constant took more iterations than "
 1462                 "expected\n", name);
 1463     }
 1464 
 1465 }
 1466 
 1467 /*
 1468  * stress_cpu_correlate()
 1469  *
 1470  *  Introduction to Signal Processing,
 1471  *  Prentice-Hall, 1995, ISBN: 0-13-209172-0.
 1472  */
 1473 static void HOT OPTIMIZE3 stress_cpu_correlate(const char *name)
 1474 {
 1475     const size_t data_len = 16384;
 1476     const size_t corr_len = data_len / 16;
 1477     size_t i, j;
 1478     double data_average = 0.0;
 1479     double data[data_len], corr[corr_len + 1];
 1480 
 1481     (void)name;
 1482 
 1483     /* Generate some random data */
 1484     for (i = 0; i < data_len; i++) {
 1485         data[i] = mwc64();
 1486         data_average += data[i];
 1487     }
 1488     data_average /= (double)data_len;
 1489 
 1490     /* And correlate */
 1491     for (i = 0; i <= corr_len; i++) {
 1492         corr[i] = 0.0;
 1493         for (j = 0; j < data_len - i; j++) {
 1494             corr[i] += (data[i + j] - data_average) *
 1495                    (data[j] - data_average);
 1496         }
 1497         corr[i] /= (double)corr_len;
 1498         double_put(corr[i]);
 1499     }
 1500 }
 1501 
 1502 
 1503 /*
 1504  * stress_cpu_sieve()
 1505  *  slightly optimised Sieve of Eratosthenes
 1506  */
 1507 static void HOT OPTIMIZE3 stress_cpu_sieve(const char *name)
 1508 {
 1509     const uint32_t nsqrt = sqrt(SIEVE_SIZE);
 1510     static uint32_t sieve[(SIEVE_SIZE + 31) / 32];
 1511     uint32_t i, j;
 1512 
 1513     (void)memset(sieve, 0xff, sizeof(sieve));
 1514     for (i = 2; i < nsqrt; i++)
 1515         if (STRESS_GETBIT(sieve, i))
 1516             for (j = i * i; j < SIEVE_SIZE; j += i)
 1517                 STRESS_CLRBIT(sieve, j);
 1518 
 1519     /* And count up number of primes */
 1520     for (j = 0, i = 2; i < SIEVE_SIZE; i++) {
 1521         if (STRESS_GETBIT(sieve, i))
 1522             j++;
 1523     }
 1524     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (j != 664579))
 1525         pr_fail("%s: sieve error detected, number of "
 1526             "primes has been miscalculated\n", name);
 1527 }
 1528 
 1529 /*
 1530  *  is_prime()
 1531  *  return true if n is prime
 1532  *  http://en.wikipedia.org/wiki/Primality_test
 1533  */
 1534 static inline HOT OPTIMIZE3 int is_prime(uint32_t n)
 1535 {
 1536     register uint32_t i, max;
 1537 
 1538     if (UNLIKELY(n <= 3))
 1539         return n >= 2;
 1540     if ((n % 2 == 0) || (n % 3 == 0))
 1541         return 0;
 1542     max = sqrt(n) + 1;
 1543     for (i = 5; i < max; i+= 6)
 1544         if ((n % i == 0) || (n % (i + 2) == 0))
 1545             return 0;
 1546     return 1;
 1547 }
 1548 
 1549 /*
 1550  *  stress_cpu_prime()
 1551  *
 1552  */
 1553 static void stress_cpu_prime(const char *name)
 1554 {
 1555     uint32_t i, nprimes = 0;
 1556 
 1557     for (i = 0; i < 1000000; i++) {
 1558         nprimes += is_prime(i);
 1559     }
 1560 
 1561     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (nprimes != 78498))
 1562         pr_fail("%s: prime error detected, number of primes "
 1563             "between 0 and 1000000 miscalculated\n", name);
 1564 }
 1565 
 1566 /*
 1567  *  stress_cpu_gray()
 1568  *  compute gray codes
 1569  */
 1570 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_gray(const char *name)
 1571 {
 1572     register uint32_t i;
 1573     register uint64_t sum = 0;
 1574 
 1575     for (i = 0; i < 0x10000; i++) {
 1576         register uint32_t gray_code;
 1577 
 1578         /* Binary to Gray code */
 1579         gray_code = (i >> 1) ^ i;
 1580         sum += gray_code;
 1581 
 1582         /* Gray code back to binary */
 1583 #if 0
 1584         {
 1585             /* Slow iterative method */
 1586             register uint32_t mask;
 1587 
 1588             for (mask = gray_code >> 1; mask; mask >>= 1)
 1589                 gray_code ^= mask;
 1590         }
 1591 #else
 1592         /* Fast non-loop method */
 1593         gray_code ^= (gray_code >> 1);
 1594         gray_code ^= (gray_code >> 2);
 1595         gray_code ^= (gray_code >> 4);
 1596         gray_code ^= (gray_code >> 8);
 1597         gray_code ^= (gray_code >> 16);
 1598 #endif
 1599         sum += gray_code;
 1600     }
 1601     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (sum != 0xffff0000))
 1602         pr_fail("%s: gray code error detected, sum of gray "
 1603             "codes between 0x00000 and 0x10000 miscalculated\n",
 1604             name);
 1605 }
 1606 
 1607 /*
 1608  * hanoi()
 1609  *  do a Hanoi move
 1610  */
 1611 static uint32_t HOT hanoi(
 1612     const uint16_t n,
 1613     const char p1,
 1614     const char p2,
 1615     const char p3)
 1616 {
 1617     if (UNLIKELY(n == 0)) {
 1618         /* Move p1 -> p2 */
 1619         return 1;
 1620     } else {
 1621         uint32_t m = hanoi(n - 1, p1, p3, p2);
 1622         /* Move p1 -> p2 */
 1623         m += hanoi(n - 1, p3, p2, p1);
 1624         return m;
 1625     }
 1626 }
 1627 
 1628 /*
 1629  *  stress_cpu_hanoi
 1630  *  stress with recursive Towers of Hanoi
 1631  */
 1632 static void stress_cpu_hanoi(const char *name)
 1633 {
 1634     uint32_t n = hanoi(20, 'X', 'Y', 'Z');
 1635 
 1636     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n != 1048576))
 1637         pr_fail("%s: number of hanoi moves different from "
 1638             "the expected number\n", name);
 1639 
 1640     uint64_put(n);
 1641 }
 1642 
 1643 /*
 1644  *  factorial()
 1645  *  compute n!
 1646  */
 1647 static inline long double HOT OPTIMIZE3 factorial(int n)
 1648 {
 1649     static long double factorials[] = {
 1650         1.0L,
 1651         1.0L,
 1652         2.0L,
 1653         6.0L,
 1654         24.0L,
 1655         120.0L,
 1656         720.0L,
 1657         5040.0L,
 1658         40320.0L,
 1659         362880.0L,
 1660         3628800.0L,
 1661         39916800.0L,
 1662         479001600.0L,
 1663         6227020800.0L,
 1664         87178291200.0L,
 1665         1307674368000.0L,
 1666         20922789888000.0L,
 1667         355687428096000.0L,
 1668         6402373705728000.0L,
 1669         121645100408832000.0L,
 1670         2432902008176640000.0L,
 1671         51090942171709440000.0L,
 1672         1124000727777607680000.0L,
 1673         25852016738884976640000.0L,
 1674         620448401733239439360000.0L,
 1675         15511210043330985984000000.0L,
 1676         403291461126605635592388608.0L,
 1677         10888869450418352161430700032.0L,
 1678         304888344611713860511469666304.0L,
 1679         8841761993739701954695181369344.0L,
 1680         265252859812191058647452510846976.0L,
 1681         8222838654177922818071027836256256.0L,
 1682         263130836933693530178272890760200192.0L
 1683     };
 1684 
 1685     if (n < (int)SIZEOF_ARRAY(factorials))
 1686         return factorials[n];
 1687 
 1688     return roundl(expl(lgammal((long double)(n + 1))));
 1689 }
 1690 
 1691 /*
 1692  *  stress_cpu_pi()
 1693  *  compute pi using the Srinivasa Ramanujan
 1694  *  fast convergence algorithm
 1695  */
 1696 static void HOT OPTIMIZE3 stress_cpu_pi(const char *name)
 1697 {
 1698     long double s = 0.0L, pi = 0.0L, last_pi = 0.0L;
 1699     const long double precision = 1.0e-20L;
 1700     const long double c = 2.0L * sqrtl(2.0L) / 9801.0L;
 1701     const int max_iter = 5;
 1702     int k = 0;
 1703 
 1704     do {
 1705         last_pi = pi;
 1706         s += (factorial(4 * k) *
 1707             ((26390.0L * (long double)k) + 1103)) /
 1708             (powl(factorial(k), 4.0L) * powl(396.0L, 4.0L * k));
 1709         pi = 1 / (s * c);
 1710         k++;
 1711     } while ((k < max_iter) && (fabsl(pi - last_pi) > precision));
 1712 
 1713     /* Quick sanity checks */
 1714     if (g_opt_flags & OPT_FLAGS_VERIFY) {
 1715         if (k >= max_iter)
 1716             pr_fail("%s: number of iterations to compute "
 1717                 "pi was more than expected\n", name);
 1718         if (fabsl(pi - M_PI) > 1.0e-15L)
 1719             pr_fail("%s: accuracy of computed pi is not "
 1720                 "as good as expected\n", name);
 1721     }
 1722 
 1723     double_put(pi);
 1724 }
 1725 
 1726 /*
 1727  *  stress_cpu_omega()
 1728  *  compute the constant omega
 1729  *  See http://en.wikipedia.org/wiki/Omega_constant
 1730  */
 1731 static void HOT OPTIMIZE3 stress_cpu_omega(const char *name)
 1732 {
 1733     long double omega = 0.5L, last_omega = 0.0L;
 1734     const long double precision = 1.0e-20L;
 1735     const int max_iter = 6;
 1736     int n = 0;
 1737 
 1738     /* Omega converges very quickly */
 1739     do {
 1740         last_omega = omega;
 1741         omega = (1 + omega) / (1 + expl(omega));
 1742         n++;
 1743     } while ((n < max_iter) && (fabsl(omega - last_omega) > precision));
 1744 
 1745     if (g_opt_flags & OPT_FLAGS_VERIFY) {
 1746         if (n >= max_iter)
 1747             pr_fail("%s: number of iterations to compute "
 1748                 "omega was more than expected\n", name);
 1749         if (fabsl(omega - OMEGA) > 1.0e-16L)
 1750             pr_fail("%s: accuracy of computed omega is "
 1751                 "not as good as expected\n", name);
 1752     }
 1753 
 1754     double_put(omega);
 1755 }
 1756 
 1757 #define HAMMING(G, i, nybble, code)             \
 1758 {                           \
 1759     int8_t res;                 \
 1760     res = (((G[3] >> i) & (nybble >> 3)) & 1) ^ \
 1761           (((G[2] >> i) & (nybble >> 2)) & 1) ^ \
 1762           (((G[1] >> i) & (nybble >> 1)) & 1) ^ \
 1763           (((G[0] >> i) & (nybble >> 0)) & 1);  \
 1764     code ^= ((res & 1) << i);           \
 1765 }
 1766 
 1767 /*
 1768  *  hamming84()
 1769  *  compute Hamming (8,4) codes
 1770  */
 1771 static uint8_t HOT OPTIMIZE3 hamming84(const uint8_t nybble)
 1772 {
 1773     /*
 1774      * Hamming (8,4) Generator matrix
 1775      * (4 parity bits, 4 data bits)
 1776      *
 1777      *  p1 p2 p3 p4 d1 d2 d3 d4
 1778      *  0  1  1  1  1  0  0  0
 1779      *  1  0  1  1  0  1  0  0
 1780      *  1  1  0  1  0  0  1  0
 1781      *  1  1  1  0  0  0  0  1
 1782      *
 1783      * Where:
 1784      *  d1..d4 = 4 data bits
 1785      *  p1..p4 = 4 parity bits:
 1786      *    p1 = d2 + d3 + d4
 1787      *    p2 = d1 + d3 + d4
 1788      *    p3 = d1 + d2 + d4
 1789      *    p4 = d1 + d2 + d3
 1790      *
 1791      * G[] is reversed to turn G[3-j] into G[j] to save a subtraction
 1792      */
 1793     static const uint8_t G[] = {
 1794         0xf1,   /* 0b11110001 */
 1795         0xd2,   /* 0b11010010 */
 1796         0xb4,   /* 0b10110100 */
 1797         0x78,   /* 0b01111000 */
 1798     };
 1799 
 1800     register uint8_t code = 0;
 1801 
 1802     /* Unrolled 8 bit loop x unrolled 4 bit loop  */
 1803     HAMMING(G, 7, nybble, code);
 1804     HAMMING(G, 6, nybble, code);
 1805     HAMMING(G, 5, nybble, code);
 1806     HAMMING(G, 4, nybble, code);
 1807     HAMMING(G, 3, nybble, code);
 1808     HAMMING(G, 2, nybble, code);
 1809     HAMMING(G, 1, nybble, code);
 1810     HAMMING(G, 0, nybble, code);
 1811 
 1812     return code;
 1813 }
 1814 
 1815 /*
 1816  *  stress_cpu_hamming()
 1817  *  compute hamming code on 65536 x 4 nybbles
 1818  */
 1819 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_hamming(const char *name)
 1820 {
 1821     uint32_t i;
 1822     uint32_t sum = 0;
 1823 
 1824     for (i = 0; i < 65536; i++) {
 1825         uint32_t encoded;
 1826 
 1827         /* 4 x 4 bits to 4 x 8 bits hamming encoded */
 1828         encoded =
 1829               (hamming84((i >> 12) & 0xf) << 24) |
 1830               (hamming84((i >> 8) & 0xf) << 16) |
 1831               (hamming84((i >> 4) & 0xf) << 8) |
 1832               (hamming84((i >> 0) & 0xf) << 0);
 1833         sum += encoded;
 1834     }
 1835 
 1836     if ((g_opt_flags & OPT_FLAGS_VERIFY) && (sum != 0xffff8000))
 1837         pr_fail("%s: hamming error detected, sum of 65536 "
 1838             "hamming codes not correct\n", name);
 1839 }
 1840 
 1841 
/*
 *  stress_cpu_callfunc_func()
 *  recurse n levels deep, passing all arguments on unchanged
 *
 *  At the deepest level (n <= 0) the difference between the
 *  address of that call's own u64arg parameter and p_u64arg is
 *  returned, and this value is handed back up through every
 *  level. The two pointers do not refer to the same array, so
 *  the value is only meaningful as something derived from the
 *  deepest frame; presumably it exists to give the call chain
 *  an observable result.
 */
static ptrdiff_t stress_cpu_callfunc_func(
    ssize_t     n,
    uint64_t    u64arg,
    uint32_t    u32arg,
    uint16_t    u16arg,
    uint8_t     u8arg,
    uint64_t    *p_u64arg,
    uint32_t    *p_u32arg,
    uint16_t    *p_u16arg,
    uint8_t     *p_u8arg)
{
    /* Keep descending until the requested depth is used up */
    if (LIKELY(n > 0))
        return stress_cpu_callfunc_func(n - 1,
            u64arg, u32arg, u16arg, u8arg,
            p_u64arg, p_u32arg, p_u16arg, p_u8arg);
    else
        return &u64arg - p_u64arg;
}
 1860 
 1861 /*
 1862  *  stress_cpu_callfunc()
 1863  *  deep function calls
 1864  */
 1865 static void stress_cpu_callfunc(const char *name)
 1866 {
 1867     uint64_t    u64arg = mwc64();
 1868     uint32_t    u32arg = mwc32();
 1869     uint16_t    u16arg = mwc16();
 1870     uint8_t     u8arg  = mwc8();
 1871     ptrdiff_t   ret;
 1872 
 1873     (void)name;
 1874 
 1875     ret = stress_cpu_callfunc_func(1024,
 1876         u64arg, u32arg, u16arg, u8arg,
 1877         &u64arg, &u32arg, &u16arg, &u8arg);
 1878 
 1879     uint64_put((uint64_t)ret);
 1880 }
 1881 
 1882 
 1883 #define P2(n) n, n^1, n^1, n
 1884 #define P4(n) P2(n), P2(n^1), P2(n^1), P2(n)
 1885 #define P6(n) P4(n), P4(n^1), P4(n^1), P4(n)
 1886 
 1887 static const bool stress_cpu_parity_table[256] = {
 1888     P6(0), P6(1), P6(1), P6(0)
 1889 };
 1890 
 1891 /*
 1892  *  stress_cpu_parity
 1893  *  compute parity different ways
 1894  */
 1895 static void stress_cpu_parity(const char *name)
 1896 {
 1897     uint32_t val = 0x83fb5acf;
 1898     size_t i;
 1899 
 1900     for (i = 0; i < 1000; i++, val++) {
 1901         register uint32_t v, parity, p;
 1902         uint8_t *ptr;
 1903 
 1904         /*
 1905          * Naive way
 1906          */
 1907         v = val;
 1908         parity = 0;
 1909         while (v) {
 1910             if (v & 1)
 1911                 parity = !parity;
 1912             v >>= 1;
 1913         }
 1914 
 1915         /*
 1916          * Naive way with Brian Kernigan's bit counting optimisation
 1917          * https://graphics.stanford.edu/~seander/bithacks.html
 1918          */
 1919         v = val;
 1920         p = 0;
 1921         while (v) {
 1922             p = !p;
 1923             v = v & (v - 1);
 1924         }
 1925         if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
 1926             pr_fail("%s: parity error detected, using "
 1927                 "optimised naive method\n",  name);
 1928 
 1929         /*
 1930          * "Compute parity of a word with a multiply"
 1931          * the Andrew Shapira method,
 1932          * https://graphics.stanford.edu/~seander/bithacks.html
 1933          */
 1934         v = val;
 1935         v ^= v >> 1;
 1936         v ^= v >> 2;
 1937         v = (v & 0x11111111U) * 0x11111111U;
 1938         p = (v >> 28) & 1;
 1939         if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
 1940             pr_fail("%s: parity error detected, using the "
 1941                 "multiply Shapira method\n",  name);
 1942 
 1943         /*
 1944          * "Compute parity in parallel"
 1945          * https://graphics.stanford.edu/~seander/bithacks.html
 1946          */
 1947         v = val;
 1948         v ^= v >> 16;
 1949         v ^= v >> 8;
 1950         v ^= v >> 4;
 1951         v &= 0xf;
 1952         p = (0x6996 >> v) & 1;
 1953         if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
 1954             pr_fail("%s: parity error detected, using "
 1955                 "the parallel method\n",  name);
 1956 
 1957         /*
 1958          * "Compute parity by lookup table"
 1959          * https://graphics.stanford.edu/~seander/bithacks.html
 1960          * Variation #1
 1961          */
 1962         v = val;
 1963         v ^= v >> 16;
 1964         v ^= v >> 8;
 1965         p = stress_cpu_parity_table[v & 0xff];
 1966         if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
 1967             pr_fail("%s: parity error detected, using "
 1968                 "the lookup method, variation 1\n",  name);
 1969 
 1970         /*
 1971          * "Compute parity by lookup table"
 1972          * https://graphics.stanford.edu/~seander/bithacks.html
 1973          * Variation #2
 1974          */
 1975         ptr = (uint8_t *)&val;
 1976         p = stress_cpu_parity_table[ptr[0] ^ ptr[1] ^ ptr[2] ^ ptr[3]];
 1977         if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
 1978             pr_fail("%s: parity error detected, using the "
 1979                 "lookup method, variation 1\n",  name);
 1980     }
 1981 }
 1982 
 1983 /*
 1984  *  stress_cpu_dither
 1985  *  perform 8 bit to 1 bit gray scale
 1986  *  Floyd–Steinberg dither
 1987  */
 1988 static void TARGET_CLONES stress_cpu_dither(const char *name)
 1989 {
 1990     size_t x, y;
 1991 
 1992     (void)name;
 1993 
 1994     /*
 1995      *  Generate some random 8 bit image
 1996      */
 1997     for (y = 0; y < STRESS_CPU_DITHER_Y; y += 8) {
 1998         for (x = 0; x < STRESS_CPU_DITHER_X; x ++) {
 1999             uint64_t v = mwc64();
 2000 
 2001             pixels[x][y + 0] = v;
 2002             v >>= 8;
 2003             pixels[x][y + 1] = v;
 2004             v >>= 8;
 2005             pixels[x][y + 2] = v;
 2006             v >>= 8;
 2007             pixels[x][y + 3] = v;
 2008             v >>= 8;
 2009             pixels[x][y + 4] = v;
 2010             v >>= 8;
 2011             pixels[x][y + 5] = v;
 2012             v >>= 8;
 2013             pixels[x][y + 6] = v;
 2014             v >>= 8;
 2015             pixels[x][y + 7] = v;
 2016         }
 2017     }
 2018 
 2019     /*
 2020      *  ..and dither
 2021      */
 2022     for (y = 0; y < STRESS_CPU_DITHER_Y; y++) {
 2023         for (x = 0; x < STRESS_CPU_DITHER_X; x++) {
 2024             uint8_t pixel = pixels[x][y];
 2025             uint8_t quant = (pixel < 128) ? 0 : 255;
 2026             int32_t error = pixel - quant;
 2027 
 2028             bool xok1 = x < (STRESS_CPU_DITHER_X - 1);
 2029             bool xok2 = x > 0;
 2030             bool yok1 = y < (STRESS_CPU_DITHER_Y - 1);
 2031 
 2032             if (xok1)
 2033                 pixels[x + 1][y] +=
 2034                     (error * 7) >> 4;
 2035             if (xok2 && yok1)
 2036                 pixels[x - 1][y + 1] +=
 2037                     (error * 3) >> 4;
 2038             if (yok1)
 2039                 pixels[x][y + 1] +=
 2040                     (error * 5) >> 4;
 2041             if (xok1 && yok1)
 2042                 pixels[x + 1][y + 1] +=
 2043                     error >> 4;
 2044         }
 2045     }
 2046 }
 2047 
 2048 /*
 2049  *  stress_cpu_union
 2050  *  perform bit field operations on a union
 2051  */
 2052 static void TARGET_CLONES stress_cpu_union(const char *name)
 2053 {
 2054     typedef union {
 2055         struct {
 2056             uint64_t    b1:1;
 2057             uint64_t    b10:10;
 2058             uint64_t    b2:2;
 2059             uint64_t    b9:9;
 2060             uint64_t    b3:3;
 2061             uint64_t    b8:8;
 2062             uint64_t    b4:4;
 2063             uint64_t    b7:7;
 2064             uint64_t    b5:5;
 2065             uint64_t    b6:6;
 2066         } bits64;
 2067         uint64_t    u64:64;
 2068         union {
 2069             uint8_t     b1:1;
 2070             uint8_t     b7:7;
 2071             uint8_t     b8:8;
 2072         } bits8;
 2073         struct {
 2074             uint16_t    b15:15;
 2075             uint16_t    b1:1;
 2076         } bits16;
 2077         struct {
 2078             uint32_t    b10:10;
 2079             uint32_t    b20:20;
 2080             uint32_t    :1;
 2081             uint32_t    b1:1;
 2082         } bits32;
 2083         uint32_t    u32:30;
 2084     } u_t;
 2085 
 2086     static u_t u;
 2087     size_t i;
 2088 
 2089     (void)name;
 2090     for (i = 0; i < 1000; i++) {
 2091         u.bits64.b1 ^= 1;
 2092         u.bits64.b2--;
 2093         u.bits32.b10 ^= ~0;
 2094         u.bits64.b3++;
 2095         u.bits16.b1--;
 2096         u.bits8.b1++;
 2097         u.bits64.b4 *= 2;
 2098         u.bits32.b20 += 3;
 2099         u.u64 += 0x1037fc2ae21ef829ULL;
 2100         u.bits64.b6--;
 2101         u.bits8.b7 *= 3;
 2102         u.bits64.b5 += (u.bits64.b4 << 1);
 2103         u.bits32.b1 ^= 1;
 2104         u.bits64.b7++;
 2105         u.bits8.b8 ^= 0xaa;
 2106         u.bits64.b8--;
 2107         u.bits16.b15 ^= 0xbeef;
 2108         u.bits64.b9++;
 2109         u.bits64.b10 *= 5;
 2110     }
 2111 }
 2112 
 /*
  *  Expected number of N queens solutions, indexed by board
  *  size N.  There is no board of size 0, so index 0 holds an
  *  all-ones placeholder.
  */
 static const uint32_t queens_solutions[] = {
     UINT32_MAX, /*  0, placeholder */
     1,          /*  1 */
     0,          /*  2 */
     0,          /*  3 */
     2,          /*  4 */
     10,         /*  5 */
     4,          /*  6 */
     40,         /*  7 */
     92,         /*  8 */
     352,        /*  9 */
     724,        /* 10 */
     2680,       /* 11 */
     14200       /* 12 */
 };
 2116 
 /*
  *  queens_try
  *  bit mask backtracking N queens solver, see section 2.1 of
  *  http://www.cl.cam.ac.uk/~mr10/backtrk.pdf
  *
  *  left_diag:  squares of the current row attacked along one diagonal
  *  cols:       columns that already hold a queen
  *  right_diag: squares of the current row attacked along the other diagonal
  *  all:        mask with one bit set per board column
  *
  *  returns the number of ways the remaining rows can be completed
  */
 static uint32_t queens_try(
     uint32_t left_diag,
     uint32_t cols,
     uint32_t right_diag,
     uint32_t all)
 {
     uint32_t count = 0;
     uint32_t avail;

     /*
      *  avail holds the unattacked squares of this row; each
      *  pass handles the lowest one and then clears it
      */
     for (avail = all & ~(left_diag | cols | right_diag);
          avail != 0;
          avail &= (avail - 1)) {
         const uint32_t pick = avail & (0U - avail);
         const uint32_t filled = cols | pick;

         if (filled == all) {
             /* Every column is occupied, a complete placement */
             count++;
             continue;
         }
         /* Diagonal attacks move one column sideways per row */
         count += queens_try((left_diag | pick) << 1,
                     filled, (right_diag | pick) >> 1, all);
     }
     return count;
 }
 2141 
 2142 
 2143 /*
 2144  *  stress_cpu_queens
 2145  *  solve the queens problem for sizes 1..12
 2146  */
 2147 static void stress_cpu_queens(const char *name)
 2148 {
 2149     uint32_t all, n;
 2150 
 2151     for (all = 1, n = 1; n < 13; n++) {
 2152         uint32_t solutions = queens_try(0, 0, 0, all);
 2153         if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
 2154             (solutions != queens_solutions[n]))
 2155             pr_fail("%s: queens solution error detected "
 2156                 "on board size %" PRIu32 "\n",
 2157                 name, n);
 2158         all = (all + all) + 1;
 2159     }
 2160 }
 2161 
 2162 /*
 2163  *  stress_cpu_factorial
 2164  *  find factorials from 1..150 using
 2165  *  Stirling's and Ramanujan's Approximations.
 2166  */
 2167 static void stress_cpu_factorial(const char *name)
 2168 {
 2169     int n;
 2170     double f = 1.0;
 2171     const double precision = 1.0e-6;
 2172     const double sqrt_pi = sqrtl(M_PI);
 2173 
 2174     for (n = 1; n < 150; n++) {
 2175         double fact = roundl(expl(lgammal((double)(n + 1))));
 2176         double dn;
 2177 
 2178         f *= (double)n;
 2179 
 2180         /* Stirling */
 2181         if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
 2182             ((f - fact) / fact > precision)) {
 2183             pr_fail("%s: Stirling's approximation of factorial(%d) out of range\n",
 2184                                 name, n);
 2185         }
 2186 
 2187         /* Ramanujan */
 2188         dn = (double)n;
 2189         fact = sqrt_pi * powl((dn / M_E), dn);
 2190         fact *= powl((((((((8 * dn) + 4)) * dn) + 1) * dn) + 1.0/30.0), (1.0/6.0));
 2191         if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
 2192             ((f - fact) / fact > precision)) {
 2193             pr_fail("%s: Ramanujan's approximation of factorial(%d) out of range\n",
 2194                                 name, n);
 2195         }
 2196     }
 2197 }
 2198 
 2199 /*
 2200  *  stress_cpu_stats
 2201  *  Exercise some standard stats computations on random data
 2202  */
 2203 static void stress_cpu_stats(const char *name)
 2204 {
 2205     size_t i;
 2206     double data[STATS_MAX];
 2207     double min, max, am = 0.0, gm, hm = 0.0, stddev = 0.0;
 2208     int64_t expon = 0;
 2209     double mant = 1.0;
 2210     const double inverse_n = 1.0 / (double)STATS_MAX;
 2211 
 2212     for (i = 0; i < STATS_MAX; i++)
 2213         data[i] = ((double)(mwc32() + 1)) / 4294967296.0;
 2214 
 2215     min = max = data[0];
 2216 
 2217     for (i = 0; i < STATS_MAX; i++) {
 2218         double d = data[i];
 2219         double f;
 2220         int e;
 2221 
 2222         f = frexp(d, &e);
 2223         mant *= f;
 2224         expon += e;
 2225 
 2226         if (min > d)
 2227             min = d;
 2228         if (max < d)
 2229             max = d;
 2230 
 2231         am += d;
 2232         hm += 1 / d;
 2233     }
 2234     /* Arithmetic mean (average) */
 2235     am = am / STATS_MAX;
 2236     /* Geometric mean */
 2237     gm = pow(mant, inverse_n) *
 2238          pow(2.0, (double)expon * inverse_n);
 2239     /* Harmonic mean */
 2240     hm = STATS_MAX / hm;
 2241 
 2242     for (i = 0; i < STATS_MAX; i++) {
 2243         double d = data[i] - am;
 2244         stddev += (d * d);
 2245     }
 2246     /* Standard Deviation */
 2247     stddev = sqrt(stddev);
 2248 
 2249     double_put(am);
 2250     double_put(gm);
 2251     double_put(hm);
 2252     double_put(stddev);
 2253 
 2254     if (min > hm)
 2255         pr_fail("%s: stats: minimum %f > harmonic mean %f\n",
 2256             name, min, hm);
 2257     if (hm > gm)
 2258         pr_fail("%s: stats: harmonic mean %f > geometric mean %f\n",
 2259             name, hm, gm);
 2260     if (gm > am)
 2261         pr_fail("%s: stats: geometric mean %f > arithmetic mean %f\n",
 2262             name, gm, am);
 2263     if (am > max)
 2264         pr_fail("%s: stats: arithmetic mean %f > maxiumum %f\n",
 2265             name, am, max);
 2266 }
 2267 
 2268 /*
 2269  *  stress_cpu_all()
 2270  *  iterate over all cpu stressors
 2271  */
 2272 static HOT OPTIMIZE3 void stress_cpu_all(const char *name)
 2273 {
 2274     static int i = 1;   /* Skip over stress_cpu_all */
 2275 
 2276     cpu_methods[i++].func(name);
 2277     if (!cpu_methods[i].func)
 2278         i = 1;
 2279 }
 2280 
 2281 /*
 2282  * Table of cpu stress methods
 2283  */
 2284 static const stress_cpu_method_info_t cpu_methods[] = {
 2285     { "all",        stress_cpu_all },   /* Special "all test */
 2286 
 2287     { "ackermann",      stress_cpu_ackermann },
 2288     { "bitops",     stress_cpu_bitops },
 2289     { "callfunc",       stress_cpu_callfunc },
 2290 #if defined(HAVE_COMPLEX_H) &&      \
 2291     defined(HAVE_COMPLEX) &&        \
 2292     defined(__STDC_IEC_559_COMPLEX__) &&\
 2293     !defined(__UCLIBC__)
 2294     { "cdouble",        stress_cpu_complex_double },
 2295     { "cfloat",     stress_cpu_complex_float },
 2296     { "clongdouble",    stress_cpu_complex_long_double },
 2297 #endif
 2298     { "correlate",      stress_cpu_correlate },
 2299     { "crc16",      stress_cpu_crc16 },
 2300 #if defined(HAVE_FLOAT_DECIMAL32) && !defined(__clang__)
 2301     { "decimal32",      stress_cpu_decimal32 },
 2302 #endif
 2303 #if defined(HAVE_FLOAT_DECIMAL64) && !defined(__clang__)
 2304     { "decimal64",      stress_cpu_decimal64 },
 2305 #endif
 2306 #if defined(HAVE_FLOAT_DECIMAL128) && !defined(__clang__)
 2307     { "decimal128",     stress_cpu_decimal128 },
 2308 #endif
 2309     { "dither",     stress_cpu_dither },
 2310     { "djb2a",      stress_cpu_djb2a },
 2311     { "double",     stress_cpu_double },
 2312     { "euler",      stress_cpu_euler },
 2313     { "explog",     stress_cpu_explog },
 2314 #if defined(HAVE_COMPLEX_H) &&      \
 2315     defined(HAVE_COMPLEX) &&        \
 2316     defined(__STDC_IEC_559_COMPLEX__) &&\
 2317     !defined(__UCLIBC__)
 2318     { "fft",        stress_cpu_fft },
 2319 #endif
 2320     { "factorial",      stress_cpu_factorial },
 2321     { "fibonacci",      stress_cpu_fibonacci },
 2322     { "float",      stress_cpu_float },
 2323 #if defined(HAVE_FLOAT16) && !defined(__clang__)
 2324     { "float16",        stress_cpu_float16 },
 2325 #endif
 2326 #if defined(HAVE_FLOAT32) && !defined(__clang__)
 2327     { "float32",        stress_cpu_float32 },
 2328 #endif
 2329 #if defined(HAVE_FLOAT80) && !defined(__clang__)
 2330     { "float80",        stress_cpu_float80 },
 2331 #endif
 2332 #if defined(HAVE_FLOAT128) && !defined(__clang__)
 2333     { "float128",       stress_cpu_float128 },
 2334 #endif
 2335     { "fnv1a",      stress_cpu_fnv1a },
 2336     { "gamma",      stress_cpu_gamma },
 2337     { "gcd",        stress_cpu_gcd },
 2338     { "gray",       stress_cpu_gray },
 2339     { "hamming",        stress_cpu_hamming },
 2340     { "hanoi",      stress_cpu_hanoi },
 2341     { "hyperbolic",     stress_cpu_hyperbolic },
 2342     { "idct",       stress_cpu_idct },
 2343 #if defined(HAVE_INT128_T)
 2344     { "int128",     stress_cpu_int128 },
 2345 #endif
 2346     { "int64",      stress_cpu_int64 },
 2347     { "int32",      stress_cpu_int32 },
 2348     { "int16",      stress_cpu_int16 },
 2349     { "int8",       stress_cpu_int8 },
 2350 #if defined(HAVE_INT128_T)
 2351     { "int128float",    stress_cpu_int128_float },
 2352     { "int128double",   stress_cpu_int128_double },
 2353     { "int128longdouble",   stress_cpu_int128_longdouble },
 2354 #if defined(HAVE_FLOAT_DECIMAL32) && !defined(__clang__)
 2355     { "int128decimal32",    stress_cpu_int128_decimal32 },
 2356 #endif
 2357 #if defined(HAVE_FLOAT_DECIMAL64) && !defined(__clang__)
 2358     { "int128decimal64",    stress_cpu_int128_decimal64 },
 2359 #endif
 2360 #if defined(HAVE_FLOAT_DECIMAL128) && !defined(__clang__)
 2361     { "int128decimal128",   stress_cpu_int128_decimal128 },
 2362 #endif
 2363 #endif
 2364     { "int64float",     stress_cpu_int64_float },
 2365     { "int64double",    stress_cpu_int64_double },
 2366     { "int64longdouble",    stress_cpu_int64_longdouble },
 2367     { "int32float",     stress_cpu_int32_float },
 2368     { "int32double",    stress_cpu_int32_double },
 2369     { "int32longdouble",    stress_cpu_int32_longdouble },
 2370     { "jenkin",     stress_cpu_jenkin },
 2371     { "jmp",        stress_cpu_jmp },
 2372     { "ln2",        stress_cpu_ln2 },
 2373     { "longdouble",     stress_cpu_longdouble },
 2374     { "loop",       stress_cpu_loop },
 2375     { "matrixprod",     stress_cpu_matrix_prod },
 2376     { "nsqrt",      stress_cpu_nsqrt },
 2377     { "omega",      stress_cpu_omega },
 2378     { "parity",     stress_cpu_parity },
 2379     { "phi",        stress_cpu_phi },
 2380     { "pi",         stress_cpu_pi },
 2381     { "pjw",        stress_cpu_pjw },
 2382     { "prime",      stress_cpu_prime },
 2383     { "psi",        stress_cpu_psi },
 2384     { "queens",     stress_cpu_queens },
 2385     { "rand",       stress_cpu_rand },
 2386     { "rand48",     stress_cpu_rand48 },
 2387     { "rgb",        stress_cpu_rgb },
 2388     { "sdbm",       stress_cpu_sdbm },
 2389     { "sieve",      stress_cpu_sieve },
 2390     { "stats",      stress_cpu_stats },
 2391     { "sqrt",       stress_cpu_sqrt },
 2392     { "trig",       stress_cpu_trig },
 2393     { "union",      stress_cpu_union },
 2394 #if defined(HAVE_COMPLEX_H) &&      \
 2395     defined(HAVE_COMPLEX) &&        \
 2396     defined(__STDC_IEC_559_COMPLEX__) &&\
 2397     !defined(__UCLIBC__)
 2398     { "zeta",       stress_cpu_zeta },
 2399 #endif
 2400     { NULL,         NULL }
 2401 };
 2402 
 2403 /*
 2404  *  stress_set_cpu_method()
 2405  *  set the default cpu stress method
 2406  */
 2407 int stress_set_cpu_method(const char *name)
 2408 {
 2409     stress_cpu_method_info_t const *info;
 2410 
 2411     for (info = cpu_methods; info->func; info++) {
 2412         if (!strcmp(info->name, name)) {
 2413             set_setting("cpu-method", TYPE_ID_UINTPTR_T, &info);
 2414             return 0;
 2415         }
 2416     }
 2417 
 2418     (void)fprintf(stderr, "cpu-method must be one of:");
 2419     for (info = cpu_methods; info->func; info++) {
 2420         (void)fprintf(stderr, " %s", info->name);
 2421     }
 2422     (void)fprintf(stderr, "\n");
 2423 
 2424     return -1;
 2425 }
 2426 
 2427 /*
 2428  *  stress_cpu()
 2429  *  stress CPU by doing floating point math ops
 2430  */
 2431 static int HOT OPTIMIZE3 stress_cpu(const args_t *args)
 2432 {
 2433     double bias;
 2434     const stress_cpu_method_info_t *cpu_method = &cpu_methods[0];
 2435     stress_cpu_func func;
 2436     int32_t cpu_load = 100;
 2437     int32_t cpu_load_slice = -64;
 2438 
 2439     (void)get_setting("cpu-load", &cpu_load);
 2440     (void)get_setting("cpu-load-slice", &cpu_load_slice);
 2441     (void)get_setting("cpu-method", &cpu_method);
 2442 
 2443     func = cpu_method->func;
 2444 
 2445     pr_dbg("%s using method '%s'\n", args->name, cpu_method->name);
 2446 
 2447     /*
 2448      * Normal use case, 100% load, simple spinning on CPU
 2449      */
 2450     if (cpu_load == 100) {
 2451         do {
 2452             (void)func(args->name);
 2453             inc_counter(args);
 2454         } while (keep_stressing());
 2455         return EXIT_SUCCESS;
 2456     }
 2457 
 2458     /*
 2459      * It is unlikely, but somebody may request to do a zero
 2460      * load stress test(!)
 2461      */
 2462     if (cpu_load == 0) {
 2463         (void)sleep((int)g_opt_timeout);
 2464         return EXIT_SUCCESS;
 2465     }
 2466 
 2467     /*
 2468      * More complex percentage CPU utilisation.  This is
 2469      * not intended to be 100% accurate timing, it is good
 2470      * enough for most purposes.
 2471      */
 2472     bias = 0.0;
 2473     do {
 2474         double t, delay;
 2475         double t1, t2, t3;
 2476         struct timeval tv;
 2477 
 2478         t1 = time_now();
 2479         if (cpu_load_slice < 0) {
 2480             /* < 0 specifies number of iterations to do per slice */
 2481             int j;
 2482 
 2483             for (j = 0; j < -cpu_load_slice; j++) {
 2484                 (void)func(args->name);
 2485                 if (!g_keep_stressing_flag)
 2486                     break;
 2487                 inc_counter(args);
 2488             }
 2489             t2 = time_now();
 2490         } else if (cpu_load_slice == 0) {
 2491             /* == 0, random time slices */
 2492             double slice_end = t1 + (((double)mwc16()) / 131072.0);
 2493             do {
 2494                 (void)func(args->name);
 2495                 t2 = time_now();
 2496                 if (!g_keep_stressing_flag)
 2497                     break;
 2498                 inc_counter(args);
 2499             } while (t2 < slice_end);
 2500         } else {
 2501             /* > 0, time slice in milliseconds */
 2502             double slice_end = t1 +
 2503                 ((double)cpu_load_slice / 1000.0);
 2504             do {
 2505                 (void)func(args->name);
 2506                 t2 = time_now();
 2507                 if (!g_keep_stressing_flag)
 2508                     break;
 2509                 inc_counter(args);
 2510             } while (t2 < slice_end);
 2511         }
 2512         t = t2 - t1;
 2513         /* Must not calculate this with zero % load */
 2514         delay = t * (((100.0 / (double)cpu_load)) - 1.0);
 2515         delay -= bias;
 2516 
 2517         tv.tv_sec = delay;
 2518         tv.tv_usec = (delay - tv.tv_sec) * 1000000.0;
 2519         (void)select(0, NULL, NULL, NULL, &tv);
 2520 
 2521         t3 = time_now();
 2522         /* Bias takes account of the time to do the delay */
 2523         bias = (t3 - t2) - delay;
 2524     } while (keep_stressing());
 2525 
 2526     return EXIT_SUCCESS;
 2527 }
 2528 
 /*
  *  stress_cpu_set_default()
  *  select the "all" cpu stress method
  */
 static void stress_cpu_set_default(void)
 {
     /* "all" is always in the method table, the result is not checked */
     (void)stress_set_cpu_method("all");
 }
 2533 
 2534 stressor_info_t stress_cpu_info = {
 2535     .stressor = stress_cpu,
 2536     .set_default = stress_cpu_set_default,
 2537     .class = CLASS_CPU
 2538 };