"Fossies" - the Fresh Open Source Software Archive

Member "cryptsetup-2.4.3/lib/crypto_backend/argon2/blake2/blamka-round-opt.h" (13 Jan 2022, 21363 Bytes) of package /linux/misc/cryptsetup-2.4.3.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "blamka-round-opt.h" see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Argon2 reference source code package - reference C implementations
    3  *
    4  * Copyright 2015
    5  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
    6  *
    7  * You may use this work under the terms of a Creative Commons CC0 1.0
    8  * License/Waiver or the Apache Public License 2.0, at your option. The terms of
    9  * these licenses can be found at:
   10  *
   11  * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
   12  * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
   13  *
   14  * You should have received a copy of both of these licenses along with this
   15  * software. If not, they may be obtained at the above URLs.
   16  */
   17 
   18 #ifndef BLAKE_ROUND_MKA_OPT_H
   19 #define BLAKE_ROUND_MKA_OPT_H
   20 
   21 #include "blake2-impl.h"
   22 
   23 #include <emmintrin.h>
   24 #if defined(__SSSE3__)
   25 #include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
   26 #endif
   27 
   28 #if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
   29 #include <x86intrin.h>
   30 #endif
   31 
   32 #if !defined(__AVX512F__)
   33 #if !defined(__AVX2__)
   34 #if !defined(__XOP__)
   35 #if defined(__SSSE3__)
   36 #define r16                                                                    \
   37     (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
   38 #define r24                                                                    \
   39     (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
   40 #define _mm_roti_epi64(x, c)                                                   \
   41     (-(c) == 32)                                                               \
   42         ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \
   43         : (-(c) == 24)                                                         \
   44               ? _mm_shuffle_epi8((x), r24)                                     \
   45               : (-(c) == 16)                                                   \
   46                     ? _mm_shuffle_epi8((x), r16)                               \
   47                     : (-(c) == 63)                                             \
   48                           ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
   49                                           _mm_add_epi64((x), (x)))             \
   50                           : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
   51                                           _mm_slli_epi64((x), 64 - (-(c))))
   52 #else /* defined(__SSE2__) */
   53 #define _mm_roti_epi64(r, c)                                                   \
   54     _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
   55 #endif
   56 #else
   57 #endif
   58 
   59 static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
   60     const __m128i z = _mm_mul_epu32(x, y);
   61     return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
   62 }
   63 
/*
 * First half of the BLAKE2b G function, applied to two register pairs
 * (A0..D0 and A1..D1) in parallel.  BLAKE2's plain additions are
 * replaced by the BlaMka multiply-add fBlaMka().  Rotation amounts here
 * are 32 and 24, passed negated because _mm_roti_epi64 follows XOP's
 * rotate-left convention.
 */
#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
    do {                                                                       \
        A0 = fBlaMka(A0, B0);                                                  \
        A1 = fBlaMka(A1, B1);                                                  \
                                                                               \
        D0 = _mm_xor_si128(D0, A0);                                            \
        D1 = _mm_xor_si128(D1, A1);                                            \
                                                                               \
        D0 = _mm_roti_epi64(D0, -32);                                          \
        D1 = _mm_roti_epi64(D1, -32);                                          \
                                                                               \
        C0 = fBlaMka(C0, D0);                                                  \
        C1 = fBlaMka(C1, D1);                                                  \
                                                                               \
        B0 = _mm_xor_si128(B0, C0);                                            \
        B1 = _mm_xor_si128(B1, C1);                                            \
                                                                               \
        B0 = _mm_roti_epi64(B0, -24);                                          \
        B1 = _mm_roti_epi64(B1, -24);                                          \
    } while ((void)0, 0)
   84 
/*
 * Second half of the BLAKE2b G function (rotations 16 and 63), same
 * two-pair parallel structure and BlaMka multiply-add as G1 above.
 */
#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
    do {                                                                       \
        A0 = fBlaMka(A0, B0);                                                  \
        A1 = fBlaMka(A1, B1);                                                  \
                                                                               \
        D0 = _mm_xor_si128(D0, A0);                                            \
        D1 = _mm_xor_si128(D1, A1);                                            \
                                                                               \
        D0 = _mm_roti_epi64(D0, -16);                                          \
        D1 = _mm_roti_epi64(D1, -16);                                          \
                                                                               \
        C0 = fBlaMka(C0, D0);                                                  \
        C1 = fBlaMka(C1, D1);                                                  \
                                                                               \
        B0 = _mm_xor_si128(B0, C0);                                            \
        B1 = _mm_xor_si128(B1, C1);                                            \
                                                                               \
        B0 = _mm_roti_epi64(B0, -63);                                          \
        B1 = _mm_roti_epi64(B1, -63);                                          \
    } while ((void)0, 0)
  105 
  106 #if defined(__SSSE3__)
/*
 * SSSE3 diagonalization: rotates the B, C and D rows of the 4x4 state
 * (held as the register pairs B0/B1, C0/C1, D0/D1) so the next G calls
 * operate on diagonals.  _mm_alignr_epi8(hi, lo, 8) concatenates the
 * pair and extracts a 128-bit window shifted by one 64-bit lane; the C
 * rows need only a whole-register swap.
 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
    do {                                                                       \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \
        B0 = t0;                                                               \
        B1 = t1;                                                               \
                                                                               \
        t0 = C0;                                                               \
        C0 = C1;                                                               \
        C1 = t0;                                                               \
                                                                               \
        t0 = _mm_alignr_epi8(D1, D0, 8);                                       \
        t1 = _mm_alignr_epi8(D0, D1, 8);                                       \
        D0 = t1;                                                               \
        D1 = t0;                                                               \
    } while ((void)0, 0)
  123 
/*
 * SSSE3 inverse of DIAGONALIZE above: the alignr source operands are
 * exchanged (and the D assignments crossed) so the lane rotations run
 * in the opposite direction, restoring column order.
 */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
    do {                                                                       \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \
        B0 = t0;                                                               \
        B1 = t1;                                                               \
                                                                               \
        t0 = C0;                                                               \
        C0 = C1;                                                               \
        C1 = t0;                                                               \
                                                                               \
        t0 = _mm_alignr_epi8(D0, D1, 8);                                       \
        t1 = _mm_alignr_epi8(D1, D0, 8);                                       \
        D0 = t1;                                                               \
        D1 = t0;                                                               \
    } while ((void)0, 0)
  140 #else /* SSE2 */
/*
 * SSE2 fallback diagonalization: without palignr, the one-lane rotation
 * across each register pair is built from unpackhi/unpacklo_epi64
 * (unpackhi(a, unpacklo(b, b)) yields [a.hi, b.lo]).  t0/t1 save the
 * original D0 and B0, which are overwritten mid-sequence.  The exact
 * statement order is load-bearing.
 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
    do {                                                                       \
        __m128i t0 = D0;                                                       \
        __m128i t1 = B0;                                                       \
        D0 = C0;                                                               \
        C0 = C1;                                                               \
        C1 = D0;                                                               \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));               \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));               \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));               \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));               \
    } while ((void)0, 0)
  153 
/*
 * SSE2 inverse of the DIAGONALIZE fallback: swaps the C registers back
 * and reverses the unpackhi/unpacklo lane rotation on B and D.  t0/t1
 * preserve B0 and D0 before they are clobbered.
 */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
    do {                                                                       \
        __m128i t0, t1;                                                        \
        t0 = C0;                                                               \
        C0 = C1;                                                               \
        C1 = t0;                                                               \
        t0 = B0;                                                               \
        t1 = D0;                                                               \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));               \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));               \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));               \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));               \
    } while ((void)0, 0)
  167 #endif
  168 
/*
 * One full BlaMka round (SSE2/SSSE3/XOP path): column step (G1+G2),
 * diagonalize, diagonal step (G1+G2), undiagonalize.
 */
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \
    do {                                                                       \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
                                                                               \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \
                                                                               \
        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
                                                                               \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
    } while ((void)0, 0)
  181 #else /* __AVX2__ */
  182 
  183 #include <immintrin.h>
  184 
/* 64-bit right-rotations on all four lanes of a __m256i.
 * rotr32: dword swap; rotr24/rotr16: byte shuffle (the same 16-byte mask
 * repeated for each 128-bit half); rotr63: (x >> 63) ^ (x + x), using
 * x + x in place of x << 1. */
#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
  189 
/*
 * First half of the BlaMka G function, AVX2 path (four 64-bit lanes per
 * register).  The BlaMka step a = a + b + 2*low32(a)*low32(b) is written
 * out inline via _mm256_mul_epu32; rotations are 32 then 24.
 * NOTE: the expansion ends with its own ';' (historical upstream quirk);
 * invocation sites in BLAKE2_ROUND_1/2 therefore omit the semicolon.
 */
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while((void)0, 0);
  218 
/*
 * Second half of the BlaMka G function, AVX2 path: same structure as
 * G1_AVX2 but with rotations 16 then 63.  Ends with its own ';' like
 * G1_AVX2 (call sites omit the semicolon).
 */
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);
  245 
/*
 * AVX2 diagonalization for the layout where each __m256i is treated as
 * one full 4-lane row: rotate the B, C and D rows left by 1, 2 and 3
 * lanes respectively with in-register 64-bit permutes, so the next G
 * calls mix diagonals.
 */
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);
  256 
  257 #define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
  258     do { \
  259         __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
  260         __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
  261         B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
  262         B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
  263         \
  264         tmp1 = C0; \
  265         C0 = C1; \
  266         C1 = tmp1; \
  267         \
  268         tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
  269         tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
  270         D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
  271         D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
  272     } while(0);
  273 
/*
 * Inverse of DIAGONALIZE_1: rotate the B, C and D rows back (right by
 * 1, 2 and 3 lanes) to restore column order.
 */
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);
  284 
/*
 * Inverse of DIAGONALIZE_2: same blend + permute construction, but with
 * the B destinations and the D blend masks exchanged relative to
 * DIAGONALIZE_2, undoing the cross-register lane rotation.
 */
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);
  301 
/*
 * Full BlaMka round, AVX2, row-per-register layout (uses the *_1
 * shuffles).  The G*_AVX2 invocations carry their own trailing ';' in
 * the expansion, hence none here.
 */
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do{ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);
  314 
/*
 * Full BlaMka round, AVX2, split/cross-register layout (uses the *_2
 * shuffles).  Same G*_AVX2 semicolon convention as BLAKE2_ROUND_1.
 */
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do{ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);
  327 
  328 #endif /* __AVX2__ */
  329 
  330 #else /* __AVX512F__ */
  331 
  332 #include <immintrin.h>
  333 
/* Rotate each 64-bit lane right by n; AVX-512 has a native rotate. */
#define ror64(x, n) _mm512_ror_epi64((x), (n))
  335 
  336 static __m512i muladd(__m512i x, __m512i y)
  337 {
  338     __m512i z = _mm512_mul_epu32(x, y);
  339     return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
  340 }
  341 
/*
 * First half of the BlaMka G function, AVX-512 path (eight 64-bit lanes
 * per register, two register pairs mixed in parallel).  Uses muladd()
 * for the BlaMka step and native rotates ror64 by 32 then 24.
 */
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
\
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
\
        D0 = ror64(D0, 32); \
        D1 = ror64(D1, 32); \
\
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
\
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
\
        B0 = ror64(B0, 24); \
        B1 = ror64(B1, 24); \
    } while ((void)0, 0)
  362 
/*
 * Second half of the BlaMka G function, AVX-512 path: same structure as
 * G1 above, rotations 16 then 63.
 */
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
\
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
\
        D0 = ror64(D0, 16); \
        D1 = ror64(D1, 16); \
\
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
\
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
\
        B0 = ror64(B0, 63); \
        B1 = ror64(B1, 63); \
    } while ((void)0, 0)
  383 
/*
 * AVX-512 diagonalization: rotate the B, C and D rows left by 1, 2 and
 * 3 lanes within each 256-bit half (_mm512_permutex_epi64 applies the
 * 4-lane pattern to both halves independently).
 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0)
  395 
/*
 * Inverse of the AVX-512 DIAGONALIZE: rotate the B, C and D rows back
 * (right by 1, 2 and 3 lanes) to restore column order.
 */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0)
  407 
/*
 * One full BlaMka round, AVX-512 path: column step, diagonalize,
 * diagonal step, undiagonalize.
 */
#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
  420 
/*
 * Exchange 256-bit halves between two registers: afterwards A0 holds
 * the low 256 bits of both inputs and A1 the high 256 bits
 * (_mm512_shuffle_i64x2 selects 128-bit chunks; 1,0 picks the low pair,
 * 3,2 the high pair).  Involution: applying it twice restores A0/A1.
 */
#define SWAP_HALVES(A0, A1) \
    do { \
        __m512i t0, t1; \
        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
        A0 = t0; \
        A1 = t1; \
    } while((void)0, 0)
  429 
/*
 * Regroup 128-bit quarters across the register pair: swap halves, then
 * permute 64-bit lanes with pattern 0,1,4,5,2,3,6,7 (exchanging the two
 * middle 128-bit quarters of each register).  Undone by
 * UNSWAP_QUARTERS below.
 */
#define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
    } while((void)0, 0)
  436 
/*
 * Inverse of SWAP_QUARTERS: the 0,1,4,5,2,3,6,7 lane permutation is its
 * own inverse, so applying it first and then swapping halves restores
 * the original layout.
 */
#define UNSWAP_QUARTERS(A0, A1) \
    do { \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
        SWAP_HALVES(A0, A1); \
    } while((void)0, 0)
  443 
/*
 * Full BlaMka round, AVX-512, layout 1.  Note the parameter order
 * (A0, C0, B0, D0, ...) deliberately differs from BLAKE2_ROUND's: the
 * caller's registers are relabelled, and SWAP_HALVES before/after
 * rearranges them into the layout BLAKE2_ROUND expects (SWAP_HALVES is
 * an involution, so the trailing swaps restore the caller's layout).
 */
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
    do { \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
    } while ((void)0, 0)
  456 
/*
 * Full BlaMka round, AVX-512, layout 2: regroup 128-bit quarters into
 * BLAKE2_ROUND's expected layout, run the round, then restore the
 * caller's layout with the explicit inverse UNSWAP_QUARTERS.
 */
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        SWAP_QUARTERS(A0, A1); \
        SWAP_QUARTERS(B0, B1); \
        SWAP_QUARTERS(C0, C1); \
        SWAP_QUARTERS(D0, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        UNSWAP_QUARTERS(A0, A1); \
        UNSWAP_QUARTERS(B0, B1); \
        UNSWAP_QUARTERS(C0, C1); \
        UNSWAP_QUARTERS(D0, D1); \
    } while ((void)0, 0)
  469 
  470 #endif /* __AVX512F__ */
  471 #endif /* BLAKE_ROUND_MKA_OPT_H */