"Fossies" - the Fresh Open Source Software Archive
Member "hashcat-6.2.6/deps/LZMA-SDK/C/AesOpt.c" (2 Sep 2022, 18601 bytes) of package /linux/privat/hashcat-6.2.6.tar.gz.

/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2021-04-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

  #if defined(__clang__)
    #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
      #define USE_INTEL_AES
        #define ATTRIB_AES __attribute__((__target__("aes")))
      #if (__clang_major__ >= 8)
        #define USE_INTEL_VAES
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
      #endif
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
      #define USE_INTEL_AES
      #ifndef __AES__
        #define ATTRIB_AES __attribute__((__target__("aes")))
      #endif
      #if (__GNUC__ >= 8)
        #define USE_INTEL_VAES
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
      #endif
    #endif
  #elif defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
  #endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#ifndef USE_INTEL_VAES
#define AES_TYPE_keys __m128i
#define AES_TYPE_data __m128i
#endif

#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src);

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src);
#define AVX_XOR(dest, src)    MM_OP(_mm256_xor_si256, dest, src);

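/* Key-schedule layout implied by the arithmetic in this file: p[0] holds the
   IV (or the counter for CTR mode), the UInt32 at (p + 1) holds the number of
   AES rounds divided by 2 (5/6/7 for AES-128/192/256), and the expanded round
   keys start at p[2]. CBC encryption is inherently serial (each block is
   chained through the previous ciphertext block), so it processes one block
   per iteration. */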
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i m = *p;
  const __m128i k0 = p[2];
  const __m128i k1 = p[3];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;
    __m128i temp = *data;
    MM_XOR (temp, k0);
    MM_XOR (m, temp);
    MM_OP_m (_mm_aesenc_si128, k1);
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0]);
    *data = m;
  }
  *p = m;
}

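/* The WOP_n macros expand one operation over n independent block variables
   (m0 .. m7). NUM_WAYS is 8 on x64 and 4 on 32-bit x86 (which has only
   8 xmm registers), presumably so that several independent AES instructions
   can be in flight at once. */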
#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1);
#define WOP_3(op)   WOP_2 (op)  op (m2, 2);
#define WOP_4(op)   WOP_3 (op)  op (m3, 3);
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4);
#define WOP_6(op)   WOP_5 (op)  op (m5, 5);
#define WOP_7(op)   WOP_6 (op)  op (m6, 6);
#define WOP_8(op)   WOP_7 (op)  op (m7, 7);
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0);  WOP_M1(op)


#define DECLARE_VAR(reg, ii)  __m128i reg
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1]);
#endif

#define AVX__DECLARE_VAR(reg, ii)  __m256i reg
#define AVX__LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
#define AVX__STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
#define AVX__XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]));

#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)


#define AVX__AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX__AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX__AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX__AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX__AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one); reg = ctr;
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg);

#define AVX__CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key);
#define AVX__CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg);

#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX__WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }

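/* Loop pattern: WIDE_LOOP_START / WIDE_LOOP_END process the input in groups
   of NUM_WAYS blocks while enough blocks remain; SINGLE_LOOP then handles the
   remaining tail one block at a time. The _AVX variants work on NUM_WAYS * 2
   blocks and broadcast each round key into a 256-bit register first. */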
#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \


#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \


#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


#define NUM_AES_KEYS_MAX 15

#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do {  \


#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }  \

/* MSVC for x86: if we don't call _mm256_zeroupper(), and -arch:IA32 is not
   specified, MSVC can still insert a vzeroupper instruction on its own. */

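/* Unlike CBC encryption, CBC decryption has no serial dependency between
   blocks: each plaintext is Decrypt(C[i]) XOR C[i-1], so the wide loop below
   decrypts NUM_WAYS blocks in parallel. */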
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i iv = *p;
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_XOR, 1)

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}

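/* CTR mode: the counter in p[0] is incremented (in its low 64-bit lane) for
   each block, encrypted, and XORed into the data, so the same function serves
   for both encryption and decryption. */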
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i ctr = *p;
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START);
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);
  }

  p[-2] = ctr;
}



#ifdef USE_INTEL_VAES

#if defined(__clang__) && defined(_MSC_VER)
#define __SSE4_2__
#define __AES__
#define __AVX__
#define __AVX2__
#define __VAES__
#define __AVX512F__
#define __AVX512VL__
#endif

#include <immintrin.h>

#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

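/* VAES path: each __m256i holds two consecutive 128-bit blocks, so the wide
   loop processes NUM_WAYS * 2 blocks per iteration using round keys broadcast
   by WIDE_LOOP_START_AVX; the leftover blocks fall back to the 128-bit
   SINGLE_LOOP. */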
VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX__DECLARE_VAR)
    WOP (AVX__LOAD_data);
    AVX__WOP_KEY (AVX__AES_XOR, 1)

    do
    {
      AVX__WOP_KEY (AVX__AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX__WOP_KEY (AVX__AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]));
    WOP_M1 (AVX__XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];
    WOP (AVX__STORE_data);
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1]);
      MM_OP_m (_mm_aesdec_si128, w[0]);
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1]);
    MM_OP_m (_mm_aesdeclast_si128, w[0]);

    MM_XOR (m, iv);
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


/*
SSE2: _mm_cvtsi32_si128 : movd
AVX:  _mm256_setr_m128i            : vinsertf128
AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256     : vextracti128
      _mm256_broadcastsi128_si256  : vbroadcasti128
*/

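/* 256-bit CTR setup: ctr2 packs two consecutive counter values into one ymm
   register and "two" is the per-step increment, so each wide step advances
   the counter by two blocks; AVX__CTR_LOOP_ENC extracts the last counter back
   into ctr for the single-block tail loop. */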
#define AVX__CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX__CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \

VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i ctr = *p;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX__DECLARE_VAR)
    AVX__WOP_KEY (AVX__CTR_START, 0);

    w += 1;
    do
    {
      AVX__WOP_KEY (AVX__AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX__WOP_KEY (AVX__AES_ENC_LAST, 0)

    WOP (AVX__CTR_END);
  }
  WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one);
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0]);
      MM_OP_m (_mm_aesenc_si128, w[1]);
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0]);
    MM_OP_m (_mm_aesenclast_si128, w[1]);
    MM_XOR (*data, m);
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#pragma message("AES  HW_SW stub was used")

#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte

#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES

#pragma message("VAES HW_SW stub was used")

#define VAES_COMPAT_STUB(name) \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)

#endif // ! USE_INTEL_VAES


#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if defined(__clang__)
    #if (__clang_major__ >= 8) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_AES
    #endif
  #endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")

#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src);
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src);
#define MM_XOR_m( src)        MM_XOR(m, src);

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k);
#define AES_E_MC_m(k)  AES_E_m (k);  MM_OP1_m(vaesmcq_u8);

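/* ARMv8 Crypto extension: vaeseq_u8 performs AddRoundKey + SubBytes +
   ShiftRows and vaesmcq_u8 performs MixColumns, so (unlike x86 AESENC) the
   round key is XORed in at the start of a round, and the final round-key
   addition is a plain veor. */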
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 m = *p;
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];
  const v128 k_z0 = w[2];
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data);
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m  (k_z1);
    MM_XOR_m (k_z0);
    *data = m;
  }
  *p = m;
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1);
#define WOP_3(op)   WOP_2 (op)  op (m2, 2);
#define WOP_4(op)   WOP_3 (op)  op (m3, 3);
#define WOP_5(op)   WOP_4 (op)  op (m4, 4);
#define WOP_6(op)   WOP_5 (op)  op (m5, 5);
#define WOP_7(op)   WOP_6 (op)  op (m6, 6);
#define WOP_8(op)   WOP_7 (op)  op (m7, 7);

  #define NUM_WAYS      8
  #define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0);  WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1]);
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key);

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k);
#define AES_D_IMC_m(k)  AES_D_m (k);  MM_OP1_m (vaesimcq_u8);

#define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii);  reg = vaesimcq_u8(reg)
#define AES_E_MC(  reg, ii)  AES_E (reg, ii);  reg = vaesmcq_u8(reg)

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one);  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg);

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op); }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 iv = *p;
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data);
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1]);
      AES_D_IMC_m (w[0]);
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1]);
    MM_XOR_m (w[0]);
    MM_XOR_m (iv);
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);
  one = vsetq_lane_u64(1, one, 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START);
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END);
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0);
    do
    {
      AES_E_MC_m (w[0]);
      AES_E_MC_m (w[1]);
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0]);
    AES_E_m    (w[1]);
    MM_XOR_m   (w[2]);
    CTR_END (m, 0);
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64