"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/Sha2.c" (10 Oct 2018, 22983 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service, "Fossies" has tried to format the requested source page as HTML using (guessed) C/C++ source-code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source-code file here. For more information about "Sha2.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code-changes report: 1.20_Source_vs_1.21_Source.

    1 /*
    2 This code is written by kerukuro for cppcrypto library (http://cppcrypto.sourceforge.net/)
    3 and released into public domain.
    4 */
    5 
    6 /* Modified for VeraCrypt with speed optimization for C implementation */
    7 
    8 #include "Sha2.h"
    9 #include "Common/Endian.h"
   10 #include "Crypto/cpu.h"
   11 #include "Crypto/misc.h"
   12 
/* UEFI boot-loader builds cannot link the external assembly kernels. */
#ifdef _UEFI
#define NO_OPTIMIZED_VERSIONS
#endif

#ifndef NO_OPTIMIZED_VERSIONS

#if defined(__cplusplus)
extern "C"
{
#endif
#if CRYPTOPP_BOOL_X64
    /* Externally assembled SHA-512 kernels (M = message blocks, D = state,
       l = number of 128-byte blocks). */
    void sha512_rorx(const void* M, void* D, uint_64t l);
    void sha512_sse4(const void* M, uint_64t D[8], uint_64t l);
    void sha512_avx(const void* M, void* D, uint_64t l);
#endif

#if CRYPTOPP_BOOL_X64 || ((CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32) && !defined (TC_MACOSX))
    /* Single-block assembly compression routine (one 128-byte block per call). */
    void sha512_compress_nayuki(uint_64t state[8], const uint_8t block[128]);
#endif
#if defined(__cplusplus)
}
#endif

#endif
   37 
/* Signature shared by every SHA-512 compression implementation: fold
   num_blks 128-byte blocks from m into ctx->hash. */
typedef void (*transformFn)(sha512_ctx* ctx, void* m, uint_64t num_blks);

/* Chosen once in sha512_begin() from CPU features; NULL until then. */
transformFn transfunc = NULL;
/* SHA-512 round constants K[0..79] as specified in FIPS 180-4. */
static const uint_64t K[80] = {
    LL(0x428a2f98d728ae22), LL(0x7137449123ef65cd), LL(0xb5c0fbcfec4d3b2f), LL(0xe9b5dba58189dbbc),
    LL(0x3956c25bf348b538), LL(0x59f111f1b605d019), LL(0x923f82a4af194f9b), LL(0xab1c5ed5da6d8118),
    LL(0xd807aa98a3030242), LL(0x12835b0145706fbe), LL(0x243185be4ee4b28c), LL(0x550c7dc3d5ffb4e2),
    LL(0x72be5d74f27b896f), LL(0x80deb1fe3b1696b1), LL(0x9bdc06a725c71235), LL(0xc19bf174cf692694),
    LL(0xe49b69c19ef14ad2), LL(0xefbe4786384f25e3), LL(0x0fc19dc68b8cd5b5), LL(0x240ca1cc77ac9c65),
    LL(0x2de92c6f592b0275), LL(0x4a7484aa6ea6e483), LL(0x5cb0a9dcbd41fbd4), LL(0x76f988da831153b5),
    LL(0x983e5152ee66dfab), LL(0xa831c66d2db43210), LL(0xb00327c898fb213f), LL(0xbf597fc7beef0ee4),
    LL(0xc6e00bf33da88fc2), LL(0xd5a79147930aa725), LL(0x06ca6351e003826f), LL(0x142929670a0e6e70),
    LL(0x27b70a8546d22ffc), LL(0x2e1b21385c26c926), LL(0x4d2c6dfc5ac42aed), LL(0x53380d139d95b3df),
    LL(0x650a73548baf63de), LL(0x766a0abb3c77b2a8), LL(0x81c2c92e47edaee6), LL(0x92722c851482353b),
    LL(0xa2bfe8a14cf10364), LL(0xa81a664bbc423001), LL(0xc24b8b70d0f89791), LL(0xc76c51a30654be30),
    LL(0xd192e819d6ef5218), LL(0xd69906245565a910), LL(0xf40e35855771202a), LL(0x106aa07032bbd1b8),
    LL(0x19a4c116b8d2d0c8), LL(0x1e376c085141ab53), LL(0x2748774cdf8eeb99), LL(0x34b0bcb5e19b48a8),
    LL(0x391c0cb3c5c95a63), LL(0x4ed8aa4ae3418acb), LL(0x5b9cca4f7763e373), LL(0x682e6ff3d6b2b8a3),
    LL(0x748f82ee5defb2fc), LL(0x78a5636f43172f60), LL(0x84c87814a1f0ab72), LL(0x8cc702081a6439ec),
    LL(0x90befffa23631e28), LL(0xa4506cebde82bde9), LL(0xbef9a3f7b2c67915), LL(0xc67178f2e372532b),
    LL(0xca273eceea26619c), LL(0xd186b8c721c0c207), LL(0xeada7dd6cde0eb1e), LL(0xf57d4f7fee6ed178),
    LL(0x06f067aa72176fba), LL(0x0a637dc5a2c898a6), LL(0x113f9804bef90dae), LL(0x1b710b35131c471b),
    LL(0x28db77f523047d84), LL(0x32caab7b40c72493), LL(0x3c9ebe0a15c9bebc), LL(0x431d67c49c100d4c),
    LL(0x4cc5d4becb3e42b6), LL(0x597f299cfc657e2a), LL(0x5fcb6fab3ad6faec), LL(0x6c44198c4a475817)
};
   64 
   65 
/* Ch and Maj from FIPS 180-4, written in the xor/and form that uses fewer
   operations than the textbook definitions. */
#define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
/* SHA-512 big-sigma (round) and small-sigma (message schedule) functions.
   Redefined further down for the 32-bit SHA-256 variants. */
#define sum0(x)         (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
#define sum1(x)         (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
#define sigma0(x)       (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7))
#define sigma1(x)       (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6))

/* Message-schedule update: W is a rolling 16-word window indexed mod 16;
   updates W[j mod 16] in place and evaluates to the new value. */
#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15]))

/* One compression round with register rotation.  For the first 16 rounds
   (i == 0) the schedule word is read directly from W; afterwards WU()
   extends the schedule on the fly. */
#define COMPRESS_ROUND(i, j, K) \
           T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? WU(j): W[j]); \
            T2 = sum0(a) + Maj(a, b, c); \
            h = g; \
            g = f; \
            f = e; \
            e = d + T1; \
            d = c; \
            c = b; \
            b = a; \
            a = T1 + T2;
   86 
/*
 * Portable C implementation of the SHA-512 compression function.
 * Consumes num_blks consecutive 128-byte blocks from mp and folds each
 * into ctx->hash.  Fallback used when no assembly kernel was selected.
 */
void StdTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
{
    uint_64t blk;
    for (blk = 0; blk < num_blks; blk++)
    {
        uint_64t W[16];             /* rolling 16-word message schedule */
        uint_64t a,b,c,d,e,f,g,h;   /* working variables, FIPS 180-4 naming */
        uint_64t T1, T2;
        int i;
#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
        int   j;    /* debug driver builds loop instead of unrolling (code size) */
#endif

        /* Load this block's 16 words, converting from big-endian. */
        for (i = 0; i < 128 / 8; i++)
        {
            W[i] = bswap_64((((const uint_64t*)(mp))[blk * 16 + i]));
        }

        a = ctx->hash[0];
        b = ctx->hash[1];
        c = ctx->hash[2];
        d = ctx->hash[3];
        e = ctx->hash[4];
        f = ctx->hash[5];
        g = ctx->hash[6];
        h = ctx->hash[7];

        /* 80 rounds, unrolled 16 at a time to match the schedule window. */
        for (i = 0; i <= 79; i+=16)
        {
#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
            for (j = 0; j < 16; j++)
            {
                COMPRESS_ROUND(i, j, K);
            }
#else
            COMPRESS_ROUND(i, 0, K);
            COMPRESS_ROUND(i, 1, K);
            COMPRESS_ROUND(i , 2, K);
            COMPRESS_ROUND(i, 3, K);
            COMPRESS_ROUND(i, 4, K);
            COMPRESS_ROUND(i, 5, K);
            COMPRESS_ROUND(i, 6, K);
            COMPRESS_ROUND(i, 7, K);
            COMPRESS_ROUND(i, 8, K);
            COMPRESS_ROUND(i, 9, K);
            COMPRESS_ROUND(i, 10, K);
            COMPRESS_ROUND(i, 11, K);
            COMPRESS_ROUND(i, 12, K);
            COMPRESS_ROUND(i, 13, K);
            COMPRESS_ROUND(i, 14, K);
            COMPRESS_ROUND(i, 15, K);
#endif
        }
        /* Davies-Meyer feed-forward into the chaining state. */
        ctx->hash[0] += a;
        ctx->hash[1] += b;
        ctx->hash[2] += c;
        ctx->hash[3] += d;
        ctx->hash[4] += e;
        ctx->hash[5] += f;
        ctx->hash[6] += g;
        ctx->hash[7] += h;
    }
}
  150 
  151 #ifndef NO_OPTIMIZED_VERSIONS
  152 
  153 #if CRYPTOPP_BOOL_X64
  154 void Avx2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
  155 {
  156     if (num_blks > 1)
  157         sha512_rorx(mp, ctx->hash, num_blks);
  158     else
  159         sha512_sse4(mp, ctx->hash, num_blks);
  160 }
  161 
  162 void AvxTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
  163 {
  164     if (num_blks > 1)
  165         sha512_avx(mp, ctx->hash, num_blks);
  166     else
  167         sha512_sse4(mp, ctx->hash, num_blks);
  168 }
  169 
  170 void SSE4Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
  171 {
  172     sha512_sse4(mp, ctx->hash, num_blks);
  173 }
  174 #endif
  175 
  176 #if CRYPTOPP_BOOL_X64 || ((CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32) && !defined (TC_MACOSX))
  177 
  178 void SSE2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
  179 {
  180     uint_64t i;
  181     for (i = 0; i < num_blks; i++)
  182         sha512_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 128);
  183 }
  184 
  185 #endif
  186 
  187 #endif // NO_OPTIMIZED_VERSIONS
  188 
/*
 * Initialize ctx with the SHA-512 initial hash values (FIPS 180-4) and
 * reset the buffered-byte / total-bit counters.  On the first call it also
 * selects the fastest compression routine the CPU supports and caches the
 * choice in the global 'transfunc' for all later contexts.
 *
 * NOTE: the if/else chain below is deliberately woven through the #if
 * blocks so that each disabled section lets control fall through to the
 * next candidate, ending at the portable StdTransform.
 */
void sha512_begin(sha512_ctx* ctx)
{
    ctx->hash[0] = LL(0x6a09e667f3bcc908);
    ctx->hash[1] = LL(0xbb67ae8584caa73b);
    ctx->hash[2] = LL(0x3c6ef372fe94f82b);
    ctx->hash[3] = LL(0xa54ff53a5f1d36f1);
    ctx->hash[4] = LL(0x510e527fade682d1);
    ctx->hash[5] = LL(0x9b05688c2b3e6c1f);
    ctx->hash[6] = LL(0x1f83d9abfb41bd6b);
    ctx->hash[7] = LL(0x5be0cd19137e2179);
    ctx->count[0] = 0;  /* bytes buffered in wbuf */
    ctx->count[1] = 0;  /* total message length in bits */

    if (!transfunc)
    {
#ifndef NO_OPTIMIZED_VERSIONS
#if CRYPTOPP_BOOL_X64
        /* AVX2/AVX kernels are restricted to Intel CPUs here; the SSE4
           kernel is used on any CPU reporting SSE4.1. */
        if (g_isIntel&& HasSAVX2() && HasSBMI2())
            transfunc = Avx2Transform;
        else if (g_isIntel && HasSAVX())
        {
                transfunc = AvxTransform;
        }
        else if (HasSSE41())
        {
                transfunc = SSE4Transform;
        }
        else
#endif

#if CRYPTOPP_BOOL_X64 || ((CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32) && !defined (TC_MACOSX))
#if CRYPTOPP_BOOL_X64
        if (HasSSE2())
#else
        if (HasSSSE3() && HasMMX())
#endif
                transfunc = SSE2Transform;
        else
#endif

#endif
            transfunc = StdTransform;
    }
}
  233 
  234 void sha512_end(unsigned char * result, sha512_ctx* ctx)
  235 {
  236     int i;
  237     uint_64t mlen, pos = ctx->count[0];
  238     uint_8t* m = (uint_8t*) ctx->wbuf;
  239     m[pos++] = 0x80;
  240     if (pos > 112)
  241     {
  242         memset(m + pos, 0, (size_t) (128 - pos));
  243         transfunc(ctx, m, 1);
  244         pos = 0;
  245     }
  246     memset(m + pos, 0, (size_t) (128 - pos));
  247     mlen = bswap_64(ctx->count[1]);
  248     memcpy(m + (128 - 8), &mlen, 64 / 8);
  249     transfunc(ctx, m, 1);
  250     for (i = 0; i < 8; i++)
  251     {
  252         ctx->hash[i] = bswap_64(ctx->hash[i]);
  253     }
  254     memcpy(result, ctx->hash, 64);
  255 }
  256 
  257 void sha512_hash(const unsigned char * data, uint_64t len, sha512_ctx *ctx)
  258 {
  259     uint_64t pos = ctx->count[0];
  260     uint_64t total = ctx->count[1];
  261     uint_8t* m = (uint_8t*) ctx->wbuf;
  262     if (pos && pos + len >= 128)
  263     {
  264         memcpy(m + pos, data, (size_t) (128 - pos));
  265         transfunc(ctx, m, 1);
  266         len -= 128 - pos;
  267         total += (128 - pos) * 8;
  268         data += 128 - pos;
  269         pos = 0;
  270     }
  271     if (len >= 128)
  272     {
  273         uint_64t blocks = len / 128;
  274         uint_64t bytes = blocks * 128;
  275         transfunc(ctx, (void*)data, blocks);
  276         len -= bytes;
  277         total += (bytes)* 8;
  278         data += bytes;
  279     }
  280     memcpy(m+pos, data, (size_t) (len));
  281     pos += len;
  282     total += len * 8;
  283     ctx->count[0] = pos;
  284     ctx->count[1] = total;
  285 }
  286 
  287 void sha512(unsigned char * result, const unsigned char* source, uint_64t sourceLen)
  288 {
  289     sha512_ctx  ctx;
  290 
  291     sha512_begin(&ctx);
  292     sha512_hash(source, sourceLen, &ctx);
  293     sha512_end(result, &ctx);
  294 }
  295 
  296 /////////////////////////////
  297 
/* Prototypes of the externally assembled SHA-256 kernels. */
#ifndef NO_OPTIMIZED_VERSIONS

#if defined(__cplusplus)
extern "C"
{
#endif

#if CRYPTOPP_BOOL_X64
    /* 64-bit multi-block kernels (SSE4 / AVX2-rorx / AVX variants). */
    void sha256_sse4(void *input_data, uint_32t digest[8], uint_64t num_blks);
    void sha256_rorx(void *input_data, uint_32t digest[8], uint_64t num_blks);
    void sha256_avx(void *input_data, uint_32t digest[8], uint_64t num_blks);
#endif

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    /* Single-block assembly compression routine for 32-bit builds. */
    void sha256_compress_nayuki(uint_32t state[8], const uint_8t block[64]);
#endif

#if defined(__cplusplus)
}
#endif

#endif
  320 
/* SHA-256 round constants (FIPS 180-4), 16-byte aligned so the inline
   assembly below can address the table with SSE-friendly alignment. */
CRYPTOPP_ALIGN_DATA(16) uint_32t SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };
  331 
  332 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
  333 
  334 #ifdef _MSC_VER
  335 # pragma warning(disable: 4100 4731)
  336 #endif
  337 
/*
 * Hand-written x86/x32/x64 inline-assembly SHA-256 block loop (Crypto++
 * lineage).  Processes len bytes (a multiple of 64) from data and folds
 * them into state[8].  The working variables A..H and the 16-word message
 * schedule live in an aligned stack area addressed via BASE; on 32-bit
 * builds without SSE2 an alternate scalar path (labels 2/4) is used.
 * The macros below are local to this function and intentionally left
 * undisturbed; comments are only added on stand-alone lines, never inside
 * a backslash-continued macro body.
 */
static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(uint_32t *state, const uint_32t *data, size_t len)
{
    /* Stack frame layout: 8 state words + 16 schedule words + 4 saved
       pointers; A..H rotate through the 8 slots via ASM_MOD indexing. */
    #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
    #define H(i)        [BASE+ASM_MOD(1024+7-(i),8)*4]
    #define G(i)        H(i+1)
    #define F(i)        H(i+2)
    #define E(i)        H(i+3)
    #define D(i)        H(i+4)
    #define C(i)        H(i+5)
    #define B(i)        H(i+6)
    #define A(i)        H(i+7)
    #define Wt(i)       BASE+8*4+ASM_MOD(1024+15-(i),16)*4
    #define Wt_2(i)     Wt((i)-2)
    #define Wt_15(i)    Wt((i)-15)
    #define Wt_7(i)     Wt((i)-7)
    #define K_END       [BASE+8*4+16*4+0*WORD_SZ]
    #define STATE_SAVE  [BASE+8*4+16*4+1*WORD_SZ]
    #define DATA_SAVE   [BASE+8*4+16*4+2*WORD_SZ]
    #define DATA_END    [BASE+8*4+16*4+3*WORD_SZ]
    #define Kt(i)       WORD_REG(si)+(i)*4
#if CRYPTOPP_BOOL_X32
    #define BASE        esp+8
#elif CRYPTOPP_BOOL_X86
    #define BASE        esp+4
#elif defined(__GNUC__)
    #define BASE        r8
#else
    #define BASE        rsp
#endif

/* RA0/RB1: round helpers for the first 16 rounds (schedule read directly)
   vs. later rounds (schedule extended in place).  RA1/RB0 are empty. */
#define RA0(i, edx, edi)        \
    AS2(    add edx, [Kt(i)]    )\
    AS2(    add edx, [Wt(i)]    )\
    AS2(    add edx, H(i)       )\

#define RA1(i, edx, edi)

#define RB0(i, edx, edi)

#define RB1(i, edx, edi)    \
    AS2(    mov AS_REG_7d, [Wt_2(i)]    )\
    AS2(    mov edi, [Wt_15(i)])\
    AS2(    mov ebx, AS_REG_7d  )\
    AS2(    shr AS_REG_7d, 10       )\
    AS2(    ror ebx, 17     )\
    AS2(    xor AS_REG_7d, ebx  )\
    AS2(    ror ebx, 2      )\
    AS2(    xor ebx, AS_REG_7d  )/* s1(W_t-2) */\
    AS2(    add ebx, [Wt_7(i)])\
    AS2(    mov AS_REG_7d, edi  )\
    AS2(    shr AS_REG_7d, 3        )\
    AS2(    ror edi, 7      )\
    AS2(    add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
    AS2(    xor AS_REG_7d, edi  )\
    AS2(    add edx, [Kt(i)])\
    AS2(    ror edi, 11     )\
    AS2(    add edx, H(i)   )\
    AS2(    xor AS_REG_7d, edi  )/* s0(W_t-15) */\
    AS2(    add AS_REG_7d, ebx  )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\
    AS2(    mov [Wt(i)], AS_REG_7d)\
    AS2(    add edx, AS_REG_7d  )\

/* One full SHA-256 round; r selects the RA/RB helper pair above. */
#define ROUND(i, r, eax, ecx, edi, edx)\
    /* in: edi = E  */\
    /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
    AS2(    mov edx, F(i)   )\
    AS2(    xor edx, G(i)   )\
    AS2(    and edx, edi    )\
    AS2(    xor edx, G(i)   )/* Ch(E,F,G) = (G^(E&(F^G))) */\
    AS2(    mov AS_REG_7d, edi  )\
    AS2(    ror edi, 6      )\
    AS2(    ror AS_REG_7d, 25       )\
    RA##r(i, edx, edi       )/* H + Wt + Kt + Ch(E,F,G) */\
    AS2(    xor AS_REG_7d, edi  )\
    AS2(    ror edi, 5      )\
    AS2(    xor AS_REG_7d, edi  )/* S1(E) */\
    AS2(    add edx, AS_REG_7d  )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
    RB##r(i, edx, edi       )/* H + Wt + Kt + Ch(E,F,G) */\
    /* in: ecx = A, eax = B^C, edx = T1 */\
    /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
    AS2(    mov ebx, ecx    )\
    AS2(    xor ecx, B(i)   )/* A^B */\
    AS2(    and eax, ecx    )\
    AS2(    xor eax, B(i)   )/* Maj(A,B,C) = B^((A^B)&(B^C) */\
    AS2(    mov AS_REG_7d, ebx  )\
    AS2(    ror ebx, 2      )\
    AS2(    add eax, edx    )/* T1 + Maj(A,B,C) */\
    AS2(    add edx, D(i)   )\
    AS2(    mov D(i), edx   )\
    AS2(    ror AS_REG_7d, 22       )\
    AS2(    xor AS_REG_7d, ebx  )\
    AS2(    ror ebx, 11     )\
    AS2(    xor AS_REG_7d, ebx  )\
    AS2(    add eax, AS_REG_7d  )/* T1 + S0(A) + Maj(A,B,C) */\
    AS2(    mov H(i), eax   )\

// Unroll the use of CRYPTOPP_BOOL_X64 in assembler math. The GAS assembler on X32 (version 2.25)
//   complains "Error: invalid operands (*ABS* and *UND* sections) for `*` and `-`"
#if CRYPTOPP_BOOL_X64
#define SWAP_COPY(i)        \
    AS2(    mov     WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
    AS1(    bswap   WORD_REG(bx))\
    AS2(    mov     [Wt(i*2+1)], WORD_REG(bx))
#else // X86 and X32
#define SWAP_COPY(i)        \
    AS2(    mov     WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
    AS1(    bswap   WORD_REG(bx))\
    AS2(    mov     [Wt(i)], WORD_REG(bx))
#endif

/* GCC path: the locals area is a C array passed into the asm via %4/r8. */
#if defined(__GNUC__)
    #if CRYPTOPP_BOOL_X64
        CRYPTOPP_ALIGN_DATA(16) byte workspace[LOCALS_SIZE] ;
    #endif
    __asm__ __volatile__
    (
    #if CRYPTOPP_BOOL_X64
        "lea %4, %%r8;"
    #endif
    INTEL_NOPREFIX
#endif

/* 32-bit prologue: align the stack to 16 and carve out the locals area. */
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    #ifndef __GNUC__
        AS2(    mov     edi, [len])
        AS2(    lea     WORD_REG(si), [SHA256_K+48*4])
    #endif
    #if !defined(_MSC_VER) || (_MSC_VER < 1400)
        AS_PUSH_IF86(bx)
    #endif

    AS_PUSH_IF86(bp)
    AS2(    mov     ebx, esp)
    AS2(    and     esp, -16)
    AS2(    sub     WORD_REG(sp), LOCALS_SIZE)
    AS_PUSH_IF86(bx)
#endif
    AS2(    mov     STATE_SAVE, WORD_REG(cx))
    AS2(    mov     DATA_SAVE, WORD_REG(dx))
    AS2(    lea     WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)])
    AS2(    mov     DATA_END, WORD_REG(ax))
    AS2(    mov     K_END, WORD_REG(si))

/* Load the state into xmm0/xmm1 (SSE2 path); the low bit of K_END encodes
   which path (SSE2 vs scalar) was taken on 32-bit builds. */
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2(    test    edi, 1)
    ASJ(    jnz,    2, f)
    AS1(    dec     DWORD PTR K_END)
#endif
    AS2(    movdqu  xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
    AS2(    movdqu  xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
#endif

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    ASJ(    jmp,    0, f)
#endif
    ASL(2)  // non-SSE2
    AS2(    mov     esi, ecx)
    AS2(    lea     edi, A(0))
    AS2(    mov     ecx, 8)
ATT_NOPREFIX
    AS1(    rep movsd)
INTEL_NOPREFIX
    AS2(    mov     esi, K_END)
    ASJ(    jmp,    3, f)
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    ASL(0)
    AS2(    movdqu  E(0), xmm1)
    AS2(    movdqu  A(0), xmm0)
#endif
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    ASL(3)
#endif
    AS2(    sub     WORD_REG(si), 48*4)
    SWAP_COPY(0)    SWAP_COPY(1)    SWAP_COPY(2)    SWAP_COPY(3)
    SWAP_COPY(4)    SWAP_COPY(5)    SWAP_COPY(6)    SWAP_COPY(7)
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    SWAP_COPY(8)    SWAP_COPY(9)    SWAP_COPY(10)   SWAP_COPY(11)
    SWAP_COPY(12)   SWAP_COPY(13)   SWAP_COPY(14)   SWAP_COPY(15)
#endif
    AS2(    mov     edi, E(0))  // E
    AS2(    mov     eax, B(0))  // B
    AS2(    xor     eax, C(0))  // B^C
    AS2(    mov     ecx, A(0))  // A

    ROUND(0, 0, eax, ecx, edi, edx)
    ROUND(1, 0, ecx, eax, edx, edi)
    ROUND(2, 0, eax, ecx, edi, edx)
    ROUND(3, 0, ecx, eax, edx, edi)
    ROUND(4, 0, eax, ecx, edi, edx)
    ROUND(5, 0, ecx, eax, edx, edi)
    ROUND(6, 0, eax, ecx, edi, edx)
    ROUND(7, 0, ecx, eax, edx, edi)
    ROUND(8, 0, eax, ecx, edi, edx)
    ROUND(9, 0, ecx, eax, edx, edi)
    ROUND(10, 0, eax, ecx, edi, edx)
    ROUND(11, 0, ecx, eax, edx, edi)
    ROUND(12, 0, eax, ecx, edi, edx)
    ROUND(13, 0, ecx, eax, edx, edi)
    ROUND(14, 0, eax, ecx, edi, edx)
    ROUND(15, 0, ecx, eax, edx, edi)

    ASL(1)
    AS2(add WORD_REG(si), 4*16)
    ROUND(0, 1, eax, ecx, edi, edx)
    ROUND(1, 1, ecx, eax, edx, edi)
    ROUND(2, 1, eax, ecx, edi, edx)
    ROUND(3, 1, ecx, eax, edx, edi)
    ROUND(4, 1, eax, ecx, edi, edx)
    ROUND(5, 1, ecx, eax, edx, edi)
    ROUND(6, 1, eax, ecx, edi, edx)
    ROUND(7, 1, ecx, eax, edx, edi)
    ROUND(8, 1, eax, ecx, edi, edx)
    ROUND(9, 1, ecx, eax, edx, edi)
    ROUND(10, 1, eax, ecx, edi, edx)
    ROUND(11, 1, ecx, eax, edx, edi)
    ROUND(12, 1, eax, ecx, edi, edx)
    ROUND(13, 1, ecx, eax, edx, edi)
    ROUND(14, 1, eax, ecx, edi, edx)
    ROUND(15, 1, ecx, eax, edx, edi)
    AS2(    cmp     WORD_REG(si), K_END)
    ATT_NOPREFIX
    ASJ(    jb,     1, b)
    INTEL_NOPREFIX

    AS2(    mov     WORD_REG(dx), DATA_SAVE)
    AS2(    add     WORD_REG(dx), 64)
    AS2(    mov     AS_REG_7, STATE_SAVE)
    AS2(    mov     DATA_SAVE, WORD_REG(dx))

/* Add the block result back into the saved state and loop while more data. */
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
    AS2(    test    DWORD PTR K_END, 1)
    ASJ(    jz,     4, f)
#endif
    AS2(    movdqu  xmm1, XMMWORD_PTR [AS_REG_7+1*16])
    AS2(    movdqu  xmm0, XMMWORD_PTR [AS_REG_7+0*16])
    AS2(    paddd   xmm1, E(0))
    AS2(    paddd   xmm0, A(0))
    AS2(    movdqu  [AS_REG_7+1*16], xmm1)
    AS2(    movdqu  [AS_REG_7+0*16], xmm0)
    AS2(    cmp     WORD_REG(dx), DATA_END)
    ATT_NOPREFIX
    ASJ(    jb,     0, b)
    INTEL_NOPREFIX
#endif

#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    ASJ(    jmp,    5, f)
    ASL(4)  // non-SSE2
#endif
    AS2(    add     [AS_REG_7+0*4], ecx)    // A
    AS2(    add     [AS_REG_7+4*4], edi)    // E
    AS2(    mov     eax, B(0))
    AS2(    mov     ebx, C(0))
    AS2(    mov     ecx, D(0))
    AS2(    add     [AS_REG_7+1*4], eax)
    AS2(    add     [AS_REG_7+2*4], ebx)
    AS2(    add     [AS_REG_7+3*4], ecx)
    AS2(    mov     eax, F(0))
    AS2(    mov     ebx, G(0))
    AS2(    mov     ecx, H(0))
    AS2(    add     [AS_REG_7+5*4], eax)
    AS2(    add     [AS_REG_7+6*4], ebx)
    AS2(    add     [AS_REG_7+7*4], ecx)
    AS2(    mov     ecx, AS_REG_7d)
    AS2(    cmp     WORD_REG(dx), DATA_END)
    ASJ(    jb,     2, b)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
    ASL(5)
#endif
#endif

/* Epilogue: restore the 32-bit stack frame and saved registers. */
    AS_POP_IF86(sp)
    AS_POP_IF86(bp)
    #if !defined(_MSC_VER) || (_MSC_VER < 1400)
        AS_POP_IF86(bx)
    #endif

#ifdef __GNUC__
    ATT_PREFIX
    :
    : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
    #if CRYPTOPP_BOOL_X64
        , "m" (workspace[0])
    #endif
    : "memory", "cc", "%eax"
    #if CRYPTOPP_BOOL_X64
        , "%rbx", "%r8", "%r10"
    #endif
    );
#endif
}
  635 
  636 #endif  // (defined(CRYPTOPP_X86_ASM_AVAILABLE))
  637 
/* Switch the sigma/sum helpers from their 64-bit SHA-512 definitions to
   the 32-bit SHA-256 ones used by StdSha256Transform below. */
#undef sum0
#undef sum1
#undef sigma0
#undef sigma1

#define sum0(x)     (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
#define sum1(x)     (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
#define sigma0(x)   (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3))
#define sigma1(x)   (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))


/* Signature shared by every SHA-256 compression implementation: fold
   num_blks 64-byte blocks from m into ctx->hash. */
typedef void (*sha256transformFn)(sha256_ctx* ctx, void* m, uint_64t num_blks);

/* Chosen once in sha256_begin() from CPU features; NULL until then. */
sha256transformFn sha256transfunc = NULL;
  652 
/*
 * Portable C implementation of the SHA-256 compression function.
 * Consumes num_blks consecutive 64-byte blocks from mp and folds each
 * into ctx->hash.  Fallback used when no assembly kernel was selected.
 */
void StdSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
{
    uint_64t blk;
    for (blk = 0; blk < num_blks; blk++)
    {
        uint_32t W[16];             /* rolling 16-word message schedule */
        uint_32t a,b,c,d,e,f,g,h;   /* working variables, FIPS 180-4 naming */
        uint_32t T1, T2;
        int i;
#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
        int   j;    /* debug driver builds loop instead of unrolling (code size) */
#endif

        /* Load this block's 16 words, converting from big-endian. */
        for (i = 0; i < 64 / 4; i++)
        {
            W[i] = bswap_32((((const uint_32t*)(mp))[blk * 16 + i]));
        }

        a = ctx->hash[0];
        b = ctx->hash[1];
        c = ctx->hash[2];
        d = ctx->hash[3];
        e = ctx->hash[4];
        f = ctx->hash[5];
        g = ctx->hash[6];
        h = ctx->hash[7];

        /* 64 rounds, unrolled 16 at a time to match the schedule window. */
        for (i = 0; i <= 63; i+=16)
        {
#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
            for (j = 0; j < 16; j++)
            {
                COMPRESS_ROUND(i, j, SHA256_K);
            }
#else
            COMPRESS_ROUND(i, 0, SHA256_K);
            COMPRESS_ROUND(i, 1, SHA256_K);
            COMPRESS_ROUND(i , 2, SHA256_K);
            COMPRESS_ROUND(i, 3, SHA256_K);
            COMPRESS_ROUND(i, 4, SHA256_K);
            COMPRESS_ROUND(i, 5, SHA256_K);
            COMPRESS_ROUND(i, 6, SHA256_K);
            COMPRESS_ROUND(i, 7, SHA256_K);
            COMPRESS_ROUND(i, 8, SHA256_K);
            COMPRESS_ROUND(i, 9, SHA256_K);
            COMPRESS_ROUND(i, 10, SHA256_K);
            COMPRESS_ROUND(i, 11, SHA256_K);
            COMPRESS_ROUND(i, 12, SHA256_K);
            COMPRESS_ROUND(i, 13, SHA256_K);
            COMPRESS_ROUND(i, 14, SHA256_K);
            COMPRESS_ROUND(i, 15, SHA256_K);
#endif
        }
        /* Davies-Meyer feed-forward into the chaining state. */
        ctx->hash[0] += a;
        ctx->hash[1] += b;
        ctx->hash[2] += c;
        ctx->hash[3] += d;
        ctx->hash[4] += e;
        ctx->hash[5] += f;
        ctx->hash[6] += g;
        ctx->hash[7] += h;
    }
}
  716 
  717 #ifndef NO_OPTIMIZED_VERSIONS
  718 
  719 #if CRYPTOPP_BOOL_X64
  720 void Avx2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
  721 {
  722     if (num_blks > 1)
  723         sha256_rorx(mp, ctx->hash, num_blks);
  724     else
  725         sha256_sse4(mp, ctx->hash, num_blks);
  726 }
  727 
  728 void AvxSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
  729 {
  730     if (num_blks > 1)
  731         sha256_avx(mp, ctx->hash, num_blks);
  732     else
  733         sha256_sse4(mp, ctx->hash, num_blks);
  734 }
  735 
  736 void SSE4Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
  737 {
  738     sha256_sse4(mp, ctx->hash, num_blks);
  739 }
  740 
  741 #endif
  742 
  743 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
  744 void SSE2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
  745 {
  746     X86_SHA256_HashBlocks(ctx->hash, (const uint_32t*)mp, (size_t)(num_blks * 64));
  747 }
  748 #endif
  749 
  750 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
  751 void Sha256AsmTransform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
  752 {
  753     uint_64t i;
  754     for (i = 0; i < num_blks; i++)
  755         sha256_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 64);
  756 }
  757 #endif
  758 
  759 #endif
  760 
  761 void sha256_begin(sha256_ctx* ctx)
  762 {
  763     ctx->hash[0] = 0x6a09e667;
  764     ctx->hash[1] = 0xbb67ae85;
  765     ctx->hash[2] = 0x3c6ef372;
  766     ctx->hash[3] = 0xa54ff53a;
  767     ctx->hash[4] = 0x510e527f;
  768     ctx->hash[5] = 0x9b05688c;
  769     ctx->hash[6] = 0x1f83d9ab;
  770     ctx->hash[7] = 0x5be0cd19;
  771     ctx->count[0] = 0;
  772     ctx->count[1] = 0;
  773 
  774     if (!sha256transfunc)
  775     {
  776 #ifndef NO_OPTIMIZED_VERSIONS
  777 #ifdef _M_X64
  778         if (g_isIntel && HasSAVX2() && HasSBMI2())
  779             sha256transfunc = Avx2Sha256Transform;
  780         else if (g_isIntel && HasSAVX())
  781                 sha256transfunc = AvxSha256Transform;
  782         else if (HasSSE41())
  783                 sha256transfunc = SSE4Sha256Transform;
  784         else
  785 #endif
  786 
  787 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
  788         if (HasSSE2 ())
  789             sha256transfunc = SSE2Sha256Transform;
  790         else
  791 #endif
  792 
  793 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
  794             sha256transfunc = Sha256AsmTransform;
  795 #else
  796             sha256transfunc = StdSha256Transform;
  797 #endif
  798 #else
  799         sha256transfunc = StdSha256Transform;
  800 #endif
  801     }
  802 }
  803 
  804 void sha256_end(unsigned char * result, sha256_ctx* ctx)
  805 {
  806     int i;
  807     uint_64t mlen, pos = ctx->count[0];
  808     uint_8t* m = (uint_8t*) ctx->wbuf;
  809     m[pos++] = 0x80;
  810     if (pos > 56)
  811     {
  812         memset(m + pos, 0, (size_t) (64 - pos));
  813         sha256transfunc(ctx, m, 1);
  814         pos = 0;
  815     }
  816     memset(m + pos, 0, (size_t) (56 - pos));
  817     mlen = bswap_64((uint_64t) ctx->count[1]);
  818     memcpy(m + (64 - 8), &mlen, 64 / 8);
  819     sha256transfunc(ctx, m, 1);
  820     for (i = 0; i < 8; i++)
  821     {
  822         ctx->hash[i] = bswap_32(ctx->hash[i]);
  823     }
  824     memcpy(result, ctx->hash, 32);
  825 }
  826 
  827 void sha256_hash(const unsigned char * data, uint_32t len, sha256_ctx *ctx)
  828 {
  829     uint_32t pos = ctx->count[0];
  830     uint_32t total = ctx->count[1];
  831     uint_8t* m = (uint_8t*) ctx->wbuf;
  832     if (pos && pos + len >= 64)
  833     {
  834         memcpy(m + pos, data, 64 - pos);
  835         sha256transfunc(ctx, m, 1);
  836         len -= 64 - pos;
  837         total += (64 - pos) * 8;
  838         data += 64 - pos;
  839         pos = 0;
  840     }
  841     if (len >= 64)
  842     {
  843         uint_32t blocks = len / 64;
  844         uint_32t bytes = blocks * 64;
  845         sha256transfunc(ctx, (void*)data, blocks);
  846         len -= bytes;
  847         total += (bytes)* 8;
  848         data += bytes;
  849     }
  850     memcpy(m+pos, data, len);
  851     pos += len;
  852     total += len * 8;
  853     ctx->count[0] = pos;
  854     ctx->count[1] = total;
  855 }
  856 
  857 void sha256(unsigned char * result, const unsigned char* source, uint_32t sourceLen)
  858 {
  859     sha256_ctx  ctx;
  860 
  861     sha256_begin(&ctx);
  862     sha256_hash(source, sourceLen, &ctx);
  863     sha256_end(result, &ctx);
  864 }