"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/sha512_avx1_x64.asm" (10 Oct 2018, 13735 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    2 ; Copyright (c) 2012, Intel Corporation 
    3 ; 
    4 ; All rights reserved. 
    5 ; 
    6 ; Redistribution and use in source and binary forms, with or without
    7 ; modification, are permitted provided that the following conditions are
    8 ; met: 
    9 ; 
   10 ; * Redistributions of source code must retain the above copyright
   11 ;   notice, this list of conditions and the following disclaimer.  
   12 ; 
   13 ; * Redistributions in binary form must reproduce the above copyright
   14 ;   notice, this list of conditions and the following disclaimer in the
   15 ;   documentation and/or other materials provided with the
   16 ;   distribution. 
   17 ; 
   18 ; * Neither the name of the Intel Corporation nor the names of its
   19 ;   contributors may be used to endorse or promote products derived from
   20 ;   this software without specific prior written permission. 
   21 ; 
   22 ; 
   23 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
   24 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   25 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   26 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
   27 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   28 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   29 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   30 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   31 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   32 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   33 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   35 ;
   36 ; Example YASM command lines:
   37 ; Windows:  yasm -f x64 -D WINABI sha512_avx.asm
   38 ; Linux:    yasm -f elf64 sha512_avx.asm
   39 ;
   40 
BITS 64
section .text

; Virtual Registers
; Map the incoming argument registers to symbolic names so the same round
; code assembles for both calling conventions:
;   WINABI  -> Microsoft x64 ABI (args in rcx, rdx, r8)
;   default -> System V AMD64 ABI (args in rdi, rsi, rdx)
; T1/T2 are round-temporaries chosen from whichever volatile registers are
; left over in each ABI (rsi/rdi are callee-saved on Windows and are
; therefore saved in the prologue of sha512_avx).
%ifdef WINABI
    %define msg rcx ; ARG1
    %define digest  rdx ; ARG2
    %define msglen  r8  ; ARG3
    %define T1  rsi
    %define T2  rdi
%else
    %define msg rdi ; ARG1
    %define digest  rsi ; ARG2
    %define msglen  rdx ; ARG3
    %define T1  rcx
    %define T2  r8
%endif
; SHA-512 working state a..h lives entirely in GPRs for the whole block.
; rbx and r12-r15 are callee-saved in both ABIs and are saved/restored
; by sha512_avx.
%define a_64    r9
%define b_64    r10
%define c_64    r11
%define d_64    r12
%define e_64    r13
%define f_64    r14
%define g_64    r15
%define h_64    rbx
%define tmp0    rax
; Local variables (stack frame)
; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
; (on entry rsp % 16 == 8 because of the pushed return address, so
; subtracting an odd multiple of 8 restores 16-byte alignment, which the
; vmovdqa accesses to .W / .WK / .XMMSAVE rely on).
struc frame
    .W:       resq 80 ; Message Schedule (80 qwords W[0..79])
    .WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]

%ifdef WINABI
    .XMMSAVE: resdq 4 ; save area for xmm6-xmm9 (callee-saved in MS x64 ABI)
    .GPRSAVE: resq  7 ; rbx, r12-r15, rsi, rdi
%else
    .GPRSAVE: resq  5 ; rbx, r12-r15
%endif
endstruc

; Useful QWORD "arrays" for simpler memory references
%define MSG(i)    msg    + 8*(i) ; Input message (arg1)
%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
%define K_t(i)    K512   + 8*(i) wrt rip ; SHA Constants (static mem)
%define W_t(i)    rsp + frame.W  + 8*(i) ; Message Schedule (stack frame)
%define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
; MSG, DIGEST, K_t, W_t are arrays
; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
%macro RotateState 0
    ; Rotate symbols a..h right
    ; The state rotation between SHA-512 rounds is performed at assembly
    ; time by renaming the virtual registers with %xdefine, instead of
    ; moving data between registers at run time.  After 8 invocations the
    ; names map back to their original registers.
    %xdefine    %%TMP h_64
    %xdefine    h_64 g_64
    %xdefine    g_64 f_64
    %xdefine    f_64 e_64
    %xdefine    e_64 d_64
    %xdefine    d_64 c_64
    %xdefine    c_64 b_64
    %xdefine    b_64 a_64
    %xdefine    a_64 %%TMP
%endmacro

%macro RORQ 2
    ; Rotate %1 right by %2 bits.
    ; shld is faster than ror on Sandybridge;
    ; shld r,r,(64-n) rotates the register left by 64-n bits, which is
    ; identical to rotating it right by n bits.
    shld    %1, %1, (64 - %2)
%endmacro

%macro SHA512_Round 1
%assign %%t   (%1)

    ; Compute Round %%t
    ; One full SHA-512 round using only GPRs.  W[t]+K[t] must already be
    ; stored at WK_2(%%t) by the message scheduler.  S1(e) and S0(a) are
    ; each built from three RORQ/xor steps; the "; 41"-style comments give
    ; the effective accumulated right-rotate amount (e.g. 23+4+14 = 41).
    ; Instruction order interleaves the T1 and T2 dependency chains for
    ; instruction-level parallelism - do not reorder casually.
    mov T1,   f_64        ; T1 = f
    mov tmp0, e_64        ; tmp = e
    xor T1,   g_64        ; T1 = f ^ g
    RORQ    tmp0, 23 ; 41     ; tmp = e ror 23
    and T1,   e_64        ; T1 = (f ^ g) & e
    xor tmp0, e_64        ; tmp = (e ror 23) ^ e
    xor T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
    add T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
    RORQ    tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
    xor tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
    mov T2,   a_64        ; T2 = a
    add T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
    RORQ    tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
    add T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
    mov tmp0, a_64        ; tmp = a
    xor T2,   c_64        ; T2 = a ^ c
    and tmp0, c_64        ; tmp = a & c
    and T2,   b_64        ; T2 = (a ^ c) & b
    xor T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
    mov tmp0, a_64        ; tmp = a
    RORQ    tmp0, 5 ; 39      ; tmp = a ror 5
    xor tmp0, a_64        ; tmp = (a ror 5) ^ a
    add d_64, T1          ; e(next_state) = d + T1
    RORQ    tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
    xor tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
    lea h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
    RORQ    tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
    add h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
    RotateState
%endmacro

  144 %macro SHA512_2Sched_2Round_avx 1
  145 %assign %%t %1
  146     ; Compute rounds %%t-2 and %%t-1
  147     ; Compute message schedule QWORDS %%t and %%t+1
  148 
  149     ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
  150     ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
  151     ; scheduler.
  152     ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
  153     ; They are then added to their respective SHA512 constants at
  154     ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
  155     ;   For brievity, the comments following vectored instructions only refer to
  156     ; the first of a pair of QWORDS.
  157     ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
  158     ;   The computation of the message schedule and the rounds are tightly
  159     ; stitched to take advantage of instruction-level parallelism.
  160     ; For clarity, integer instructions (for the rounds calculation) are indented
  161     ; by one tab. Vectored instructions (for the message scheduler) are indented
  162     ; by two tabs.
  163 
  164         vmovdqa xmm4, [W_t(%%t-2)]   ; XMM4 = W[t-2]
  165         vmovdqu xmm5, [W_t(%%t-15)]  ; XMM5 = W[t-15]
  166     mov T1,   f_64
  167         vpsrlq  xmm0, xmm4, 61       ; XMM0 = W[t-2]>>61
  168     mov tmp0, e_64
  169         vpsrlq  xmm6, xmm5, 1        ; XMM6 = W[t-15]>>1
  170     xor T1,   g_64
  171     RORQ    tmp0, 23 ; 41
  172         vpsrlq  xmm1, xmm4, 19       ; XMM1 = W[t-2]>>19
  173     and T1,   e_64
  174     xor tmp0, e_64
  175         vpxor   xmm0, xmm1           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19
  176     xor T1,   g_64
  177     add T1,   [WK_2(%%t)];
  178         vpsrlq  xmm7, xmm5, 8        ; XMM7 = W[t-15]>>8
  179     RORQ    tmp0, 4 ; 18
  180         vpsrlq  xmm2, xmm4, 6        ; XMM2 = W[t-2]>>6
  181     xor tmp0, e_64
  182     mov T2,   a_64
  183     add T1,   h_64
  184         vpxor   xmm6, xmm7           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8
  185     RORQ    tmp0, 14 ; 14
  186     add T1,   tmp0
  187         vpsrlq  xmm8, xmm5, 7        ; XMM8 = W[t-15]>>7
  188     mov     tmp0, a_64
  189     xor T2,   c_64
  190         vpsllq  xmm3, xmm4, (64-61)  ; XMM3 = W[t-2]<<3
  191     and tmp0, c_64
  192     and T2,   b_64
  193         vpxor   xmm2, xmm3           ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3
  194     xor T2,   tmp0
  195     mov tmp0, a_64
  196         vpsllq  xmm9, xmm5, (64-1)   ; XMM9 = W[t-15]<<63
  197     RORQ    tmp0, 5 ; 39
  198         vpxor   xmm8, xmm9           ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63
  199     xor tmp0, a_64
  200     add d_64, T1
  201     RORQ    tmp0, 6 ; 34
  202     xor tmp0, a_64
  203         vpxor   xmm6, xmm8           ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63
  204     lea h_64, [T1 + T2]
  205     RORQ    tmp0, 28 ; 28
  206         vpsllq  xmm4, (64-19)        ; XMM4 = W[t-2]<<25
  207     add h_64, tmp0
  208     RotateState
  209         vpxor   xmm0, xmm4           ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25
  210     mov T1, f_64
  211         vpxor   xmm0, xmm2           ; XMM0 = s1(W[t-2])
  212     mov tmp0, e_64
  213     xor T1,   g_64
  214         vpaddq  xmm0, [W_t(%%t-16)]  ; XMM0 = s1(W[t-2]) + W[t-16]
  215         vmovdqu xmm1, [W_t(%%t- 7)]  ; XMM1 = W[t-7]
  216     RORQ    tmp0, 23 ; 41
  217     and T1,   e_64
  218     xor tmp0, e_64
  219     xor T1,   g_64
  220         vpsllq  xmm5, (64-8)         ; XMM5 = W[t-15]<<56
  221     add T1,   [WK_2(%%t+1)]
  222         vpxor   xmm6, xmm5           ; XMM6 = s0(W[t-15])
  223     RORQ    tmp0, 4 ; 18
  224         vpaddq  xmm0, xmm6           ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
  225     xor tmp0, e_64
  226         vpaddq  xmm0, xmm1           ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
  227     mov T2,   a_64
  228     add T1,   h_64
  229     RORQ    tmp0, 14 ; 14
  230     add T1,   tmp0
  231         vmovdqa [W_t(%%t)], xmm0      ; Store W[t]
  232         vpaddq  xmm0, [K_t(t)]        ; Compute W[t]+K[t]
  233         vmovdqa [WK_2(t)], xmm0       ; Store W[t]+K[t] for next rounds
  234     mov tmp0, a_64
  235     xor T2,   c_64
  236     and tmp0, c_64
  237     and T2,   b_64
  238     xor T2,   tmp0
  239     mov tmp0, a_64
  240     RORQ    tmp0, 5 ; 39
  241     xor tmp0, a_64
  242     add d_64, T1
  243     RORQ    tmp0, 6 ; 34
  244     xor tmp0, a_64
  245     lea h_64, [T1 + T2]
  246     RORQ    tmp0, 28 ; 28
  247     add h_64, tmp0
  248     RotateState
  249 %endmacro
  250 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; void sha512_avx(const void* M, void* D, uint64_t L);
; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
; The size of the message pointed to by M must be an integer multiple of SHA512
;   message blocks.
; L is the message length in SHA512 blocks
;
; In:    msg = M (arg1), digest = D (arg2), msglen = L (arg3)
; Out:   8 qwords at D updated in place (Davies-Meyer feed-forward)
; Clobb: rax, T1, T2, xmm0-xmm9 (xmm6-xmm9 saved/restored under WINABI), flags
; Note:  leaf function (calls nothing), so no Windows shadow space is needed;
;        frame_size is sized to keep rsp 16-byte aligned for the vmovdqa
;        accesses into the stack frame (see frame struc comment).
global sha512_avx:function
sha512_avx:
    cmp msglen, 0          ; zero blocks -> nothing to do
    je  .nowork
    
    ; Allocate Stack Space
    sub rsp, frame_size

    ; Save GPRs (rbx, r12-r15 are callee-saved in both ABIs)
    mov [rsp + frame.GPRSAVE + 8 * 0], rbx
    mov [rsp + frame.GPRSAVE + 8 * 1], r12
    mov [rsp + frame.GPRSAVE + 8 * 2], r13
    mov [rsp + frame.GPRSAVE + 8 * 3], r14
    mov [rsp + frame.GPRSAVE + 8 * 4], r15
%ifdef WINABI
    ; rsi/rdi are callee-saved on Windows and are used as T1/T2 there
    mov [rsp + frame.GPRSAVE + 8 * 5], rsi
    mov [rsp + frame.GPRSAVE + 8 * 6], rdi
%endif
    ; Save XMMs (xmm6+ are callee-saved in the Microsoft x64 ABI;
    ; this routine only touches xmm0-xmm9)
%ifdef WINABI
    vmovdqa [rsp + frame.XMMSAVE + 16 * 0], xmm6
    vmovdqa [rsp + frame.XMMSAVE + 16 * 1], xmm7
    vmovdqa [rsp + frame.XMMSAVE + 16 * 2], xmm8
    vmovdqa [rsp + frame.XMMSAVE + 16 * 3], xmm9
%endif  

.updateblock:

    ; Load state variables (current chaining value H0..H7)
    mov a_64, [DIGEST(0)]
    mov b_64, [DIGEST(1)]
    mov c_64, [DIGEST(2)]
    mov d_64, [DIGEST(3)]
    mov e_64, [DIGEST(4)]
    mov f_64, [DIGEST(5)]
    mov g_64, [DIGEST(6)]
    mov h_64, [DIGEST(7)]

    %assign t 0
    %rep 80/2 + 1
    ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
    ; +1 iteration because the scheduler leads hashing by 1 iteration
        %if t < 2
            ; BSWAP 2 QWORDS (message words are big-endian in the input)
            vmovdqa xmm1, [XMM_QWORD_BSWAP wrt rip]
            vmovdqu xmm0, [MSG(t)]
            vpshufb xmm0, xmm0, xmm1     ; BSWAP
            vmovdqa [W_t(t)], xmm0       ; Store Scheduled Pair
            vpaddq  xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
            vmovdqa [WK_2(t)], xmm0      ; Store into WK for rounds
        %elif t < 16
            ; BSWAP 2 QWORDS, Compute 2 Rounds
            ; (xmm1 still holds the byte-swap mask loaded in the t<2 case;
            ; SHA512_Round uses only GPRs, so the mask survives)
            vmovdqu xmm0, [MSG(t)]
            vpshufb xmm0, xmm0, xmm1     ; BSWAP
            SHA512_Round t - 2           ; Round t-2
            vmovdqa [W_t(t)], xmm0       ; Store Scheduled Pair
            vpaddq  xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
            SHA512_Round t - 1           ; Round t-1
            vmovdqa [WK_2(t)], xmm0      ; W[t]+K[t] into WK
        %elif t < 79
            ; Schedule 2 QWORDS; Compute 2 Rounds
            SHA512_2Sched_2Round_avx t
        %else
            ; Compute 2 Rounds (schedule already complete)
            SHA512_Round t - 2
            SHA512_Round t - 1
        %endif
    %assign t t+2
    %endrep

    ; Update digest: add working state back into the chaining value
    add [DIGEST(0)], a_64
    add [DIGEST(1)], b_64
    add [DIGEST(2)], c_64
    add [DIGEST(3)], d_64
    add [DIGEST(4)], e_64
    add [DIGEST(5)], f_64
    add [DIGEST(6)], g_64
    add [DIGEST(7)], h_64

    ; Advance to next message block (one SHA-512 block = 16 qwords = 128 bytes)
    add msg, 16*8
    dec msglen
    jnz .updateblock

    ; Restore XMMs
%ifdef WINABI
    vmovdqa xmm6, [rsp + frame.XMMSAVE + 16 * 0]
    vmovdqa xmm7, [rsp + frame.XMMSAVE + 16 * 1]
    vmovdqa xmm8, [rsp + frame.XMMSAVE + 16 * 2]
    vmovdqa xmm9, [rsp + frame.XMMSAVE + 16 * 3]
%endif
    ; Restore GPRs
    mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
    mov r12, [rsp + frame.GPRSAVE + 8 * 1]
    mov r13, [rsp + frame.GPRSAVE + 8 * 2]
    mov r14, [rsp + frame.GPRSAVE + 8 * 3]
    mov r15, [rsp + frame.GPRSAVE + 8 * 4]
%ifdef WINABI
    mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
    mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
%endif
    ; Restore Stack Pointer
    add rsp, frame_size

.nowork:
    ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Binary Data

section .data

ALIGN 16

; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
; 16-byte aligned so the hot loop can load it with vmovdqa.
XMM_QWORD_BSWAP: 
    ddq 0x08090a0b0c0d0e0f0001020304050607

; K[t] used in SHA512 hashing
; The 80 SHA-512 round constants from FIPS 180-4, laid out as qword pairs
; so the scheduler can add {K[t],K[t+1]} with a single 128-bit vpaddq.
K512:
    dq 0x428a2f98d728ae22,0x7137449123ef65cd 
    dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    dq 0x3956c25bf348b538,0x59f111f1b605d019 
    dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
    dq 0xd807aa98a3030242,0x12835b0145706fbe 
    dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
    dq 0x9bdc06a725c71235,0xc19bf174cf692694
    dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
    dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
    dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    dq 0x983e5152ee66dfab,0xa831c66d2db43210 
    dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
    dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
    dq 0x06ca6351e003826f,0x142929670a0e6e70
    dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
    dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
    dq 0x81c2c92e47edaee6,0x92722c851482353b
    dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
    dq 0xc24b8b70d0f89791,0xc76c51a30654be30
    dq 0xd192e819d6ef5218,0xd69906245565a910 
    dq 0xf40e35855771202a,0x106aa07032bbd1b8
    dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
    dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
    dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
    dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
    dq 0x90befffa23631e28,0xa4506cebde82bde9 
    dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
    dq 0xca273eceea26619c,0xd186b8c721c0c207 
    dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
    dq 0x113f9804bef90dae,0x1b710b35131c471b
    dq 0x28db77f523047d84,0x32caab7b40c72493 
    dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
    dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817

; Mark the stack non-executable on ELF targets (GNU toolchains default an
; assembled object to an executable stack unless this note section exists).
; YASM sets __OUTPUT_FORMAT__ to the -f argument, so elf/elf32/elf64 are
; each covered.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif