"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/sha512_sse4_x64.asm" (10 Oct 2018, 13508 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    2 ; Copyright (c) 2012, Intel Corporation 
    3 ; 
    4 ; All rights reserved. 
    5 ; 
    6 ; Redistribution and use in source and binary forms, with or without
    7 ; modification, are permitted provided that the following conditions are
    8 ; met: 
    9 ; 
   10 ; * Redistributions of source code must retain the above copyright
   11 ;   notice, this list of conditions and the following disclaimer.  
   12 ; 
   13 ; * Redistributions in binary form must reproduce the above copyright
   14 ;   notice, this list of conditions and the following disclaimer in the
   15 ;   documentation and/or other materials provided with the
   16 ;   distribution. 
   17 ; 
   18 ; * Neither the name of the Intel Corporation nor the names of its
   19 ;   contributors may be used to endorse or promote products derived from
   20 ;   this software without specific prior written permission. 
   21 ; 
   22 ; 
   23 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
   24 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   25 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   26 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
   27 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   28 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   29 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   30 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   31 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   32 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   33 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   35 ;
   36 ; Example YASM command lines:
   37 ; Windows:  yasm -f x64 -D WINABI sha512_sse4.asm
   38 ; Linux:    yasm -f elf64 sha512_sse4.asm
   39 ;
   40 
   41 ; Modified by kerukuro for use in cppcrypto.
   42 
   43 BITS 64
   44 section .text
   45 
   46 ; Virtual Registers: argument and scratch names depend on whether WINABI is defined
   47 %ifdef WINABI
   48     %define msg rcx ; ARG1: base address used by MSG(i)
   49     %define digest  rdx ; ARG2: base address used by DIGEST(i)
   50     %define msglen  r8  ; ARG3: block counter, decremented once per processed block
   51     %define T1  rsi ; round temporary; rsi is saved/restored by sha512_sse4 under WINABI
   52     %define T2  rdi ; round temporary; rdi is saved/restored by sha512_sse4 under WINABI
   53 %else
   54     %define msg rdi ; ARG1: base address used by MSG(i)
   55     %define digest  rsi ; ARG2: base address used by DIGEST(i)
   56     %define msglen  rdx ; ARG3: block counter, decremented once per processed block
   57     %define T1  rcx ; round temporary
   58     %define T2  r8 ; round temporary
   59 %endif
   60 %define a_64    r9 ; a_64..h_64 name the eight working variables; RotateState re-maps the names after each round
   61 %define b_64    r10
   62 %define c_64    r11
   63 %define d_64    r12
   64 %define e_64    r13
   65 %define f_64    r14
   66 %define g_64    r15
   67 %define h_64    rbx
   68 %define tmp0    rax ; scratch used while building S0/S1 and Maj
   69 
   70 ; Local variables (stack frame)
   71 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP (here 80+2+7 = 89 or 80+2+5 = 87 qwords)
   72 struc frame
   73     .W:       resq 80 ; Message Schedule
   74     .WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
   75 
   76 %ifdef WINABI
   77     .GPRSAVE: resq 7 ; rbx, r12-r15, rsi, rdi
   78 %else
   79     .GPRSAVE: resq 5 ; rbx, r12-r15
   80 %endif
   81 endstruc
   82 
   83 ; Useful QWORD "arrays" for simpler memory references
   84 %define MSG(i)    msg    + 8*(i)               ; Input message (arg1)
   85 %define DIGEST(i) digest + 8*(i)               ; Output Digest (arg2)
   86 %define K_t(i)    K512   + 8*(i) wrt rip       ; SHA Constants (static mem)
   87 %define W_t(i)    rsp + frame.W  + 8*(i)       ; Message Schedule (stack frame)
   88 %define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
   89 ; MSG, DIGEST, K_t, W_t are arrays
   90 ; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
   91 
   92 %macro RotateState 0
   93     ; Rotate symbols a..h right: no data moves, only the assembly-time names are re-bound
   94     %xdefine %%TMP h_64 ; remember the register currently named h (the rounds write the new 'a' value into it)
   95     %xdefine h_64  g_64
   96     %xdefine g_64  f_64
   97     %xdefine f_64  e_64
   98     %xdefine e_64  d_64
   99     %xdefine d_64  c_64
  100     %xdefine c_64  b_64
  101     %xdefine b_64  a_64
  102     %xdefine a_64  %%TMP ; old h register becomes a; eight invocations restore the original mapping
  103 %endmacro
  104 
  105 %macro SHA512_Round 1
  106 %assign %%t   (%1)
  107 ; %1 = round index t; adds W[t]+K[t] from WK_2(t); clobbers T1, T2, tmp0 and flags
  108     ; Compute Round %%t (in "ror n ; m", n is the incremental rotate and m the total rotation that term ends up with)
  109     mov T1,   f_64        ; T1 = f
  110     mov tmp0, e_64        ; tmp = e
  111     xor T1,   g_64        ; T1 = f ^ g
  112     ror tmp0, 23 ; 41     ; tmp = e ror 23
  113     and T1,   e_64        ; T1 = (f ^ g) & e
  114     xor tmp0, e_64        ; tmp = (e ror 23) ^ e
  115     xor T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
  116     add T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
  117     ror tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
  118     xor tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
  119     mov T2,   a_64        ; T2 = a
  120     add T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
  121     ror tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
  122     add T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
  123     mov tmp0, a_64        ; tmp = a
  124     xor T2,   c_64        ; T2 = a ^ c
  125     and tmp0, c_64        ; tmp = a & c
  126     and T2,   b_64        ; T2 = (a ^ c) & b
  127     xor T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
  128     mov tmp0, a_64        ; tmp = a
  129     ror tmp0, 5 ; 39      ; tmp = a ror 5
  130     xor tmp0, a_64        ; tmp = (a ror 5) ^ a
  131     add d_64, T1          ; e(next_state) = d + T1 
  132     ror tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
  133     xor tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
  134     lea h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
  135     ror tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
  136     add h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
  137     RotateState
  138 %endmacro
  139 
  140 %macro SHA512_2Sched_2Round_sse 1
  141 %assign %%t (%1)
  142 ; %1 = t, index of the first schedule qword to produce (expected to be even); clobbers T1, T2, tmp0, xmm0-xmm5, flags
  143     ; Compute rounds %%t-2 and %%t-1
  144     ; Compute message schedule QWORDS %%t and %%t+1
  145 
  146     ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and 
  147     ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
  148     ; scheduler.
  149     ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
  150     ; They are then added to their respective SHA512 constants at
  151     ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
  152     ;   For brevity, the comments following vectored instructions only refer to
  153     ; the first of a pair of QWORDS.
  154     ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
  155     ;   The computation of the message schedule and the rounds are tightly
  156     ; stitched to take advantage of instruction-level parallelism.
  157     ; For clarity, integer instructions (for the rounds calculation) are indented
  158     ; by one tab. Vectored instructions (for the message scheduler) are indented
  159     ; by two tabs.
  160 
  161     mov T1, f_64
  162         movdqa  xmm2, [W_t(%%t-2)]  ; XMM2 = W[t-2]
  163     xor T1,   g_64
  164     and T1,   e_64
  165         movdqa  xmm0, xmm2          ; XMM0 = W[t-2]
  166     xor T1,   g_64
  167     add T1,   [WK_2(%%t)]
  168         movdqu  xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
  169     mov tmp0, e_64
  170     ror tmp0, 23 ; 41
  171         movdqa  xmm3, xmm5          ; XMM3 = W[t-15]
  172     xor tmp0, e_64
  173     ror tmp0, 4 ; 18
  174         psrlq   xmm0, 61 - 19       ; XMM0 = W[t-2] >> 42
  175     xor tmp0, e_64
  176     ror tmp0, 14 ; 14
  177         psrlq   xmm3, (8 - 7)       ; XMM3 = W[t-15] >> 1
  178     add T1,   tmp0
  179     add T1,   h_64
  180         pxor    xmm0, xmm2          ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
  181     mov T2,   a_64
  182     xor T2,   c_64
  183         pxor    xmm3, xmm5          ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
  184     and T2,   b_64
  185     mov tmp0, a_64
  186         psrlq   xmm0, 19 - 6        ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
  187     and tmp0, c_64
  188     xor T2,   tmp0
  189         psrlq   xmm3, (7 - 1)       ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
  190     mov tmp0, a_64
  191     ror tmp0, 5 ; 39
  192         pxor    xmm0, xmm2          ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
  193     xor tmp0, a_64
  194     ror tmp0, 6 ; 34
  195         pxor    xmm3, xmm5          ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
  196     xor tmp0, a_64
  197     ror tmp0, 28 ; 28
  198         psrlq   xmm0, 6             ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
  199     add T2,   tmp0
  200     add d_64, T1 
  201         psrlq   xmm3, 1             ; XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
  202     lea h_64, [T1 + T2]
  203     RotateState
  204         movdqa  xmm1, xmm2          ; XMM1 = W[t-2]
  205     mov T1, f_64
  206     xor T1,   g_64
  207         movdqa  xmm4, xmm5          ; XMM4 = W[t-15]
  208     and T1,   e_64
  209     xor T1,   g_64
  210         psllq   xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
  211     add T1,   [WK_2(%%t+1)]
  212     mov tmp0, e_64
  213         psllq   xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
  214     ror tmp0, 23 ; 41
  215     xor tmp0, e_64
  216         pxor    xmm1, xmm2          ; XMM1 = (W[t-2] << 42)^W[t-2]
  217     ror tmp0, 4 ; 18
  218     xor tmp0, e_64
  219         pxor    xmm4, xmm5          ; XMM4 = (W[t-15]<<7)^W[t-15]
  220     ror tmp0, 14 ; 14
  221     add T1,   tmp0
  222         psllq   xmm1, (64 - 61)     ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
  223     add T1,   h_64
  224     mov T2,   a_64
  225         psllq   xmm4, (64 - 8)      ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
  226     xor T2,   c_64
  227     and T2,   b_64
  228         pxor    xmm0, xmm1          ; XMM0 = s1(W[t-2])
  229     mov tmp0, a_64
  230     and tmp0, c_64
  231         movdqu  xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
  232     xor T2,   tmp0
  233         pxor    xmm3, xmm4          ; XMM3 = s0(W[t-15])
  234     mov tmp0, a_64
  235         paddq   xmm0, xmm3          ; XMM0 = s1(W[t-2]) + s0(W[t-15])
  236     ror tmp0, 5 ; 39
  237         paddq   xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
  238     xor tmp0, a_64
  239         paddq   xmm0, xmm1          ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
  240     ror tmp0, 6 ; 34
  241         movdqa  [W_t(%%t)], xmm0    ; Store scheduled qwords
  242     xor tmp0, a_64
  243         paddq   xmm0, [K_t(%%t)]    ; Compute W[t]+K[t] (indexed by the macro parameter, not a caller-side symbol)
  244     ror tmp0, 28 ; 28
  245         movdqa  [WK_2(%%t)], xmm0   ; Store W[t]+K[t] for next rounds (both WK slots are overwritten at once)
  246     add T2,   tmp0
  247     add d_64, T1
  248     lea h_64, [T1 + T2]
  249     RotateState
  250 %endmacro
  251 
  252 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  253 ; void sha512_sse4(const void* M, void* D, uint64_t L);
  254 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
  255 ; The size of the message pointed to by M must be an integer multiple of SHA512
  256 ;   message blocks.
  257 ; L is the message length in SHA512 blocks; L == 0 returns immediately without touching D.
  258 global sha512_sse4:function
  259 global _sha512_sse4:function
  260 sha512_sse4:
  261 _sha512_sse4:
  262     test msglen, msglen   ; nothing to hash when the block count is zero
  263     je .nowork
  264     ; Clobbers: flags, rax, r9-r11, xmm0-xmm5, msg, msglen, and T1/T2 when WINABI is not defined
  265     ; Allocate Stack Space (frame_size is an odd multiple of 8, see the note on struc frame)
  266     sub rsp, frame_size
  267 
  268     ; Save GPRs that the round code uses as state or temporaries
  269     mov [rsp + frame.GPRSAVE + 8 * 0], rbx
  270     mov [rsp + frame.GPRSAVE + 8 * 1], r12
  271     mov [rsp + frame.GPRSAVE + 8 * 2], r13
  272     mov [rsp + frame.GPRSAVE + 8 * 3], r14
  273     mov [rsp + frame.GPRSAVE + 8 * 4], r15
  274 %ifdef WINABI
  275     mov [rsp + frame.GPRSAVE + 8 * 5], rsi
  276     mov [rsp + frame.GPRSAVE + 8 * 6], rdi
  277 %endif
  278 
  279 .updateblock:
  280 
  281     ; Load state variables
  282     mov a_64, [DIGEST(0)]
  283     mov b_64, [DIGEST(1)]
  284     mov c_64, [DIGEST(2)]
  285     mov d_64, [DIGEST(3)]
  286     mov e_64, [DIGEST(4)]
  287     mov f_64, [DIGEST(5)]
  288     mov g_64, [DIGEST(6)]
  289     mov h_64, [DIGEST(7)]
  290     ; The %rep below is expanded at assembly time; t is an assembly-time counter, not a register
  291     %assign t 0
  292     %rep 80/2 + 1
  293     ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
  294     ; +1 iteration because the scheduler leads hashing by 1 iteration
  295         %if t < 2
  296             ; BSWAP 2 QWORDS (xmm1 keeps the shuffle mask for the t < 16 iterations that follow)
  297             movdqa  xmm1, [XMM_QWORD_BSWAP wrt rip]
  298             movdqu  xmm0, [MSG(t)]
  299             pshufb  xmm0, xmm1      ; BSWAP
  300             movdqa  [W_t(t)], xmm0  ; Store Scheduled Pair
  301             paddq   xmm0, [K_t(t)]  ; Compute W[t]+K[t]
  302             movdqa  [WK_2(t)], xmm0 ; Store into WK for rounds
  303         %elif t < 16
  304             ; BSWAP 2 QWORDS; Compute 2 Rounds
  305             movdqu  xmm0, [MSG(t)]
  306             pshufb  xmm0, xmm1      ; BSWAP
  307             SHA512_Round t - 2      ; Round t-2
  308             movdqa  [W_t(t)], xmm0  ; Store Scheduled Pair
  309             paddq   xmm0, [K_t(t)]  ; Compute W[t]+K[t]
  310             SHA512_Round t - 1      ; Round t-1
  311             movdqa  [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK (only after both rounds have read the old pair)
  312         %elif t < 79
  313             ; Schedule 2 QWORDS; Compute 2 Rounds
  314             SHA512_2Sched_2Round_sse t 
  315         %else
  316             ; Compute 2 Rounds (t == 80: rounds 78 and 79, nothing left to schedule)
  317             SHA512_Round t - 2
  318             SHA512_Round t - 1
  319         %endif
  320     %assign t t+2
  321     %endrep
  322 
  323     ; Update digest (after 80 RotateState calls a_64..h_64 name their original registers again)
  324     add [DIGEST(0)], a_64
  325     add [DIGEST(1)], b_64
  326     add [DIGEST(2)], c_64
  327     add [DIGEST(3)], d_64
  328     add [DIGEST(4)], e_64
  329     add [DIGEST(5)], f_64
  330     add [DIGEST(6)], g_64
  331     add [DIGEST(7)], h_64
  332 
  333     ; Advance to next message block (16 qwords = 128 bytes)
  334     add msg, 16*8
  335     dec msglen
  336     jnz .updateblock
  337 
  338     ; Restore GPRs
  339     mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
  340     mov r12, [rsp + frame.GPRSAVE + 8 * 1]
  341     mov r13, [rsp + frame.GPRSAVE + 8 * 2]
  342     mov r14, [rsp + frame.GPRSAVE + 8 * 3]
  343     mov r15, [rsp + frame.GPRSAVE + 8 * 4]
  344 %ifdef WINABI
  345     mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
  346     mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
  347 %endif
  348     ; Restore Stack Pointer
  349     add rsp, frame_size
  350 
  351 .nowork:
  352     ret
  353 
  354 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  355 ;;; Binary Data
  356 
  357 section .data
  358 ; Both tables below are read as 128-bit memory operands (movdqa / paddq), hence the 16-byte alignment
  359 ALIGN 16
  360 
  361 ; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
  362 XMM_QWORD_BSWAP: 
  363     ddq 0x08090a0b0c0d0e0f0001020304050607 ; reverses the byte order inside each 64-bit half
  364 
  365 ; K[t] used in SHA512 hashing (80 qwords, two per line, addressed through K_t(i))
  366 K512:
  367     dq 0x428a2f98d728ae22,0x7137449123ef65cd 
  368     dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  369     dq 0x3956c25bf348b538,0x59f111f1b605d019 
  370     dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  371     dq 0xd807aa98a3030242,0x12835b0145706fbe 
  372     dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  373     dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 
  374     dq 0x9bdc06a725c71235,0xc19bf174cf692694
  375     dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 
  376     dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  377     dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 
  378     dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  379     dq 0x983e5152ee66dfab,0xa831c66d2db43210 
  380     dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
  381     dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 
  382     dq 0x06ca6351e003826f,0x142929670a0e6e70
  383     dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 
  384     dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  385     dq 0x650a73548baf63de,0x766a0abb3c77b2a8 
  386     dq 0x81c2c92e47edaee6,0x92722c851482353b
  387     dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 
  388     dq 0xc24b8b70d0f89791,0xc76c51a30654be30
  389     dq 0xd192e819d6ef5218,0xd69906245565a910 
  390     dq 0xf40e35855771202a,0x106aa07032bbd1b8
  391     dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 
  392     dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  393     dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
  394     dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  395     dq 0x748f82ee5defb2fc,0x78a5636f43172f60 
  396     dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
  397     dq 0x90befffa23631e28,0xa4506cebde82bde9 
  398     dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
  399     dq 0xca273eceea26619c,0xd186b8c721c0c207 
  400     dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  401     dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 
  402     dq 0x113f9804bef90dae,0x1b710b35131c471b
  403     dq 0x28db77f523047d84,0x32caab7b40c72493 
  404     dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  405     dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 
  406     dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  407 ; For ELF output formats, emit an empty non-executable .note.GNU-stack section (conventionally marks the object as not needing an executable stack)
  408 %ifidn __OUTPUT_FORMAT__,elf
  409 section .note.GNU-stack noalloc noexec nowrite progbits
  410 %endif
  411 %ifidn __OUTPUT_FORMAT__,elf32
  412 section .note.GNU-stack noalloc noexec nowrite progbits
  413 %endif
  414 %ifidn __OUTPUT_FORMAT__,elf64
  415 section .note.GNU-stack noalloc noexec nowrite progbits
  416 %endif
  415 section .note.GNU-stack noalloc noexec nowrite progbits
  416 %endif