"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/sha256_avx1_x64.asm" (10 Oct 2018, 16047 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm

%macro MY_ROR 2
    shld    %1,%1,(32-(%2))
%endm
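
; MY_ROR is a rotate-right built from SHLD: with the same register given
; twice, "shld reg, reg, (32-n)" rotates the 32-bit value left by (32-n)
; bits, which is equivalent to rotating it right by n bits.  For example:
;   MY_ROR eax, 2   ->   shld eax, eax, 30   ; eax rotated right by 2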

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
    VMOVDQ %1, %2
    vpshufb %1, %1, %3
%endmacro
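
; Note: SHA-256 message words are big-endian, so after the (possibly
; unaligned) load each dword is byte-reversed with VPSHUFB and
; BYTE_FLIP_MASK (PSHUFFLE_BYTE_FLIP_MASK, defined in .data below).
; A typical expansion, using the register assignments that follow:
;   COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
;     ->  vmovdqu xmm4, [INP + 0*16]
;         vpshufb xmm4, xmm4, xmm13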

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9
%define XTMP5 xmm11

%define SHUF_00BA   xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00   xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm13

%ifndef WINABI
%define NUM_BLKS rdx    ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND    rdi ; clobbers INP
%define c   ecx
%define d   r8d
%define e   edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx     ; 2nd arg
%define INP rcx     ; 1st arg

%define SRND    rcx ; clobbers INP
%define c   edi
%define d   esi
%define e   r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d


_INP_END_SIZE   equ 8
_INP_SIZE   equ 8
_XFER_SIZE  equ 8
%ifndef WINABI
_XMM_SAVE_SIZE  equ 0
%else
_XMM_SAVE_SIZE  equ 8*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END    equ 0
_INP        equ _INP_END  + _INP_END_SIZE
_XFER       equ _INP      + _INP_SIZE
_XMM_SAVE   equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
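
; Resulting frame layout relative to rsp after the prologue (derived from
; the equates above; offsets in bytes):
;   [rsp +  0]  _INP_END   end-of-input pointer (8 bytes)
;   [rsp +  8]  _INP       saved input pointer  (8 bytes)
;   [rsp + 16]  _XFER      16-byte XFER slot (_XFER_SIZE + _ALIGN_SIZE)
;   [rsp + 32]  _XMM_SAVE  xmm6..xmm13 save area, Windows only (8*16 bytes)
; STACK_SIZE is therefore 32 on Linux and 160 on Windows; together with the
; return address and the 5 (Linux) or 7 (Windows) pushed GPRs it keeps rsp
; 16-byte aligned, as the vmovdqa stores to _XFER and _XMM_SAVE require.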

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

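; FOUR_ROUNDS_AND_SCHED interleaves four compression rounds with the SHA-256
; message schedule for the next four words.  The recurrence being vectorized
; is (all quantities 32-bit, "ror" = rotate right):
;   s0   = (W[-15] ror 7)  ^ (W[-15] ror 18) ^ (W[-15] >> 3)
;   s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
;   W[0] = W[-16] + s0 + W[-7] + s1
; s0 is computed for all four lanes at once; s1 is computed two lanes at a
; time ({xBxA}, then {xDxC}) because its inputs include just-produced words.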
%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        ;vmovdqa    XTMP0, X3
    mov y0, e       ; y0 = e
    MY_ROR  y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
        vpalignr    XTMP0, X3, X2, 4    ; XTMP0 = W[-7]
    MY_ROR  y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    MY_ROR  y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        ;vmovdqa    XTMP1, X1
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g
        vpaddd  XTMP0, XTMP0, X0    ; XTMP0 = W[-7] + W[-16]
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    MY_ROR  y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        vpalignr    XTMP1, X1, X0, 4    ; XTMP1 = W[-15]
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    MY_ROR  y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g


    MY_ROR  y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a

        vpsrld  XTMP2, XTMP1, 7

    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c

        vpslld  XTMP3, XTMP1, (32-7)

    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0

        vpor    XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] MY_ROR 7

    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS

    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a


    MY_ROR  y0, (25-11) ; y0 = e >> (25-11)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    MY_ROR  y1, (22-13) ; y1 = a >> (22-13)

        vpsrld  XTMP2, XTMP1,18

    xor y1, a       ; y1 = a ^ (a >> (22-13))
    MY_ROR  y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g

        vpsrld  XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3

    MY_ROR  y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    MY_ROR  y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)

        vpslld  XTMP1, XTMP1, (32-18)

    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP3, XTMP3, XTMP1

    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
    MY_ROR  y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

        vpxor   XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a

        vpxor   XTMP1, XTMP3, XTMP4 ; XTMP1 = s0

    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        ;; compute low s1
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
        vpaddd  XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        ;vmovdqa    XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}

    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a
    MY_ROR  y0, (25-11) ; y0 = e >> (25-11)

        ;vmovdqa    XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}

    xor y0, e       ; y0 = e ^ (e >> (25-11))
    MY_ROR  y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    MY_ROR  y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP4, XTMP2, 10    ; XTMP4 = W[-2] >> 10 {BBAA}

    xor y2, g       ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19    ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e

        vpsrlq  XTMP2, XTMP2, 17    ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

    MY_ROR  y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    MY_ROR  y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP2, XTMP2, XTMP3
    add y2, y0      ; y2 = S1 + CH
    MY_ROR  y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        vpaddd  XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        ;vmovdqa    XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
    mov y0, e       ; y0 = e
    MY_ROR  y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
        ;vmovdqa    XTMP5,    XTMP2 ; XTMP5    = W[-2] {DDCC}
    MY_ROR  y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    MY_ROR  y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP5, XTMP2,   10  ; XTMP5 = W[-2] >> 10 {DDCC}

    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19    ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    MY_ROR  y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))

        vpsrlq  XTMP2, XTMP2, 17    ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    MY_ROR  y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP2, XTMP2, XTMP3

    MY_ROR  y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        vpaddd  X0, XTMP5, XTMP0    ; X0 = {W[3], W[2], W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
    mov y0, e       ; y0 = e
    MY_ROR  y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    MY_ROR  y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    MY_ROR  y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    MY_ROR  y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    and y2, e       ; y2 = (f^g)&e
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    MY_ROR  y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    add y2, y0      ; y2 = S1 + CH
    MY_ROR  y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
    ROTATE_ARGS
%endm
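
; Each DO_ROUND step is the standard SHA-256 round function, with K[t]+W[t]
; pre-added into the _XFER stack slot by the caller:
;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;   CH  = (e & f) ^ (~e & g)            ; computed here as ((f^g)&e)^g
;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;   MAJ = (a&b) ^ (a&c) ^ (b&c)         ; computed here as ((a|c)&b)|(a&c)
;   h += S1 + CH + K[t] + W[t];  d += h;  h += S0 + MAJ
; ROTATE_ARGS then renames the a..h symbols instead of moving register data.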

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
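;;
;; Example C-side usage (a sketch; wrapper and typedef names are illustrative,
;; only the symbol name and argument order come from this file):
;;   extern void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks);
;;   /* digest[] holds the running SHA-256 state and is updated in place;
;;      input_data must point to num_blks complete 64-byte blocks.
;;      Message padding/finalization is the caller's responsibility. */
;;   sha256_avx(block, state, 1);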
section .text
global sha256_avx
align 32
sha256_avx:
    push    rbx
%ifdef WINABI
    push    rsi
    push    rdi
%endif
    push    rbp
    push    r13
    push    r14
    push    r15

    sub rsp,STACK_SIZE
%ifdef WINABI
    vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
    vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
    vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
    vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
    vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
    vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
    vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
    vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz  done_hash
    add NUM_BLKS, INP   ; pointer to end of data
    mov [rsp + _INP_END], NUM_BLKS

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
    vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
    lea TBL,[K256 wrt rip]

    ;; byte swap first 16 dwords
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK

    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
    mov SRND, 3
align 16
loop1:
    vpaddd  XFER, X0, [TBL + 0*16]
    vmovdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    vpaddd  XFER, X0, [TBL + 1*16]
    vmovdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    vpaddd  XFER, X0, [TBL + 2*16]
    vmovdqa [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    vpaddd  XFER, X0, [TBL + 3*16]
    vmovdqa [rsp + _XFER], XFER
    add TBL, 4*16
    FOUR_ROUNDS_AND_SCHED

    sub SRND, 1
    jne loop1

    mov SRND, 2
loop2:
    vpaddd  XFER, X0, [TBL + 0*16]
    vmovdqa [rsp + _XFER], XFER
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3

    vpaddd  XFER, X1, [TBL + 1*16]
    vmovdqa [rsp + _XFER], XFER
    add TBL, 2*16
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3

    vmovdqa X0, X2
    vmovdqa X1, X3

    sub SRND, 1
    jne loop2


    addm    [4*0 + CTX],a
    addm    [4*1 + CTX],b
    addm    [4*2 + CTX],c
    addm    [4*3 + CTX],d
    addm    [4*4 + CTX],e
    addm    [4*5 + CTX],f
    addm    [4*6 + CTX],g
    addm    [4*7 + CTX],h

    mov INP, [rsp + _INP]
    add INP, 64
    cmp INP, [rsp + _INP_END]
    jne loop0

done_hash:
%ifdef WINABI
    vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
    vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
    vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
    vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
    vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
    vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
    vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
    vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif


    add rsp, STACK_SIZE

    pop r15
    pop r14
    pop r13
    pop rbp
%ifdef WINABI
    pop rdi
    pop rsi
%endif
    pop rbx

    ret


section .data
align 64
K256:
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF

%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif