"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/sha256_avx2_x64.asm" (10 Oct 2018, 25934 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
; Linux:    yasm -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
;
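; Note: in this cppcrypto/VeraCrypt copy, the Windows calling convention is
; selected by defining WINABI (see the %ifndef WINABI block below), so a
; Windows build would also pass that symbol. A hypothetical command line,
; with illustrative file names (not from the original Intel release):
; Windows:  yasm -Xvc -f x64 -D WINABI -o sha256_avx2_x64.obj -g cv8 sha256_avx2_x64.asm
;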
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 2 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
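;
; Note on the two-block interleave: each 256-bit ymm "X" register holds
; 4 message dwords of block N in its low 128-bit lane and the
; corresponding 4 dwords of block N+1 in its high lane, so one pass of the
; vector message-schedule arithmetic serves both blocks. The scheduled W+K
; words for both lanes are parked in the _XFER stack area; the scalar
; rounds consume the low-lane values first and later replay the saved
; high-lane values for the second block (see loop3 below).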

; Modified by kerukuro for use in cppcrypto.

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm
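
; For example, "addm [4*0 + CTX], a" expands to
;     add eax, [4*0 + CTX]
;     mov [4*0 + CTX], eax
; (a is %defined as eax below), i.e. mem <- mem + reg.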

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 ymm4
%define X1 ymm5
%define X2 ymm6
%define X3 ymm7

; XMM versions of above
%define XWORD0 xmm4
%define XWORD1 xmm5
%define XWORD2 xmm6
%define XWORD3 xmm7

%define XTMP0 ymm0
%define XTMP1 ymm1
%define XTMP2 ymm2
%define XTMP3 ymm3
%define XTMP4 ymm8
%define XFER  ymm9
%define XTMP5 ymm11

%define SHUF_00BA   ymm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00   ymm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  ymm13

%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK

%ifndef WINABI
%define NUM_BLKS rdx    ; 3rd arg
%define CTX rsi     ; 2nd arg
%define INP rdi ; 1st arg
%define c   ecx
%define d   r8d
%define e       edx ; clobbers NUM_BLKS
%define y3  edi ; clobbers INP
%else
%define NUM_BLKS r8     ; 3rd arg
%define CTX rdx     ; 2nd arg
%define INP rcx     ; 1st arg
%define c   edi
%define d   esi
%define e   r8d ; clobbers NUM_BLKS
%define y3  ecx ; clobbers INP

%endif


%define TBL rbp
%define SRND    CTX ; SRND is same register as CTX

%define a eax
%define b ebx
%define f r9d
%define g r10d
%define h r11d
%define old_h r11d

%define T1 r12d
%define y0 r13d
%define y1 r14d
%define y2 r15d


_XFER_SIZE  equ 2*64*4  ; 2 blocks, 64 rounds, 4 bytes/round
%ifndef WINABI
_XMM_SAVE_SIZE  equ 0
%else
_XMM_SAVE_SIZE  equ 8*16
%endif
_INP_END_SIZE   equ 8
_INP_SIZE   equ 8
_CTX_SIZE   equ 8
_RSP_SIZE   equ 8

_XFER       equ 0
_XMM_SAVE   equ _XFER     + _XFER_SIZE
_INP_END    equ _XMM_SAVE + _XMM_SAVE_SIZE
_INP        equ _INP_END  + _INP_END_SIZE
_CTX        equ _INP      + _INP_SIZE
_RSP        equ _CTX      + _CTX_SIZE
STACK_SIZE  equ _RSP      + _RSP_SIZE
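
; Resolved frame offsets, for reference (all in bytes):
;   _XFER_SIZE = 2*64*4 = 512
;   without WINABI: _XMM_SAVE = _INP_END = 512, _INP = 520, _CTX = 528,
;                   _RSP = 536, STACK_SIZE = 544
;   with WINABI (128-byte save area for xmm6-xmm13):
;                   _XMM_SAVE = 512, _INP_END = 640, _INP = 648,
;                   _CTX = 656, _RSP = 664, STACK_SIZE = 672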

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine old_h h
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

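; The two macros above rename symbols; they move no data. After one
; ROTATE_ARGS the register that held h is referred to as a (and as old_h),
; the old a becomes b, and so on, which realizes the SHA-256 state
; rotation (a..h) <- (T1+T2, a, b, c, d+T1, e, f, g) without
; register-to-register moves. rotate_Xs likewise renames X0 <- X1,
; X1 <- X2, X2 <- X3, X3 <- old X0, so after rotation X3 names the
; register holding the newest scheduled dwords.
;
; FOUR_ROUNDS_AND_SCHED performs four scalar rounds while the interleaved
; (indented) vector instructions compute the next group of scheduled
; message dwords for both blocks. Its argument is the address of the
; precomputed W+K words for these four rounds in the _XFER stack area.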
%macro FOUR_ROUNDS_AND_SCHED 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    mov y3, a       ; y3 = a                                ; MAJA
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B

    add h, dword[%%XFER+0*4]        ; h = k + w + h         ; --
    or  y3, c       ; y3 = a|c                              ; MAJA
        vpalignr    XTMP0, X3, X2, 4    ; XTMP0 = W[-7]
    mov y2, f       ; y2 = f                                ; CH
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    xor y2, g       ; y2 = f^g                              ; CH
        vpaddd  XTMP0, XTMP0, X0    ; XTMP0 = W[-7] + W[-16]
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1

    and y2, e       ; y2 = (f^g)&e                          ; CH
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    add d, h        ; d = k + w + h + d                     ; --

    and y3, b       ; y3 = (a|c)&b                          ; MAJA
        vpalignr    XTMP1, X1, X0, 4    ; XTMP1 = W[-15]
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0

    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH
        vpsrld  XTMP2, XTMP1, 7
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and T1, c       ; T1 = a&c                              ; MAJB

    add y2, y0      ; y2 = S1 + CH                          ; --
        vpslld  XTMP3, XTMP1, (32-7)
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --
        vpor    XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7

        vpsrld  XTMP2, XTMP1,18
    add h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    add h, y3       ; h = t1 + S0 + MAJ                     ; --


ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;


    mov y3, a       ; y3 = a                                ; MAJA
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    add h, dword[%%XFER+1*4]        ; h = k + w + h         ; --
    or  y3, c       ; y3 = a|c                              ; MAJA


        vpsrld  XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
    mov y2, f       ; y2 = f                                ; CH
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    xor y2, g       ; y2 = f^g                              ; CH


    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    and y2, e       ; y2 = (f^g)&e                          ; CH
    add d, h        ; d = k + w + h + d                     ; --

        vpslld  XTMP1, XTMP1, (32-18)
    and y3, b       ; y3 = (a|c)&b                          ; MAJA
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0

        vpxor   XTMP3, XTMP3, XTMP1
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpxor   XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --

        vpxor   XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

        vpaddd  XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --
    add h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    add h, y3       ; h = t1 + S0 + MAJ                     ; --

        vpsrld  XTMP4, XTMP2, 10    ; XTMP4 = W[-2] >> 10 {BBAA}


ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    mov y3, a       ; y3 = a                                ; MAJA
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    add h, [%%XFER+2*4]     ; h = k + w + h         ; --

        vpsrlq  XTMP3, XTMP2, 19    ; XTMP3 = W[-2] ror 19 {xBxA}
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    or  y3, c       ; y3 = a|c                              ; MAJA
    mov y2, f       ; y2 = f                                ; CH
    xor y2, g       ; y2 = f^g                              ; CH

    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
        vpsrlq  XTMP2, XTMP2, 17    ; XTMP2 = W[-2] ror 17 {xBxA}
    and y2, e       ; y2 = (f^g)&e                          ; CH

    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
        vpxor   XTMP2, XTMP2, XTMP3
    add d, h        ; d = k + w + h + d                     ; --
    and y3, b       ; y3 = (a|c)&b                          ; MAJA

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
        vpxor   XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
        vpaddd  XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}

    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --
    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --
    add h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

    add h, y3       ; h = t1 + S0 + MAJ                     ; --


ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    mov y3, a       ; y3 = a                                ; MAJA
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    add h, dword[%%XFER+3*4]        ; h = k + w + h         ; --
    or  y3, c       ; y3 = a|c                              ; MAJA


        vpsrld  XTMP5, XTMP2,   10  ; XTMP5 = W[-2] >> 10 {DDCC}
    mov y2, f       ; y2 = f                                ; CH
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    xor y2, g       ; y2 = f^g                              ; CH


        vpsrlq  XTMP3, XTMP2, 19    ; XTMP3 = W[-2] ror 19 {xDxC}
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    and y2, e       ; y2 = (f^g)&e                          ; CH
    add d, h        ; d = k + w + h + d                     ; --
    and y3, b       ; y3 = (a|c)&b                          ; MAJA

        vpsrlq  XTMP2, XTMP2, 17    ; XTMP2 = W[-2] ror 17 {xDxC}
    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH

        vpxor   XTMP2, XTMP2, XTMP3
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    add y2, y0      ; y2 = S1 + CH                          ; --

        vpxor   XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --

    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}

        vpaddd  X0, XTMP5, XTMP0    ; X0 = {W[3], W[2], W[1], W[0]}
    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and T1, c       ; T1 = a&c                              ; MAJB
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ

    add h, y1       ; h = k + w + h + S0                    ; --
    add h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    add h, y3       ; h = t1 + S0 + MAJ                     ; --

ROTATE_ARGS
rotate_Xs
%endm

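; DO_4ROUNDS performs four scalar rounds with no message scheduling; it is
; used for the final 16 rounds and for replaying the saved W+K values of
; the second block. The commented-out "add h, ..." pairs are intentional:
; those two additions are deferred and issued at the start of the next
; round as "add old_h, ..." (old_h is the register that was h in the
; previous round), so they overlap with the next round's independent work.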
%macro DO_4ROUNDS 1
%define %%XFER %1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

    mov y2, f       ; y2 = f                                ; CH
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    xor y2, g       ; y2 = f^g                              ; CH

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    and y2, e       ; y2 = (f^g)&e                          ; CH

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    mov y3, a       ; y3 = a                                ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
    add h, dword[%%XFER + 4*0]      ; h = k + w + h ; --
    or  y3, c       ; y3 = a|c                              ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and y3, b       ; y3 = (a|c)&b                          ; MAJA
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --


    add d, h        ; d = k + w + h + d                     ; --
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --


    ;add    h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

    ;add    h, y3       ; h = t1 + S0 + MAJ                     ; --

    ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

    add old_h, y2   ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    mov y2, f       ; y2 = f                                ; CH
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    xor y2, g       ; y2 = f^g                              ; CH

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    and y2, e       ; y2 = (f^g)&e                          ; CH
    add old_h, y3   ; h = t1 + S0 + MAJ                     ; --

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    mov y3, a       ; y3 = a                                ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
    add h, dword[%%XFER + 4*1]      ; h = k + w + h ; --
    or  y3, c       ; y3 = a|c                              ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and y3, b       ; y3 = (a|c)&b                          ; MAJA
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --


    add d, h        ; d = k + w + h + d                     ; --
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --


    ;add    h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

    ;add    h, y3       ; h = t1 + S0 + MAJ                     ; --

    ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    add old_h, y2   ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    mov y2, f       ; y2 = f                                ; CH
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    xor y2, g       ; y2 = f^g                              ; CH

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    and y2, e       ; y2 = (f^g)&e                          ; CH
    add old_h, y3   ; h = t1 + S0 + MAJ                     ; --

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    mov y3, a       ; y3 = a                                ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
    add h, dword[%%XFER + 4*2]      ; h = k + w + h ; --
    or  y3, c       ; y3 = a|c                              ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and y3, b       ; y3 = (a|c)&b                          ; MAJA
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --


    add d, h        ; d = k + w + h + d                     ; --
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --


    ;add    h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

    ;add    h, y3       ; h = t1 + S0 + MAJ                     ; --

    ROTATE_ARGS

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;

    add old_h, y2   ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
    mov y2, f       ; y2 = f                                ; CH
    rorx    y0, e, 25   ; y0 = e >> 25              ; S1A
    rorx    y1, e, 11   ; y1 = e >> 11              ; S1B
    xor y2, g       ; y2 = f^g                              ; CH

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11)        ; S1
    rorx    y1, e, 6    ; y1 = (e >> 6)             ; S1
    and y2, e       ; y2 = (f^g)&e                          ; CH
    add old_h, y3   ; h = t1 + S0 + MAJ                     ; --

    xor y0, y1      ; y0 = (e>>25) ^ (e>>11) ^ (e>>6)   ; S1
    rorx    T1, a, 13   ; T1 = a >> 13              ; S0B
    xor y2, g       ; y2 = CH = ((f^g)&e)^g                 ; CH
    rorx    y1, a, 22   ; y1 = a >> 22              ; S0A
    mov y3, a       ; y3 = a                                ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13)        ; S0
    rorx    T1, a, 2    ; T1 = (a >> 2)             ; S0
    add h, dword[%%XFER + 4*3]      ; h = k + w + h ; --
    or  y3, c       ; y3 = a|c                              ; MAJA

    xor y1, T1      ; y1 = (a>>22) ^ (a>>13) ^ (a>>2)   ; S0
    mov T1, a       ; T1 = a                                ; MAJB
    and y3, b       ; y3 = (a|c)&b                          ; MAJA
    and T1, c       ; T1 = a&c                              ; MAJB
    add y2, y0      ; y2 = S1 + CH                          ; --


    add d, h        ; d = k + w + h + d                     ; --
    or  y3, T1      ; y3 = MAJ = (a|c)&b)|(a&c)             ; MAJ
    add h, y1       ; h = k + w + h + S0                    ; --

    add d, y2       ; d = k + w + h + d + S1 + CH = d + t1  ; --


    add h, y2       ; h = k + w + h + S0 + S1 + CH = t1 + S0; --

    add h, y3       ; h = t1 + S0 + MAJ                     ; --

    ROTATE_ARGS

%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
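;; Caller-side sketch (illustrative, not part of this file): the routine
;; compresses num_blks whole 64-byte blocks into digest[]; padding and
;; length encoding are the caller's responsibility, and digest[] must hold
;; the running SHA-256 state (the standard initial values 0x6a09e667,
;; 0xbb67ae85, ... on the first call). Assuming <stdint.h> typedefs:
;;
;;   extern void sha256_rorx(void *input_data, uint32_t digest[8], uint64_t num_blks);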
section .text
global sha256_rorx
global _sha256_rorx
align 32
sha256_rorx:
_sha256_rorx:
    push    rbx
%ifdef WINABI
    push    rsi
    push    rdi
%endif
    push    rbp
    push    r12
    push    r13
    push    r14
    push    r15

    mov rax, rsp
    sub rsp,STACK_SIZE
    and rsp, -32
    mov [rsp + _RSP], rax

%ifdef WINABI
    vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
    vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
    vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
    vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
    vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
    vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
    vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
    vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz  done_hash
    lea NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
    mov [rsp + _INP_END], NUM_BLKS

    cmp INP, NUM_BLKS
    je  only_one_block

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
    vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

    mov [rsp + _CTX], CTX

loop0:
    lea TBL,[K256 wrt rip]

    ;; Load first 16 dwords from two blocks
    VMOVDQ  XTMP0, [INP + 0*32]
    VMOVDQ  XTMP1, [INP + 1*32]
    VMOVDQ  XTMP2, [INP + 2*32]
    VMOVDQ  XTMP3, [INP + 3*32]

    ;; byte swap data
    vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
    vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
    vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
    vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK

    ;; transpose data into high/low halves
    vperm2i128  X0, XTMP0, XTMP2, 0x20
    vperm2i128  X1, XTMP0, XTMP2, 0x31
    vperm2i128  X2, XTMP1, XTMP3, 0x20
    vperm2i128  X3, XTMP1, XTMP3, 0x31

last_block_enter:
    add INP, 64
    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
    xor SRND, SRND

align 16
loop1:
    vpaddd  XFER, X0, [TBL + SRND + 0*32]
    vmovdqa [rsp + _XFER + SRND + 0*32], XFER
    FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 0*32

    vpaddd  XFER, X0, [TBL + SRND + 1*32]
    vmovdqa [rsp + _XFER + SRND + 1*32], XFER
    FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 1*32

    vpaddd  XFER, X0, [TBL + SRND + 2*32]
    vmovdqa [rsp + _XFER + SRND + 2*32], XFER
    FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 2*32

    vpaddd  XFER, X0, [TBL + SRND + 3*32]
    vmovdqa [rsp + _XFER + SRND + 3*32], XFER
    FOUR_ROUNDS_AND_SCHED   rsp + _XFER + SRND + 3*32

    add SRND, 4*32
    cmp SRND, 3 * 4*32
    jb  loop1

loop2:
    ;; Do last 16 rounds with no scheduling
    vpaddd  XFER, X0, [TBL + SRND + 0*32]
    vmovdqa [rsp + _XFER + SRND + 0*32], XFER
    DO_4ROUNDS  rsp + _XFER + SRND + 0*32
    vpaddd  XFER, X1, [TBL + SRND + 1*32]
    vmovdqa [rsp + _XFER + SRND + 1*32], XFER
    DO_4ROUNDS  rsp + _XFER + SRND + 1*32
    add SRND, 2*32

    vmovdqa X0, X2
    vmovdqa X1, X3

    cmp SRND, 4 * 4*32
    jb  loop2

    mov CTX, [rsp + _CTX]
    mov INP, [rsp + _INP]

    addm    [4*0 + CTX],a
    addm    [4*1 + CTX],b
    addm    [4*2 + CTX],c
    addm    [4*3 + CTX],d
    addm    [4*4 + CTX],e
    addm    [4*5 + CTX],f
    addm    [4*6 + CTX],g
    addm    [4*7 + CTX],h

    cmp INP, [rsp + _INP_END]
    ja  done_hash

    ;;;; Do second block using previously scheduled results
    xor SRND, SRND
align 16
loop3:
    DO_4ROUNDS  rsp + _XFER + SRND + 0*32 + 16
    DO_4ROUNDS  rsp + _XFER + SRND + 1*32 + 16
    add SRND, 2*32
    cmp SRND, 4 * 4*32
    jb loop3

    mov CTX, [rsp + _CTX]
    mov INP, [rsp + _INP]
    add INP, 64

    addm    [4*0 + CTX],a
    addm    [4*1 + CTX],b
    addm    [4*2 + CTX],c
    addm    [4*3 + CTX],d
    addm    [4*4 + CTX],e
    addm    [4*5 + CTX],f
    addm    [4*6 + CTX],g
    addm    [4*7 + CTX],h

    cmp INP, [rsp + _INP_END]
    jb  loop0
    ja  done_hash

do_last_block:
    ;;;; do last block
    lea TBL,[K256 wrt rip]

    VMOVDQ  XWORD0, [INP + 0*16]
    VMOVDQ  XWORD1, [INP + 1*16]
    VMOVDQ  XWORD2, [INP + 2*16]
    VMOVDQ  XWORD3, [INP + 3*16]

    vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
    vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
    vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
    vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK

    jmp last_block_enter

only_one_block:

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
    vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]

    mov [rsp + _CTX], CTX
    jmp do_last_block

done_hash:
%ifdef WINABI
    vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
    vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
    vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
    vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
    vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
    vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
    vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
    vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif

    mov rsp, [rsp + _RSP]

    pop r15
    pop r14
    pop r13
    pop r12
    pop rbp
%ifdef WINABI
    pop rdi
    pop rsi
%endif
    pop rbx

    ret

section .data
align 64
K256:
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
    ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:
    ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:
    ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif