"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/sha256_sse4_x64.asm" (10 Oct 2018, 15739 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules one block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
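;
; Added note (not part of the original Intel header): "4 lanes per block"
; refers to the four 32-bit dwords packed into each XMM register of the
; message schedule, so the 16 live W[] words always sit in the registers
; X0..X3 defined below, oldest group in X0, newest in X3, and the oldest
; word of each group in lane 0 (the least significant dword):
;
;     X0 = { W[t-13], W[t-14], W[t-15], W[t-16] }   ; lanes 3..0
;     X1 = { W[t- 9], W[t-10], W[t-11], W[t-12] }
;     X2 = { W[t- 5], W[t- 6], W[t- 7], W[t- 8] }
;     X3 = { W[t- 1], W[t- 2], W[t- 3], W[t- 4] }
;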

; Modified by kerukuro for use in cppcrypto.

; Modified By Mounir IDRASSI for use in VeraCrypt

%define MOVDQ movdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
    add %2, %1
    mov %1, %2
%endm
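;
; For example (illustrative only, using the Windows ABI register mapping
; defined further below, where CTX is rdx and a is eax),
; "addm [4*0 + CTX], a" expands to:
;
;     add eax, [4*0 + rdx]    ; reg += mem
;     mov [4*0 + rdx], eax    ; mem  = reg
;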

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
    MOVDQ %1, %2
    pshufb %1, %3
%endmacro
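;
; Roughly equivalent C intrinsics (a sketch, not part of the original file;
; _mm_shuffle_epi8 requires SSSE3, matching the pshufb used here):
;
;     #include <tmmintrin.h>
;     __m128i copy_xmm_and_bswap(const void *p, __m128i byte_flip_mask)
;     {
;         /* unaligned 16-byte load, then byte-swap each of the 4 dwords */
;         return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)p),
;                                 byte_flip_mask);
;     }
;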

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9

%define SHUF_00BA   xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00   xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm12

%ifndef WINABI
%define NUM_BLKS rdx    ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND    rdi ; clobbers INP
%define c   ecx
%define d   r8d
%define e   edx
%else
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx     ; 2nd arg
%define INP rcx     ; 1st arg

%define SRND    rcx ; clobbers INP
%define c   edi
%define d   esi
%define e   r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d



_INP_END_SIZE   equ 8
_INP_SIZE   equ 8
_XFER_SIZE  equ 8
%ifndef WINABI
_XMM_SAVE_SIZE  equ 0
%else
_XMM_SAVE_SIZE  equ 7*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE equ 8

_INP_END    equ 0
_INP        equ _INP_END  + _INP_END_SIZE
_XFER       equ _INP      + _INP_SIZE
_XMM_SAVE   equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
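;
; Worked check of the alignment comment above: _INP_END = 0, _INP = 8,
; _XFER = 16, _XMM_SAVE = 32, so STACK_SIZE is 32 (non-Windows) or
; 32 + 7*16 = 144 (Windows). Together with the 5 (non-Windows) or
; 7 (Windows) pushes in the prologue, the stack adjustment is
; 32 + 5*8 = 72 or 144 + 7*8 = 200 bytes, i.e. an odd multiple of 8,
; which together with the 8-byte return address keeps rsp 16-byte aligned
; after "sub rsp, STACK_SIZE", as the movdqa stores to _XFER and
; _XMM_SAVE require.
;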

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
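;
; Added note: these text-substitution rotations let every round body be
; written against the fixed names a..h and X0..X3 with no register moves.
; After one ROTATE_ARGS, the name "a" refers to the register that was just
; updated as "h" (r11d on the first rotation), "e" refers to what was "d",
; and so on, mirroring the working-variable shuffle of the SHA-256 round.
; Likewise, rotate_Xs renames the schedule registers after each group of
; four new words, so the freshly written vector becomes X3 (the newest
; group) and the oldest live group becomes X0 again.
;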

%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time (a scalar reference sketch follows this macro)
        movdqa  XTMP0, X3
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
        palignr XTMP0, X2, 4    ; XTMP0 = W[-7]
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  XTMP1, X1
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g
        paddd   XTMP0, X0   ; XTMP0 = W[-7] + W[-16]
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        palignr XTMP1, X0, 4    ; XTMP1 = W[-15]
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
        movdqa  XTMP2, XTMP1    ; XTMP2 = W[-15]
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
        movdqa  XTMP3, XTMP1    ; XTMP3 = W[-15]
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        pslld   XTMP1, (32-7)
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        psrld   XTMP2, 7
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
        por XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP2, XTMP3    ; XTMP2 = W[-15]
    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a
        movdqa  XTMP4, XTMP3    ; XTMP4 = W[-15]
    ror y0, (25-11) ; y0 = e >> (25-11)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y1, (22-13) ; y1 = a >> (22-13)
        pslld   XTMP3, (32-18)
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g
        psrld   XTMP2, 18
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP1, XTMP3
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
        psrld   XTMP4, 3    ; XTMP4 = W[-15] >> 3
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        pxor    XTMP1, XTMP4    ; XTMP1 = s0
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        ;; compute low s1
        pshufd  XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
        paddd   XTMP0, XTMP1    ; XTMP0 = W[-16] + W[-7] + s0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}
    mov y0, e       ; y0 = e
    mov y1, a       ; y1 = a
    ror y0, (25-11) ; y0 = e >> (25-11)
        movdqa  XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xBxA}
    xor y2, g       ; y2 = f^g
        psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xBxA}
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
        psrld   XTMP4, 10   ; XTMP4 = W[-2] >> 10 {BBAA}
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP2, XTMP3
    add y2, y0      ; y2 = S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        pxor    XTMP4, XTMP2    ; XTMP4 = s1 {xBxA}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        pshufb  XTMP4, SHUF_00BA    ; XTMP4 = s1 {00BA}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        paddd   XTMP0, XTMP4    ; XTMP0 = {..., ..., W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
        movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
        movdqa  X0,    XTMP2    ; X0    = W[-2] {DDCC}
    ror y1, (22-13) ; y1 = a >> (22-13)
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    mov y2, f       ; y2 = f
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xDxC}
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    xor y2, g       ; y2 = f^g
        psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xDxC}
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    and y2, e       ; y2 = (f^g)&e
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   X0,    10   ; X0 = W[-2] >> 10 {DDCC}
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
        pxor    XTMP2, XTMP3
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, y0      ; y2 = S1 + CH
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        pxor    X0, XTMP2   ; X0 = s1 {xDxC}
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
        pshufb  X0, SHUF_DC00   ; X0 = s1 {DC00}
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
        paddd   X0, XTMP0   ; X0 = {W[3], W[2], W[1], W[0]}
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
rotate_Xs
%endm
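;
; Scalar reference for the schedule step interleaved above (a sketch, not
; part of the original file; rotr32 is a hypothetical helper, W[] is the
; 64-entry schedule array and t indexes the first of the four new words):
;
;     static inline uint32_t rotr32(uint32_t x, int r) { return (x >> r) | (x << (32 - r)); }
;     static inline uint32_t sig0(uint32_t x) { return rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);  }
;     static inline uint32_t sig1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }
;
;     /* one FOUR_ROUNDS_AND_SCHED produces W[t..t+3]; W[t+2] and W[t+3]
;        need sig1 of the just-computed W[t] and W[t+1], which is why the
;        vector code computes "s1 two at a time" (low pair, then high pair) */
;     for (int i = 0; i < 4; i++)
;         W[t+i] = W[t+i-16] + sig0(W[t+i-15]) + W[t+i-7] + sig1(W[t+i-2]);
;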

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
    mov y0, e       ; y0 = e
    ror y0, (25-11) ; y0 = e >> (25-11)
    mov y1, a       ; y1 = a
    xor y0, e       ; y0 = e ^ (e >> (25-11))
    ror y1, (22-13) ; y1 = a >> (22-13)
    mov y2, f       ; y2 = f
    xor y1, a       ; y1 = a ^ (a >> (22-13))
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
    xor y2, g       ; y2 = f^g
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
    and y2, e       ; y2 = (f^g)&e
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
    add y2, y0      ; y2 = S1 + CH
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
    mov y0, a       ; y0 = a
    add h, y2       ; h = h + S1 + CH + k + w
    mov y2, a       ; y2 = a
    or  y0, c       ; y0 = a|c
    add d, h        ; d = d + h + S1 + CH + k + w
    and y2, c       ; y2 = a&c
    and y0, b       ; y0 = (a|c)&b
    add h, y1       ; h = h + S1 + CH + k + w + S0
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
    ROTATE_ARGS
%endm
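;
; Scalar reference for one round as computed above (a sketch, not part of
; the original file; rotr32 as in the note after FOUR_ROUNDS_AND_SCHED,
; and k_plus_w stands for the dword read from [rsp + _XFER + %1 * 4]):
;
;     uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);  /* y0 */
;     uint32_t ch  = ((f ^ g) & e) ^ g;                             /* y2 */
;     uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);  /* y1 */
;     uint32_t maj = ((a | c) & b) | (a & c);                       /* y0 */
;     uint32_t t1  = h + S1 + ch + k_plus_w;
;     d += t1;                 /* add d, h                */
;     h  = t1 + S0 + maj;      /* add h, y1 / add h, y0   */
;     /* ROTATE_ARGS then renames a..h for the next round */
;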

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
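;
; Hedged usage sketch (not from the original sources): the routine only
; compresses whole 64-byte blocks, so message padding and the initial hash
; state are the caller's responsibility. Assuming UINT32/UINT64 map to the
; stdint.h fixed-width types, a caller might look like:
;
;     #include <stdint.h>
;     #include <string.h>
;
;     extern void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);
;
;     static const uint32_t H0[8] = {
;         0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
;     };
;
;     /* compress n pre-padded 64-byte blocks starting from the initial state */
;     void sha256_blocks(void *blocks, uint32_t digest[8], uint64_t n)
;     {
;         memcpy(digest, H0, sizeof(H0));
;         sha256_sse4(blocks, digest, n);
;     }
;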
section .text
global sha256_sse4
global _sha256_sse4
align 32
sha256_sse4:
_sha256_sse4:
    push    rbx
%ifdef WINABI
    push    rsi
    push    rdi
%endif
    push    rbp
    push    r13
    push    r14
    push    r15

    sub rsp,STACK_SIZE
%ifdef WINABI
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
%endif

    shl NUM_BLKS, 6 ; convert to bytes
    jz  done_hash
    add NUM_BLKS, INP   ; pointer to end of data
    mov [rsp + _INP_END], NUM_BLKS

    ;; load initial digest
    mov a,[4*0 + CTX]
    mov b,[4*1 + CTX]
    mov c,[4*2 + CTX]
    mov d,[4*3 + CTX]
    mov e,[4*4 + CTX]
    mov f,[4*5 + CTX]
    mov g,[4*6 + CTX]
    mov h,[4*7 + CTX]

    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
    lea TBL,[K256 wrt rip]

    ;; byte swap first 16 dwords
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK

    mov [rsp + _INP], INP

    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
    mov SRND, 3
align 16
loop1:
    movdqa  XFER, [TBL + 0*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 1*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 2*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    FOUR_ROUNDS_AND_SCHED

    movdqa  XFER, [TBL + 3*16]
    paddd   XFER, X0
    movdqa  [rsp + _XFER], XFER
    add TBL, 4*16
    FOUR_ROUNDS_AND_SCHED

    sub SRND, 1
    jne loop1

    mov SRND, 2
loop2:
    paddd   X0, [TBL + 0*16]
    movdqa  [rsp + _XFER], X0
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3
    paddd   X1, [TBL + 1*16]
    movdqa  [rsp + _XFER], X1
    add TBL, 2*16
    DO_ROUND    0
    DO_ROUND    1
    DO_ROUND    2
    DO_ROUND    3

    movdqa  X0, X2
    movdqa  X1, X3

    sub SRND, 1
    jne loop2

    addm    [4*0 + CTX],a
    addm    [4*1 + CTX],b
    addm    [4*2 + CTX],c
    addm    [4*3 + CTX],d
    addm    [4*4 + CTX],e
    addm    [4*5 + CTX],f
    addm    [4*6 + CTX],g
    addm    [4*7 + CTX],h

    mov INP, [rsp + _INP]
    add INP, 64
    cmp INP, [rsp + _INP_END]
    jne loop0

done_hash:
%ifdef WINABI
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
%endif

    add rsp, STACK_SIZE

    pop r15
    pop r14
    pop r13
    pop rbp
%ifdef WINABI
    pop rdi
    pop rsi
%endif
    pop rbx

    ret


section .data
align 64
K256:
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF

%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif