"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aesni-sha256-x86_64.pl" (20 Nov 2018, 41839 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "aesni-sha256-x86_64.pl" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, for better overall performance. The
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is weaved into them. As SHA256 dominates execution
# time, stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better, for
# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#                AES-128/-192/-256+SHA256    this(**)    gain
# Sandy Bridge   5.05/6.05/7.05+11.6         13.0        +28%/36%/43%
# Ivy Bridge     5.05/6.05/7.05+10.3         11.6        +32%/41%/50%
# Haswell        4.43/5.29/6.19+7.80         8.79        +39%/49%/59%
# Bulldozer      5.77/6.89/8.00+13.7         13.7        +42%/50%/58%
#
# (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
#       Westmere is omitted from the table above; this is because the
#       gain there was not estimated high enough to justify the effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
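#
# As a sanity check on the "gain" column: on Sandy Bridge, running
# AES-128-CBC and SHA256 back to back costs 5.05+11.6 = 16.65 cycles
# per byte, while the stitched routine spends only 13.0, i.e.
# 16.65/13.0 - 1 = ~28% higher throughput.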

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
       `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
}
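
# $avx ends up 0, 1 or 2: 0 means the assembler cannot be trusted with
# AVX and only the portable probe stub is emitted, 1 enables the XOP
# and AVX code paths, and 2 additionally enables the AVX2 code path.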

$shaext=$avx;   ### set to zero if compiling for 1.0.1
$avx=1      if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
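
# The script takes two command-line arguments, e.g.
#
#   perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
#
# where the first selects the perlasm flavour (elf, macosx, mingw64,
# nasm, masm, ...) and the second names the output file; everything
# written to STDOUT is piped through x86_64-xlate.pl, which translates
# it into the flavour's native syntax.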

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#           void *out,
#           size_t length,
#           const AES_KEY *key,
#           unsigned char *iv,
#           SHA256_CTX *ctx,
#           const void *in0);
($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
$framesz=16*$SZ+8*8;
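
# Stack frame layout: the low 16*$SZ bytes at (%rsp) stage the X[i]+K[i]
# values for the current batch of rounds, followed by eight quadwords
# holding the spilled arguments and the caller's %rsp as named by
# $_inp..$_rsp above.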

$code=<<___;
.text

.extern OPENSSL_ia32cap_P
.globl  $func
.type   $func,\@abi-omnipotent
.align  16
$func:
___
                        if ($avx) {
$code.=<<___;
    lea OPENSSL_ia32cap_P(%rip),%r11
    mov \$1,%eax
    cmp \$0,`$win64?"%rcx":"%rdi"`
    je  .Lprobe
    mov 0(%r11),%eax
    mov 4(%r11),%r10
___
$code.=<<___ if ($shaext);
    bt  \$61,%r10           # check for SHA
    jc  ${func}_shaext
___
$code.=<<___;
    mov %r10,%r11
    shr \$32,%r11

    test    \$`1<<11`,%r10d         # check for XOP
    jnz ${func}_xop
___
$code.=<<___ if ($avx>1);
    and \$`1<<8|1<<5|1<<3`,%r11d    # check for BMI2+AVX2+BMI1
    cmp \$`1<<8|1<<5|1<<3`,%r11d
    je  ${func}_avx2
___
$code.=<<___;
    and \$`1<<28`,%r10d         # check for AVX
    jnz ${func}_avx
    ud2
___
                        }
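
# Dispatch recap: the 64-bit load from OPENSSL_ia32cap_P+4 leaves
# capability word 1 in the low half of %r10 and word 2 in the high
# half, so bit 61 is the SHA-extension bit of word 2, bit 11 of %r10d
# is XOP, bits 3/5/8 of the shifted copy in %r11d are BMI1/AVX2/BMI2,
# and bit 28 of %r10d is AVX; a CPU matching none of the compiled-in
# paths hits ud2.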
$code.=<<___;
    xor %eax,%eax
    cmp \$0,`$win64?"%rcx":"%rdi"`
    je  .Lprobe
    ud2
.Lprobe:
    ret
.size   $func,.-$func

.align  64
.type   $TABLE,\@object
$TABLE:
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

    .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
    .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
    .long   0,0,0,0,   0,0,0,0
    .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##  &vmovdqu    ($roundkey,"0x00-0x80($inp)");
##  &vmovdqu    ($inout,($inp));
##  &mov        ($_inp,$inp);

    '&vpxor     ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x10-0x80($inp)");',

    '&vpxor     ($inout,$inout,$iv);',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x20-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x30-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x40-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x50-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x60-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x70-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x80-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x90-0x80($inp)");',

    '&vaesenc   ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0xa0-0x80($inp)");',

    '&vaesenclast   ($temp,$inout,$roundkey);'.
    ' &vaesenc  ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0xb0-0x80($inp)");',

    '&vpand     ($iv,$temp,$mask10);'.
    ' &vaesenc  ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0xc0-0x80($inp)");',

    '&vaesenclast   ($temp,$inout,$roundkey);'.
    ' &vaesenc  ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0xd0-0x80($inp)");',

    '&vpand     ($temp,$temp,$mask12);'.
    ' &vaesenc  ($inout,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0xe0-0x80($inp)");',

    '&vpor      ($iv,$iv,$temp);'.
    ' &vaesenclast  ($temp,$inout,$roundkey);'.
    ' &vmovdqu  ($roundkey,"0x00-0x80($inp)");'

##  &mov        ($inp,$_inp);
##  &mov        ($out,$_out);
##  &vpand      ($temp,$temp,$mask14);
##  &vpor       ($iv,$iv,$temp);
##  &vmovdqu    ($iv,($out,$inp));
##  &lea        ($inp,16($inp));
);
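
# CBC encryption computes C[i] = AES-Enc_key(P[i] ^ C[i-1]); the
# fragments above spread the AES rounds for one 16-byte block across
# each 16-round batch of the stitched SHA256 loop. Because the AES
# round count (10/12/14) is only known at run time, vaesenclast is
# executed speculatively after rounds 10 and 12 as well as after round
# 14, and $mask10/$mask12/$mask14 (loaded according to the actual key
# length from the all-zero/all-one words following K256) blend the
# three candidate ciphertexts so that exactly one survives into $iv.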

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()      # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
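
# For example, &ror($a0,$Sigma1[2]-$Sigma1[1]) with $a0 being "%r13d"
# is caught by AUTOLOAD and appends "\tror\t\$14,%r13d\n" to $code: the
# popped last argument becomes an immediate and the remaining operands
# are emitted in reversed, AT&T-style order.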

sub body_00_15 () {
    (
    '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

    '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
    '&mov   ($a,$a1)',
    '&mov   ($a4,$f)',

    '&xor   ($a0,$e)',
    '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
    '&xor   ($a4,$g)',          # f^g

    '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
    '&xor   ($a1,$a)',
    '&and   ($a4,$e)',          # (f^g)&e

    @aesni_cbc_block[$aesni_cbc_idx++].
    '&xor   ($a0,$e)',
    '&add   ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
    '&mov   ($a2,$a)',

    '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
    '&xor   ($a4,$g)',          # Ch(e,f,g)=((f^g)&e)^g
    '&xor   ($a2,$b)',          # a^b, b^c in next round

    '&ror   ($a0,$Sigma1[0])',      # Sigma1(e)
    '&add   ($h,$a4)',          # h+=Ch(e,f,g)
    '&and   ($a3,$a2)',         # (b^c)&(a^b)

    '&xor   ($a1,$a)',
    '&add   ($h,$a0)',          # h+=Sigma1(e)
    '&xor   ($a3,$b)',          # Maj(a,b,c)=Ch(a^b,c,b)

    '&add   ($d,$h)',           # d+=h
    '&ror   ($a1,$Sigma0[0])',      # Sigma0(a)
    '&add   ($h,$a3)',          # h+=Maj(a,b,c)

    '&mov   ($a0,$d)',
    '&add   ($a1,$h);'.         # h+=Sigma0(a)
    '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    );
}
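
# Each body_00_15 invocation renders one round of the standard SHA256
# compression function,
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#   h' = T1 + Sigma0(a) + Maj(a,b,c),  d' = d + T1
#
# as ~15 scalar instructions, with Sigma0(a) carried over into the next
# round via $a1 and one @aesni_cbc_block fragment (an AES round and/or
# round-key load) interleaved into every round; the @ROT rotation at
# the end renames the working variables instead of moving data.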

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type   ${func}_xop,\@function,6
.align  64
${func}_xop:
.Lxop_shortcut:
    mov `($win64?56:8)`(%rsp),$in0  # load 7th parameter
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    mov %rsp,%r11       # copy %rsp
    sub \$`$framesz+$win64*16*10`,%rsp
    and \$-64,%rsp      # align stack frame

    shl \$6,$len
    sub $inp,$out       # re-bias
    sub $inp,$in0
    add $inp,$len       # end of input

    #mov    $inp,$_inp      # saved later
    mov $out,$_out
    mov $len,$_end
    #mov    $key,$_key      # remains resident in $inp register
    mov $ivp,$_ivp
    mov $ctx,$_ctx
    mov $in0,$_in0
    mov %r11,$_rsp
___
$code.=<<___ if ($win64);
    movaps  %xmm6,`$framesz+16*0`(%rsp)
    movaps  %xmm7,`$framesz+16*1`(%rsp)
    movaps  %xmm8,`$framesz+16*2`(%rsp)
    movaps  %xmm9,`$framesz+16*3`(%rsp)
    movaps  %xmm10,`$framesz+16*4`(%rsp)
    movaps  %xmm11,`$framesz+16*5`(%rsp)
    movaps  %xmm12,`$framesz+16*6`(%rsp)
    movaps  %xmm13,`$framesz+16*7`(%rsp)
    movaps  %xmm14,`$framesz+16*8`(%rsp)
    movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
    vzeroall

    mov $inp,%r12       # borrow $a4
    lea 0x80($key),$inp     # size optimization, reassign
    lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
    mov 0xf0-0x80($inp),%r14d   # rounds, borrow $a1
    mov $ctx,%r15       # borrow $a2
    mov $in0,%rsi       # borrow $a3
    vmovdqu ($ivp),$iv      # load IV
    sub \$9,%r14

    mov $SZ*0(%r15),$A
    mov $SZ*1(%r15),$B
    mov $SZ*2(%r15),$C
    mov $SZ*3(%r15),$D
    mov $SZ*4(%r15),$E
    mov $SZ*5(%r15),$F
    mov $SZ*6(%r15),$G
    mov $SZ*7(%r15),$H

    vmovdqa 0x00(%r13,%r14,8),$mask14
    vmovdqa 0x10(%r13,%r14,8),$mask12
    vmovdqa 0x20(%r13,%r14,8),$mask10
    vmovdqu 0x00-0x80($inp),$roundkey
    jmp .Lloop_xop
___
                    if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align  16
.Lloop_xop:
    vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
    vmovdqu 0x00(%rsi,%r12),@X[0]
    vmovdqu 0x10(%rsi,%r12),@X[1]
    vmovdqu 0x20(%rsi,%r12),@X[2]
    vmovdqu 0x30(%rsi,%r12),@X[3]
    vpshufb $t3,@X[0],@X[0]
    lea $TABLE(%rip),$Tbl
    vpshufb $t3,@X[1],@X[1]
    vpshufb $t3,@X[2],@X[2]
    vpaddd  0x00($Tbl),@X[0],$t0
    vpshufb $t3,@X[3],@X[3]
    vpaddd  0x20($Tbl),@X[1],$t1
    vpaddd  0x40($Tbl),@X[2],$t2
    vpaddd  0x60($Tbl),@X[3],$t3
    vmovdqa $t0,0x00(%rsp)
    mov $A,$a1
    vmovdqa $t1,0x10(%rsp)
    mov $B,$a3
    vmovdqa $t2,0x20(%rsp)
    xor $C,$a3          # magic
    vmovdqa $t3,0x30(%rsp)
    mov $E,$a0
    jmp .Lxop_00_47

.align  16
.Lxop_00_47:
    sub \$-16*2*$SZ,$Tbl    # size optimization
    vmovdqu (%r12),$inout       # $a4
    mov %r12,$_inp      # $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);  # 104 instructions

    &vpalignr   ($t0,@X[1],@X[0],$SZ);  # X[1..4]
      eval(shift(@insns));
      eval(shift(@insns));
     &vpalignr  ($t3,@X[3],@X[2],$SZ);  # X[9..12]
      eval(shift(@insns));
      eval(shift(@insns));
    &vprotd     ($t1,$t0,8*$SZ-$sigma0[1]);
      eval(shift(@insns));
      eval(shift(@insns));
    &vpsrld     ($t0,$t0,$sigma0[2]);
      eval(shift(@insns));
      eval(shift(@insns));
     &vpaddd    (@X[0],@X[0],$t3);  # X[0..3] += X[9..12]
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vprotd     ($t2,$t1,$sigma0[1]-$sigma0[0]);
      eval(shift(@insns));
      eval(shift(@insns));
    &vpxor      ($t0,$t0,$t1);
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
     &vprotd    ($t3,@X[3],8*$SZ-$sigma1[1]);
      eval(shift(@insns));
      eval(shift(@insns));
    &vpxor      ($t0,$t0,$t2);      # sigma0(X[1..4])
      eval(shift(@insns));
      eval(shift(@insns));
     &vpsrld    ($t2,@X[3],$sigma1[2]);
      eval(shift(@insns));
      eval(shift(@insns));
    &vpaddd     (@X[0],@X[0],$t0);  # X[0..3] += sigma0(X[1..4])
      eval(shift(@insns));
      eval(shift(@insns));
     &vprotd    ($t1,$t3,$sigma1[1]-$sigma1[0]);
      eval(shift(@insns));
      eval(shift(@insns));
     &vpxor     ($t3,$t3,$t2);
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
     &vpxor     ($t3,$t3,$t1);      # sigma1(X[14..15])
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vpsrldq    ($t3,$t3,8);
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vpaddd     (@X[0],@X[0],$t3);  # X[0..1] += sigma1(X[14..15])
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
     &vprotd    ($t3,@X[0],8*$SZ-$sigma1[1]);
      eval(shift(@insns));
      eval(shift(@insns));
     &vpsrld    ($t2,@X[0],$sigma1[2]);
      eval(shift(@insns));
      eval(shift(@insns));
     &vprotd    ($t1,$t3,$sigma1[1]-$sigma1[0]);
      eval(shift(@insns));
      eval(shift(@insns));
     &vpxor     ($t3,$t3,$t2);
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
     &vpxor     ($t3,$t3,$t1);      # sigma1(X[16..17])
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vpslldq    ($t3,$t3,8);        # 22 instructions
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vpaddd     (@X[0],@X[0],$t3);  # X[2..3] += sigma1(X[16..17])
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
      eval(shift(@insns));
    &vpaddd     ($t2,@X[0],16*2*$j."($Tbl)");
      foreach (@insns) { eval; }        # remaining instructions
    &vmovdqa    (16*$j."(%rsp)",$t2);
}
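
# In the XOP flavour above, the sigma rotations (sigma0: ROTR 7/18 plus
# SHR 3; sigma1: ROTR 17/19 plus SHR 10) use vprotd, a true SIMD
# rotate; vprotd rotates left, so a right-rotate by r is written as a
# left-rotate by 8*$SZ-r (hence amounts like 8*$SZ-$sigma0[1]), saving
# the shift/shift/xor triples that the plain-AVX path needs.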

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
    &XOP_256_00_47($j,\&body_00_15,@X);
    push(@X,shift(@X));         # rotate(@X)
    }
        &mov        ("%r12",$_inp);     # borrow $a4
    &vpand      ($temp,$temp,$mask14);
    &mov        ("%r15",$_out);     # borrow $a2
    &vpor       ($iv,$iv,$temp);
    &vmovdqu    ("(%r15,%r12)",$iv);    # write output
    &lea        ("%r12","16(%r12)");    # inp++

    &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
    &jne    (".Lxop_00_47");

    &vmovdqu    ($inout,"(%r12)");
    &mov        ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
    foreach(body_00_15()) { eval; }
    }
                    }
$code.=<<___;
    mov $_inp,%r12      # borrow $a4
    mov $_out,%r13      # borrow $a0
    mov $_ctx,%r15      # borrow $a2
    mov $_in0,%rsi      # borrow $a3

    vpand   $mask14,$temp,$temp
    mov $a1,$A
    vpor    $temp,$iv,$iv
    vmovdqu $iv,(%r13,%r12)     # write output
    lea 16(%r12),%r12       # inp++

    add $SZ*0(%r15),$A
    add $SZ*1(%r15),$B
    add $SZ*2(%r15),$C
    add $SZ*3(%r15),$D
    add $SZ*4(%r15),$E
    add $SZ*5(%r15),$F
    add $SZ*6(%r15),$G
    add $SZ*7(%r15),$H

    cmp $_end,%r12

    mov $A,$SZ*0(%r15)
    mov $B,$SZ*1(%r15)
    mov $C,$SZ*2(%r15)
    mov $D,$SZ*3(%r15)
    mov $E,$SZ*4(%r15)
    mov $F,$SZ*5(%r15)
    mov $G,$SZ*6(%r15)
    mov $H,$SZ*7(%r15)

    jb  .Lloop_xop

    mov $_ivp,$ivp
    mov $_rsp,%rsi
    vmovdqu $iv,($ivp)      # output IV
    vzeroall
___
$code.=<<___ if ($win64);
    movaps  `$framesz+16*0`(%rsp),%xmm6
    movaps  `$framesz+16*1`(%rsp),%xmm7
    movaps  `$framesz+16*2`(%rsp),%xmm8
    movaps  `$framesz+16*3`(%rsp),%xmm9
    movaps  `$framesz+16*4`(%rsp),%xmm10
    movaps  `$framesz+16*5`(%rsp),%xmm11
    movaps  `$framesz+16*6`(%rsp),%xmm12
    movaps  `$framesz+16*7`(%rsp),%xmm13
    movaps  `$framesz+16*8`(%rsp),%xmm14
    movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
    mov (%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_xop:
    ret
.size   ${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
.type   ${func}_avx,\@function,6
.align  64
${func}_avx:
.Lavx_shortcut:
    mov `($win64?56:8)`(%rsp),$in0  # load 7th parameter
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    mov %rsp,%r11       # copy %rsp
    sub \$`$framesz+$win64*16*10`,%rsp
    and \$-64,%rsp      # align stack frame

    shl \$6,$len
    sub $inp,$out       # re-bias
    sub $inp,$in0
    add $inp,$len       # end of input

    #mov    $inp,$_inp      # saved later
    mov $out,$_out
    mov $len,$_end
    #mov    $key,$_key      # remains resident in $inp register
    mov $ivp,$_ivp
    mov $ctx,$_ctx
    mov $in0,$_in0
    mov %r11,$_rsp
___
$code.=<<___ if ($win64);
    movaps  %xmm6,`$framesz+16*0`(%rsp)
    movaps  %xmm7,`$framesz+16*1`(%rsp)
    movaps  %xmm8,`$framesz+16*2`(%rsp)
    movaps  %xmm9,`$framesz+16*3`(%rsp)
    movaps  %xmm10,`$framesz+16*4`(%rsp)
    movaps  %xmm11,`$framesz+16*5`(%rsp)
    movaps  %xmm12,`$framesz+16*6`(%rsp)
    movaps  %xmm13,`$framesz+16*7`(%rsp)
    movaps  %xmm14,`$framesz+16*8`(%rsp)
    movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
    vzeroall

    mov $inp,%r12       # borrow $a4
    lea 0x80($key),$inp     # size optimization, reassign
    lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
    mov 0xf0-0x80($inp),%r14d   # rounds, borrow $a1
    mov $ctx,%r15       # borrow $a2
    mov $in0,%rsi       # borrow $a3
    vmovdqu ($ivp),$iv      # load IV
    sub \$9,%r14

    mov $SZ*0(%r15),$A
    mov $SZ*1(%r15),$B
    mov $SZ*2(%r15),$C
    mov $SZ*3(%r15),$D
    mov $SZ*4(%r15),$E
    mov $SZ*5(%r15),$F
    mov $SZ*6(%r15),$G
    mov $SZ*7(%r15),$H

    vmovdqa 0x00(%r13,%r14,8),$mask14
    vmovdqa 0x10(%r13,%r14,8),$mask12
    vmovdqa 0x20(%r13,%r14,8),$mask10
    vmovdqu 0x00-0x80($inp),$roundkey
___
                    if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
    jmp .Lloop_avx
.align  16
.Lloop_avx:
    vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
    vmovdqu 0x00(%rsi,%r12),@X[0]
    vmovdqu 0x10(%rsi,%r12),@X[1]
    vmovdqu 0x20(%rsi,%r12),@X[2]
    vmovdqu 0x30(%rsi,%r12),@X[3]
    vpshufb $t3,@X[0],@X[0]
    lea $TABLE(%rip),$Tbl
    vpshufb $t3,@X[1],@X[1]
    vpshufb $t3,@X[2],@X[2]
    vpaddd  0x00($Tbl),@X[0],$t0
    vpshufb $t3,@X[3],@X[3]
    vpaddd  0x20($Tbl),@X[1],$t1
    vpaddd  0x40($Tbl),@X[2],$t2
    vpaddd  0x60($Tbl),@X[3],$t3
    vmovdqa $t0,0x00(%rsp)
    mov $A,$a1
    vmovdqa $t1,0x10(%rsp)
    mov $B,$a3
    vmovdqa $t2,0x20(%rsp)
    xor $C,$a3          # magic
    vmovdqa $t3,0x30(%rsp)
    mov $E,$a0
    jmp .Lavx_00_47

.align  16
.Lavx_00_47:
    sub \$-16*2*$SZ,$Tbl    # size optimization
    vmovdqu (%r12),$inout       # $a4
    mov %r12,$_inp      # $a4
___
sub Xupdate_256_AVX () {
    (
    '&vpalignr  ($t0,@X[1],@X[0],$SZ)', # X[1..4]
     '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
    '&vpsrld    ($t2,$t0,$sigma0[0]);',
     '&vpaddd   (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
    '&vpsrld    ($t3,$t0,$sigma0[2])',
    '&vpslld    ($t1,$t0,8*$SZ-$sigma0[1]);',
    '&vpxor     ($t0,$t3,$t2)',
     '&vpshufd  ($t3,@X[3],0b11111010)',# X[14..15]
    '&vpsrld    ($t2,$t2,$sigma0[1]-$sigma0[0]);',
    '&vpxor     ($t0,$t0,$t1)',
    '&vpslld    ($t1,$t1,$sigma0[1]-$sigma0[0]);',
    '&vpxor     ($t0,$t0,$t2)',
     '&vpsrld   ($t2,$t3,$sigma1[2]);',
    '&vpxor     ($t0,$t0,$t1)',     # sigma0(X[1..4])
     '&vpsrlq   ($t3,$t3,$sigma1[0]);',
    '&vpaddd    (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
     '&vpxor    ($t2,$t2,$t3);',
     '&vpsrlq   ($t3,$t3,$sigma1[1]-$sigma1[0])',
     '&vpxor    ($t2,$t2,$t3)',     # sigma1(X[14..15])
     '&vpshufd  ($t2,$t2,0b10000100)',
     '&vpsrldq  ($t2,$t2,8)',
    '&vpaddd    (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
     '&vpshufd  ($t3,@X[0],0b01010000)',# X[16..17]
     '&vpsrld   ($t2,$t3,$sigma1[2])',
     '&vpsrlq   ($t3,$t3,$sigma1[0])',
     '&vpxor    ($t2,$t2,$t3);',
     '&vpsrlq   ($t3,$t3,$sigma1[1]-$sigma1[0])',
     '&vpxor    ($t2,$t2,$t3)',
     '&vpshufd  ($t2,$t2,0b11101000)',
     '&vpslldq  ($t2,$t2,8)',
    '&vpaddd    (@X[0],@X[0],$t2)'  # X[2..3] += sigma1(X[16..17])
    );
}
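
# Xupdate_256_AVX computes four message-schedule words per call from
# the SHA256 recurrence
#
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# with sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x) and
# sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x). Lacking a SIMD
# rotate, each rotation is assembled from shift/xor sequences, and the
# sigma1 half runs in two 2-lane steps because X[i-2] depends on words
# produced within the same batch.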

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);  # 104 instructions

    foreach (Xupdate_256_AVX()) {       # 29 instructions
        eval;
        eval(shift(@insns));
        eval(shift(@insns));
        eval(shift(@insns));
    }
    &vpaddd     ($t2,@X[0],16*2*$j."($Tbl)");
      foreach (@insns) { eval; }        # remaining instructions
    &vmovdqa    (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
    &AVX_256_00_47($j,\&body_00_15,@X);
    push(@X,shift(@X));         # rotate(@X)
    }
        &mov        ("%r12",$_inp);     # borrow $a4
    &vpand      ($temp,$temp,$mask14);
    &mov        ("%r15",$_out);     # borrow $a2
    &vpor       ($iv,$iv,$temp);
    &vmovdqu    ("(%r15,%r12)",$iv);    # write output
    &lea        ("%r12","16(%r12)");    # inp++

    &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
    &jne    (".Lavx_00_47");

    &vmovdqu    ($inout,"(%r12)");
    &mov        ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
    foreach(body_00_15()) { eval; }
    }

                    }
$code.=<<___;
    mov $_inp,%r12      # borrow $a4
    mov $_out,%r13      # borrow $a0
    mov $_ctx,%r15      # borrow $a2
    mov $_in0,%rsi      # borrow $a3

    vpand   $mask14,$temp,$temp
    mov $a1,$A
    vpor    $temp,$iv,$iv
    vmovdqu $iv,(%r13,%r12)     # write output
    lea 16(%r12),%r12       # inp++

    add $SZ*0(%r15),$A
    add $SZ*1(%r15),$B
    add $SZ*2(%r15),$C
    add $SZ*3(%r15),$D
    add $SZ*4(%r15),$E
    add $SZ*5(%r15),$F
    add $SZ*6(%r15),$G
    add $SZ*7(%r15),$H

    cmp $_end,%r12

    mov $A,$SZ*0(%r15)
    mov $B,$SZ*1(%r15)
    mov $C,$SZ*2(%r15)
    mov $D,$SZ*3(%r15)
    mov $E,$SZ*4(%r15)
    mov $F,$SZ*5(%r15)
    mov $G,$SZ*6(%r15)
    mov $H,$SZ*7(%r15)
    jb  .Lloop_avx

    mov $_ivp,$ivp
    mov $_rsp,%rsi
    vmovdqu $iv,($ivp)      # output IV
    vzeroall
___
$code.=<<___ if ($win64);
    movaps  `$framesz+16*0`(%rsp),%xmm6
    movaps  `$framesz+16*1`(%rsp),%xmm7
    movaps  `$framesz+16*2`(%rsp),%xmm8
    movaps  `$framesz+16*3`(%rsp),%xmm9
    movaps  `$framesz+16*4`(%rsp),%xmm10
    movaps  `$framesz+16*5`(%rsp),%xmm11
    movaps  `$framesz+16*6`(%rsp),%xmm12
    movaps  `$framesz+16*7`(%rsp),%xmm13
    movaps  `$framesz+16*8`(%rsp),%xmm14
    movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
    mov (%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_avx:
    ret
.size   ${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
    # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
    (
    '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

    '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
    '&and   ($a4,$e)',      # f&e
    '&rorx  ($a0,$e,$Sigma1[2])',
    '&rorx  ($a2,$e,$Sigma1[1])',

    '&lea   ($a,"($a,$a1)")',   # h+=Sigma0(a) from the past
    '&lea   ($h,"($h,$a4)")',
    '&andn  ($a4,$e,$g)',       # ~e&g
    '&xor   ($a0,$a2)',

    '&rorx  ($a1,$e,$Sigma1[0])',
    '&lea   ($h,"($h,$a4)")',   # h+=Ch(e,f,g)=(e&f)+(~e&g)
    '&xor   ($a0,$a1)',     # Sigma1(e)
    '&mov   ($a2,$a)',

    '&rorx  ($a4,$a,$Sigma0[2])',
    '&lea   ($h,"($h,$a0)")',   # h+=Sigma1(e)
    '&xor   ($a2,$b)',      # a^b, b^c in next round
    '&rorx  ($a1,$a,$Sigma0[1])',

    '&rorx  ($a0,$a,$Sigma0[0])',
    '&lea   ($d,"($d,$h)")',    # d+=h
    '&and   ($a3,$a2)',     # (b^c)&(a^b)
    @aesni_cbc_block[$aesni_cbc_idx++].
    '&xor   ($a1,$a4)',

    '&xor   ($a3,$b)',      # Maj(a,b,c)=Ch(a^b,c,b)
    '&xor   ($a1,$a0)',     # Sigma0(a)
    '&lea   ($h,"($h,$a3)");'.  # h+=Maj(a,b,c)
    '&mov   ($a4,$e)',      # copy of f in future

    '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
    );
    # and at the finish one has to $a+=$a1
}
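
# bodyx_00_15 leans on BMI1/BMI2: andn produces ~e&g in one step, rorx
# rotates without disturbing the flags or its source, and
# Ch(e,f,g) = (e&f)^(~e&g) is accumulated with lea as (e&f)+(~e&g),
# which is equivalent because the two terms never have a set bit in
# common.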

$code.=<<___;
.type   ${func}_avx2,\@function,6
.align  64
${func}_avx2:
.Lavx2_shortcut:
    mov `($win64?56:8)`(%rsp),$in0  # load 7th parameter
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    mov %rsp,%r11       # copy %rsp
    sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
    and \$-256*$SZ,%rsp     # align stack frame
    add \$`2*$SZ*($rounds-8)`,%rsp

    shl \$6,$len
    sub $inp,$out       # re-bias
    sub $inp,$in0
    add $inp,$len       # end of input

    #mov    $inp,$_inp      # saved later
    #mov    $out,$_out      # kept in $offload
    mov $len,$_end
    #mov    $key,$_key      # remains resident in $inp register
    mov $ivp,$_ivp
    mov $ctx,$_ctx
    mov $in0,$_in0
    mov %r11,$_rsp
___
$code.=<<___ if ($win64);
    movaps  %xmm6,`$framesz+16*0`(%rsp)
    movaps  %xmm7,`$framesz+16*1`(%rsp)
    movaps  %xmm8,`$framesz+16*2`(%rsp)
    movaps  %xmm9,`$framesz+16*3`(%rsp)
    movaps  %xmm10,`$framesz+16*4`(%rsp)
    movaps  %xmm11,`$framesz+16*5`(%rsp)
    movaps  %xmm12,`$framesz+16*6`(%rsp)
    movaps  %xmm13,`$framesz+16*7`(%rsp)
    movaps  %xmm14,`$framesz+16*8`(%rsp)
    movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
    vzeroall

    mov $inp,%r13       # borrow $a0
    vpinsrq \$1,$out,$offload,$offload
    lea 0x80($key),$inp     # size optimization, reassign
    lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
    mov 0xf0-0x80($inp),%r14d   # rounds, borrow $a1
    mov $ctx,%r15       # borrow $a2
    mov $in0,%rsi       # borrow $a3
    vmovdqu ($ivp),$iv      # load IV
    lea -9(%r14),%r14

    vmovdqa 0x00(%r12,%r14,8),$mask14
    vmovdqa 0x10(%r12,%r14,8),$mask12
    vmovdqa 0x20(%r12,%r14,8),$mask10

    sub \$-16*$SZ,%r13      # inp++, size optimization
    mov $SZ*0(%r15),$A
    lea (%rsi,%r13),%r12    # borrow $a0
    mov $SZ*1(%r15),$B
    cmp $len,%r13       # $_end
    mov $SZ*2(%r15),$C
    cmove   %rsp,%r12       # next block or random data
    mov $SZ*3(%r15),$D
    mov $SZ*4(%r15),$E
    mov $SZ*5(%r15),$F
    mov $SZ*6(%r15),$G
    mov $SZ*7(%r15),$H
    vmovdqu 0x00-0x80($inp),$roundkey
___
                    if ($SZ==4) {   # SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
    jmp .Loop_avx2
.align  16
.Loop_avx2:
    vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
    vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
    vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
    vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
    vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3

    vinserti128 \$1,(%r12),@X[0],@X[0]
    vinserti128 \$1,16(%r12),@X[1],@X[1]
     vpshufb    $t3,@X[0],@X[0]
    vinserti128 \$1,32(%r12),@X[2],@X[2]
     vpshufb    $t3,@X[1],@X[1]
    vinserti128 \$1,48(%r12),@X[3],@X[3]

    lea $TABLE(%rip),$Tbl
    vpshufb $t3,@X[2],@X[2]
    lea -16*$SZ(%r13),%r13
    vpaddd  0x00($Tbl),@X[0],$t0
    vpshufb $t3,@X[3],@X[3]
    vpaddd  0x20($Tbl),@X[1],$t1
    vpaddd  0x40($Tbl),@X[2],$t2
    vpaddd  0x60($Tbl),@X[3],$t3
    vmovdqa $t0,0x00(%rsp)
    xor $a1,$a1
    vmovdqa $t1,0x20(%rsp)
    lea -$PUSH8(%rsp),%rsp
    mov $B,$a3
    vmovdqa $t2,0x00(%rsp)
    xor $C,$a3          # magic
    vmovdqa $t3,0x20(%rsp)
    mov $F,$a4
    sub \$-16*2*$SZ,$Tbl    # size optimization
    jmp .Lavx2_00_47

.align  16
.Lavx2_00_47:
    vmovdqu (%r13),$inout
    vpinsrq \$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);  # 96 instructions
my $base = "+2*$PUSH8(%rsp)";

    &lea    ("%rsp","-$PUSH8(%rsp)")    if (($j%2)==0);
    foreach (Xupdate_256_AVX()) {       # 29 instructions
        eval;
        eval(shift(@insns));
        eval(shift(@insns));
        eval(shift(@insns));
    }
    &vpaddd     ($t2,@X[0],16*2*$j."($Tbl)");
      foreach (@insns) { eval; }        # remaining instructions
    &vmovdqa    ((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
    &AVX2_256_00_47($j,\&bodyx_00_15,@X);
    push(@X,shift(@X));         # rotate(@X)
    }
    &vmovq      ("%r13",$offload);  # borrow $a0
    &vpextrq    ("%r15",$offload,1);    # borrow $a2
    &vpand      ($temp,$temp,$mask14);
    &vpor       ($iv,$iv,$temp);
    &vmovdqu    ("(%r15,%r13)",$iv);    # write output
    &lea        ("%r13","16(%r13)");    # inp++

    &lea    ($Tbl,16*2*$SZ."($Tbl)");
    &cmpb   (($SZ-1)."($Tbl)",0);
    &jne    (".Lavx2_00_47");

    &vmovdqu    ($inout,"(%r13)");
    &vpinsrq    ($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
    foreach(bodyx_00_15()) { eval; }
    }
                    }
$code.=<<___;
    vpextrq \$1,$offload,%r12       # $_out, borrow $a4
    vmovq   $offload,%r13           # $_inp, borrow $a0
    mov `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
    add $a1,$A
    lea `2*$SZ*($rounds-8)`(%rsp),$Tbl

    vpand   $mask14,$temp,$temp
    vpor    $temp,$iv,$iv
    vmovdqu $iv,(%r12,%r13)         # write output
    lea 16(%r13),%r13

    add $SZ*0(%r15),$A
    add $SZ*1(%r15),$B
    add $SZ*2(%r15),$C
    add $SZ*3(%r15),$D
    add $SZ*4(%r15),$E
    add $SZ*5(%r15),$F
    add $SZ*6(%r15),$G
    add $SZ*7(%r15),$H

    mov $A,$SZ*0(%r15)
    mov $B,$SZ*1(%r15)
    mov $C,$SZ*2(%r15)
    mov $D,$SZ*3(%r15)
    mov $E,$SZ*4(%r15)
    mov $F,$SZ*5(%r15)
    mov $G,$SZ*6(%r15)
    mov $H,$SZ*7(%r15)

    cmp `$PUSH8+2*8`($Tbl),%r13     # $_end
    je  .Ldone_avx2

    xor $a1,$a1
    mov $B,$a3
    mov $F,$a4
    xor $C,$a3          # magic
    jmp .Lower_avx2
.align  16
.Lower_avx2:
    vmovdqu (%r13),$inout
    vpinsrq \$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
    my $base="+16($Tbl)";
    foreach(bodyx_00_15()) { eval; }
    &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
    }
$code.=<<___;
    vmovq   $offload,%r13           # borrow $a0
    vpextrq \$1,$offload,%r15       # borrow $a2
    vpand   $mask14,$temp,$temp
    vpor    $temp,$iv,$iv
    lea -$PUSH8($Tbl),$Tbl
    vmovdqu $iv,(%r15,%r13)         # write output
    lea 16(%r13),%r13           # inp++
    cmp %rsp,$Tbl
    jae .Lower_avx2

    mov `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
    lea 16*$SZ(%r13),%r13
    mov `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
    add $a1,$A
    lea `2*$SZ*($rounds-8)`(%rsp),%rsp

    add $SZ*0(%r15),$A
    add $SZ*1(%r15),$B
    add $SZ*2(%r15),$C
    add $SZ*3(%r15),$D
    add $SZ*4(%r15),$E
    add $SZ*5(%r15),$F
    add $SZ*6(%r15),$G
    lea (%rsi,%r13),%r12
    add $SZ*7(%r15),$H

    cmp $_end,%r13

    mov $A,$SZ*0(%r15)
    cmove   %rsp,%r12       # next block or stale data
    mov $B,$SZ*1(%r15)
    mov $C,$SZ*2(%r15)
    mov $D,$SZ*3(%r15)
    mov $E,$SZ*4(%r15)
    mov $F,$SZ*5(%r15)
    mov $G,$SZ*6(%r15)
    mov $H,$SZ*7(%r15)

    jbe .Loop_avx2
    lea (%rsp),$Tbl

.Ldone_avx2:
    lea ($Tbl),%rsp
    mov $_ivp,$ivp
    mov $_rsp,%rsi
    vmovdqu $iv,($ivp)      # output IV
    vzeroall
___
$code.=<<___ if ($win64);
    movaps  `$framesz+16*0`(%rsp),%xmm6
    movaps  `$framesz+16*1`(%rsp),%xmm7
    movaps  `$framesz+16*2`(%rsp),%xmm8
    movaps  `$framesz+16*3`(%rsp),%xmm9
    movaps  `$framesz+16*4`(%rsp),%xmm10
    movaps  `$framesz+16*5`(%rsp),%xmm11
    movaps  `$framesz+16*6`(%rsp),%xmm12
    movaps  `$framesz+16*7`(%rsp),%xmm13
    movaps  `$framesz+16*8`(%rsp),%xmm14
    movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
    mov (%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_avx2:
    ret
.size   ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
    movups      `16*$n`($in0),$in       # load input
    xorps       $rndkey0,$in
___
      $code.=<<___ if ($n);
    movups      $iv,`16*($n-1)`($out,$in0)  # write output
___
      $code.=<<___;
    xorps       $in,$iv
    movups      `32+16*$k-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
    cmp     \$11,$rounds
    jb      .Laesenclast$sn
    movups      `32+16*($k+0)-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
    movups      `32+16*($k+1)-112`($key),$rndkey[0]
    aesenc      $rndkey[1],$iv
    je      .Laesenclast$sn
    movups      `32+16*($k+2)-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
    movups      `32+16*($k+3)-112`($key),$rndkey[0]
    aesenc      $rndkey[1],$iv
.Laesenclast$sn:
    aesenclast  $rndkey[0],$iv
    movups      16-112($key),$rndkey[1]     # forward reference
    nop
___
    } else {
      $code.=<<___;
    movups      `32+16*$k-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
___
    }
    $r++;   unshift(@rndkey,pop(@rndkey));
};
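
# The closure above emits one AES round (or the key-length tail) per
# call, spreading a block's encryption across the surrounding SHA256
# code: $r/10 selects the data block, $r%10 the round within it, and
# the $k==9 tail branches on $rounds (loaded from 240($key)) to decide
# how many extra rounds to run before reaching aesenclast.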

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type   ${func}_shaext,\@function,6
.align  32
${func}_shaext:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
___
$code.=<<___ if ($win64);
    lea `-8-10*16`(%rsp),%rsp
    movaps  %xmm6,-8-10*16(%rax)
    movaps  %xmm7,-8-9*16(%rax)
    movaps  %xmm8,-8-8*16(%rax)
    movaps  %xmm9,-8-7*16(%rax)
    movaps  %xmm10,-8-6*16(%rax)
    movaps  %xmm11,-8-5*16(%rax)
    movaps  %xmm12,-8-4*16(%rax)
    movaps  %xmm13,-8-3*16(%rax)
    movaps  %xmm14,-8-2*16(%rax)
    movaps  %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
    lea     K256+0x80(%rip),$Tbl
    movdqu      ($ctx),$ABEF        # DCBA
    movdqu      16($ctx),$CDGH      # HGFE
    movdqa      0x200-0x80($Tbl),$TMP   # byte swap mask

    mov     240($key),$rounds
    sub     $in0,$out
    movups      ($key),$rndkey0     # $key[0]
    movups      ($ivp),$iv      # load IV
    movups      16($key),$rndkey[0] # forward reference
    lea     112($key),$key      # size optimization

    pshufd      \$0x1b,$ABEF,$Wi    # ABCD
    pshufd      \$0xb1,$ABEF,$ABEF  # CDAB
    pshufd      \$0x1b,$CDGH,$CDGH  # EFGH
    movdqa      $TMP,$BSWAP     # offload
    palignr     \$8,$CDGH,$ABEF     # ABEF
    punpcklqdq  $Wi,$CDGH       # CDGH

    jmp .Loop_shaext

.align  16
.Loop_shaext:
    movdqu      ($inp),@MSG[0]
    movdqu      0x10($inp),@MSG[1]
    movdqu      0x20($inp),@MSG[2]
    pshufb      $TMP,@MSG[0]
    movdqu      0x30($inp),@MSG[3]

    movdqa      0*32-0x80($Tbl),$Wi
    paddd       @MSG[0],$Wi
    pshufb      $TMP,@MSG[1]
    movdqa      $CDGH,$CDGH_SAVE    # offload
    movdqa      $ABEF,$ABEF_SAVE    # offload
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 0-3
    pshufd      \$0x0e,$Wi,$Wi
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF

    movdqa      1*32-0x80($Tbl),$Wi
    paddd       @MSG[1],$Wi
    pshufb      $TMP,@MSG[2]
    lea     0x40($inp),$inp
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 4-7
    pshufd      \$0x0e,$Wi,$Wi
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF

    movdqa      2*32-0x80($Tbl),$Wi
    paddd       @MSG[2],$Wi
    pshufb      $TMP,@MSG[3]
    sha256msg1  @MSG[1],@MSG[0]
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 8-11
    pshufd      \$0x0e,$Wi,$Wi
    movdqa      @MSG[3],$TMP
    palignr     \$4,@MSG[2],$TMP
    paddd       $TMP,@MSG[0]
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF

    movdqa      3*32-0x80($Tbl),$Wi
    paddd       @MSG[3],$Wi
    sha256msg2  @MSG[3],@MSG[0]
    sha256msg1  @MSG[2],@MSG[1]
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 12-15
    pshufd      \$0x0e,$Wi,$Wi
___
    &$aesenc();
$code.=<<___;
    movdqa      @MSG[0],$TMP
    palignr     \$4,@MSG[3],$TMP
    paddd       $TMP,@MSG[1]
    sha256rnds2 $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
    &$aesenc()  if (($r%10)==0);
$code.=<<___;
    movdqa      $i*32-0x80($Tbl),$Wi
    paddd       @MSG[0],$Wi
    sha256msg2  @MSG[0],@MSG[1]
    sha256msg1  @MSG[3],@MSG[2]
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 16-19...
    pshufd      \$0x0e,$Wi,$Wi
    movdqa      @MSG[1],$TMP
    palignr     \$4,@MSG[0],$TMP
    paddd       $TMP,@MSG[2]
___
    &$aesenc();
    &$aesenc()  if ($r==19);
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF
___
    push(@MSG,shift(@MSG));
}
$code.=<<___;
    movdqa      13*32-0x80($Tbl),$Wi
    paddd       @MSG[0],$Wi
    sha256msg2  @MSG[0],@MSG[1]
    sha256msg1  @MSG[3],@MSG[2]
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 52-55
    pshufd      \$0x0e,$Wi,$Wi
    movdqa      @MSG[1],$TMP
    palignr     \$4,@MSG[0],$TMP
    paddd       $TMP,@MSG[2]
___
    &$aesenc();
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF

    movdqa      14*32-0x80($Tbl),$Wi
    paddd       @MSG[1],$Wi
    sha256msg2  @MSG[1],@MSG[2]
    movdqa      $BSWAP,$TMP
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 56-59
    pshufd      \$0x0e,$Wi,$Wi
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF

    movdqa      15*32-0x80($Tbl),$Wi
    paddd       @MSG[2],$Wi
___
    &$aesenc();
    &$aesenc();
$code.=<<___;
    sha256rnds2 $ABEF,$CDGH     # 60-63
    pshufd      \$0x0e,$Wi,$Wi
___
    &$aesenc();
$code.=<<___;
    sha256rnds2 $CDGH,$ABEF
    #pxor       $CDGH,$rndkey0      # black magic
___
    while ($r<40)   { &$aesenc(); }     # remaining aesenc's
$code.=<<___;
    #xorps      $CDGH,$rndkey0      # black magic
    paddd       $CDGH_SAVE,$CDGH
    paddd       $ABEF_SAVE,$ABEF

    dec     $len
    movups      $iv,48($out,$in0)   # write output
    lea     64($in0),$in0
    jnz     .Loop_shaext

    pshufd      \$0xb1,$CDGH,$CDGH  # DCHG
    pshufd      \$0x1b,$ABEF,$TMP   # FEBA
    pshufd      \$0xb1,$ABEF,$ABEF  # BAFE
    punpckhqdq  $CDGH,$ABEF     # DCBA
    palignr     \$8,$TMP,$CDGH      # HGFE

    movups      $iv,($ivp)      # write IV
    movdqu      $ABEF,($ctx)
    movdqu      $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
    movaps  0*16(%rsp),%xmm6
    movaps  1*16(%rsp),%xmm7
    movaps  2*16(%rsp),%xmm8
    movaps  3*16(%rsp),%xmm9
    movaps  4*16(%rsp),%xmm10
    movaps  5*16(%rsp),%xmm11
    movaps  6*16(%rsp),%xmm12
    movaps  7*16(%rsp),%xmm13
    movaps  8*16(%rsp),%xmm14
    movaps  9*16(%rsp),%xmm15
    lea 8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
    ret
.size   ${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#       CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub \$64,%rsp

    mov 120($context),%rax  # pull context->Rax
    mov 248($context),%rbx  # pull context->Rip

    mov 8($disp),%rsi       # disp->ImageBase
    mov 56($disp),%r11      # disp->HandlerData

    mov 0(%r11),%r10d       # HandlerData[0]
    lea (%rsi,%r10),%r10    # prologue label
    cmp %r10,%rbx       # context->Rip<prologue label
    jb  .Lin_prologue

    mov 152($context),%rax  # pull context->Rsp

    mov 4(%r11),%r10d       # HandlerData[1]
    lea (%rsi,%r10),%r10    # epilogue label
    cmp %r10,%rbx       # context->Rip>=epilogue label
    jae .Lin_prologue
___
$code.=<<___ if ($shaext);
    lea aesni_cbc_sha256_enc_shaext(%rip),%r10
    cmp %r10,%rbx
    jb  .Lnot_in_shaext

    lea (%rax),%rsi
    lea 512($context),%rdi  # &context.Xmm6
    mov \$20,%ecx
    .long   0xa548f3fc      # cld; rep movsq
    lea 168(%rax),%rax      # adjust stack pointer
    jmp .Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
    lea .Lavx2_shortcut(%rip),%r10
    cmp %r10,%rbx       # context->Rip<avx2_shortcut
    jb  .Lnot_in_avx2

    and \$-256*$SZ,%rax
    add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
    mov %rax,%rsi       # put aside Rsp
    mov 16*$SZ+7*8(%rax),%rax   # pull $_rsp
    lea 48(%rax),%rax

    mov -8(%rax),%rbx
    mov -16(%rax),%rbp
    mov -24(%rax),%r12
    mov -32(%rax),%r13
    mov -40(%rax),%r14
    mov -48(%rax),%r15
    mov %rbx,144($context)  # restore context->Rbx
    mov %rbp,160($context)  # restore context->Rbp
    mov %r12,216($context)  # restore context->R12
    mov %r13,224($context)  # restore context->R13
    mov %r14,232($context)  # restore context->R14
    mov %r15,240($context)  # restore context->R15

    lea 16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
    lea 512($context),%rdi  # &context.Xmm6
    mov \$20,%ecx
    .long   0xa548f3fc      # cld; rep movsq

.Lin_prologue:
    mov 8(%rax),%rdi
    mov 16(%rax),%rsi
    mov %rax,152($context)  # restore context->Rsp
    mov %rsi,168($context)  # restore context->Rsi
    mov %rdi,176($context)  # restore context->Rdi

    mov 40($disp),%rdi      # disp->ContextRecord
    mov $context,%rsi       # context
    mov \$154,%ecx      # sizeof(CONTEXT)
    .long   0xa548f3fc      # cld; rep movsq

    mov $disp,%rsi
    xor %rcx,%rcx       # arg1, UNW_FLAG_NHANDLER
    mov 8(%rsi),%rdx        # arg2, disp->ImageBase
    mov 0(%rsi),%r8     # arg3, disp->ControlPc
    mov 16(%rsi),%r9        # arg4, disp->FunctionEntry
    mov 40(%rsi),%r10       # disp->ContextRecord
    lea 56(%rsi),%r11       # &disp->HandlerData
    lea 24(%rsi),%r12       # &disp->EstablisherFrame
    mov %r10,32(%rsp)       # arg5
    mov %r11,40(%rsp)       # arg6
    mov %r12,48(%rsp)       # arg7
    mov %rcx,56(%rsp)       # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov \$1,%eax        # ExceptionContinueSearch
    add \$64,%rsp
    popfq
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx
    pop %rdi
    pop %rsi
    ret
.size   se_handler,.-se_handler

.section    .pdata
    .rva    .LSEH_begin_${func}_xop
    .rva    .LSEH_end_${func}_xop
    .rva    .LSEH_info_${func}_xop

    .rva    .LSEH_begin_${func}_avx
    .rva    .LSEH_end_${func}_avx
    .rva    .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
    .rva    .LSEH_begin_${func}_avx2
    .rva    .LSEH_end_${func}_avx2
    .rva    .LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
    .rva    .LSEH_begin_${func}_shaext
    .rva    .LSEH_end_${func}_shaext
    .rva    .LSEH_info_${func}_shaext
___
$code.=<<___;
.section    .xdata
.align  8
.LSEH_info_${func}_xop:
    .byte   9,0,0,0
    .rva    se_handler
    .rva    .Lprologue_xop,.Lepilogue_xop       # HandlerData[]

.LSEH_info_${func}_avx:
    .byte   9,0,0,0
    .rva    se_handler
    .rva    .Lprologue_avx,.Lepilogue_avx       # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
    .byte   9,0,0,0
    .rva    se_handler
    .rva    .Lprologue_avx2,.Lepilogue_avx2     # HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
    .byte   9,0,0,0
    .rva    se_handler
    .rva    .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04          if($dst>=8);
    $rex|=0x01          if($src>=8);
    unshift @opcode,$rex|0x40   if($rex);
}

{
  my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd   );

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
    rex(\@opcode,$2,$1);
    push @opcode,$opcodelet{$instr};
    push @opcode,0xc0|($1&7)|(($2&7)<<3);       # ModR/M
    return ".byte\t".join(',',@opcode);
    } else {
    return $instr."\t".@_[0];
    }
  }
}
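
# Example of the fallback encoding for assemblers without SHA-extension
# support: "sha256rnds2 %xmm0,%xmm2" (with its implicit third operand
# %xmm0) becomes ".byte 0x0f,0x38,0xcb,0xd0" -- opcode 0f 38 cb plus a
# ModR/M byte carrying the destination in the reg field and the source
# in r/m, with a REX prefix prepended when either register index is 8
# or above.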

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;