"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aesni-mb-x86_64.pl" (20 Nov 2018, 35971 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service, "Fossies" has tried to format the requested source page as HTML using (guessed) Perl source-code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aesni-mb-x86_64.pl", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 
   10 # Multi-buffer AES-NI procedures process several independent buffers
   11 # in parallel by interleaving independent instructions.
   12 #
   13 # Cycles per byte for interleave factor 4:
   14 #
   15 #           asymptotic  measured
   16 #           ---------------------------
   17 # Westmere      5.00/4=1.25 5.13/4=1.28
   18 # Atom          15.0/4=3.75 ?15.7/4=3.93
   19 # Sandy Bridge      5.06/4=1.27 5.18/4=1.29
   20 # Ivy Bridge        5.06/4=1.27 5.14/4=1.29
   21 # Haswell       4.44/4=1.11 4.44/4=1.11
   22 # Bulldozer     5.75/4=1.44 5.76/4=1.44
   23 #
   24 # Cycles per byte for interleave factor 8 (not implemented for
   25 # pre-AVX processors, where higher interleave factor incidentally
   26 # doesn't result in improvement):
   27 #
   28 #           asymptotic  measured
   29 #           ---------------------------
   30 # Sandy Bridge      5.06/8=0.64 7.10/8=0.89(*)
   31 # Ivy Bridge        5.06/8=0.64 7.14/8=0.89(*)
   32 # Haswell       5.00/8=0.63 5.00/8=0.63
   33 # Bulldozer     5.75/8=0.72 5.77/8=0.72
   34 #
   35 # (*)   Sandy/Ivy Bridge are known to handle high interleave factors
   36 #   suboptimally;
   37 
   38 $flavour = shift;           # 1st command-line argument
   39 $output  = shift;           # 2nd command-line argument
   40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }   # a first argument containing '.' is taken as the output file; flavour is then left undefined
   41 # Win64 code paths are selected by an nasm/masm/mingw64 flavour or by an output name ending in .asm.
   42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   43 # Locate the x86_64-xlate.pl translator relative to the directory this script was started from.
   44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;   # $dir = directory part of $0, including the trailing / or \
   45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or     # first choice: next to this script
   46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or    # second choice: ../../perlasm/
   47 die "can't locate x86_64-xlate.pl";     # neither candidate is a plain file
   48 
   49 $avx=0;     # 0 = toolchain cannot assemble the AVX path; any non-zero value enables the `if ($avx)` sections below
   50 # GNU assembler, queried through $ENV{CC}: version 2.19 or newer yields 1, 2.22 or newer yields 2.
   51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
   52         =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
   53     $avx = ($1>=2.19) + ($1>=2.22);
   54 }
   55 # Win64 with nasm (only tried if nothing was detected above): 2.09 or newer yields 1, 2.10 or newer yields 2.
   56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
   57        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
   58     $avx = ($1>=2.09) + ($1>=2.10);
   59 }
   60 # Win64 with MASM (ml64): major version 10 yields 1, 11 or newer yields 2.
   61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
   62        `ml64 2>&1` =~ /Version ([0-9]+)\./) {
   63     $avx = ($1>=10) + ($1>=11);
   64 }
   65 # clang/LLVM: 3.0 yields 1, anything newer yields 2. The major version may have several digits (clang 10 and later), so do not restrict it to a single [3-9] digit.
   66 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
   67     $avx = ($2>=3.0) + ($2>3.0);
   68 }
   69 
   70 open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";    # pipe into the translator; quote its path and fail loudly if the pipe cannot be created
   71 *STDOUT=*OUT;   # alias STDOUT to the pipe so that printed output goes through the translator
   72 
   73 # void aesni_multi_cbc_encrypt (
   74 #     struct {  void *inp,*out; int blocks; double iv[2]; } inp[8];
   75 #     const AES_KEY *key,
   76 #     int num);     /* 1 or 2 */
   77 # The code below addresses each descriptor as 40 bytes: inp at +0, out at +8, blocks at +16, iv at +24.
   78 $inp="%rdi";    # 1st arg
   79 $key="%rsi";    # 2nd arg
   80 $num="%edx";    # 3rd arg; the 4x code runs its outer loop this many times, four descriptors per pass
   81 # Per-lane input pointers (r8-r11) and output pointers (r12-r15) of the 4x code path.
   82 @inptr=map("%r$_",(8..11));
   83 @outptr=map("%r$_",(12..15));
   84 # xmm0/xmm1 alternate as round-key registers; xmm2-5 hold the four cipher states; xmm6-9 hold input blocks (encrypt) or chaining values (decrypt).
   85 ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
   86 @out=map("%xmm$_",(2..5));
   87 @inp=map("%xmm$_",(6..9));
   88 ($counters,$mask,$zero)=map("%xmm$_",(10..12));
   89 # %eax = round count read from the key schedule, %ecx = constant 1 (also scratch), %rbp = sink pointer, %rbx = running byte offset.
   90 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
   91 
   92 $code.=<<___;
   93 .text
   94 
   95 .extern OPENSSL_ia32cap_P
   96 
   97 .globl  aesni_multi_cbc_encrypt
   98 .type   aesni_multi_cbc_encrypt,\@function,3
   99 .align  32
  100 aesni_multi_cbc_encrypt:
  101 ___
  102 $code.=<<___ if ($avx);
  103     cmp \$2,$num
  104     jb  .Lenc_non_avx
  105     mov OPENSSL_ia32cap_P+4(%rip),%ecx
  106     test    \$`1<<28`,%ecx          # AVX bit
  107     jnz _avx_cbc_enc_shortcut
  108     jmp .Lenc_non_avx
  109 .align  16
  110 .Lenc_non_avx:
  111 ___
  112 $code.=<<___;
  113     mov %rsp,%rax
  114     push    %rbx
  115     push    %rbp
  116     push    %r12
  117     push    %r13
  118     push    %r14
  119     push    %r15
  120 ___
  121 $code.=<<___ if ($win64);
  122     lea -0xa8(%rsp),%rsp
  123     movaps  %xmm6,(%rsp)
  124     movaps  %xmm7,0x10(%rsp)
  125     movaps  %xmm8,0x20(%rsp)
  126     movaps  %xmm9,0x30(%rsp)
  127     movaps  %xmm10,0x40(%rsp)
  128     movaps  %xmm11,0x50(%rsp)
  129     movaps  %xmm12,0x60(%rsp)
  130     movaps  %xmm13,-0x68(%rax)  # not used, saved to share se_handler 
  131     movaps  %xmm14,-0x58(%rax)
  132     movaps  %xmm15,-0x48(%rax)
  133 ___
  134 $code.=<<___;
  135     # stack layout
  136     #
  137     # +0    output sink
  138     # +16   input sink [original %rsp and $num]
  139     # +32   counters
  140 
  141     sub \$48,%rsp
  142     and \$-64,%rsp
  143     mov %rax,16(%rsp)           # original %rsp
  144 
  145 .Lenc4x_body:
  146     movdqu  ($key),$zero            # 0-round key
  147     lea 0x78($key),$key         # size optimization
  148     lea 40*2($inp),$inp
  149 
  150 .Lenc4x_loop_grande:
  151     mov $num,24(%rsp)           # original $num
  152     xor $num,$num
  153 ___
  154 for($i=0;$i<4;$i++) {   # emitted once per lane: load descriptor $i (pointers, block count, IV)
  155     $code.=<<___;       # offsets carry -40*2 because $inp was advanced by 40*2 before the outer loop
  156     mov `40*$i+16-40*2`($inp),$one  # borrow $one for number of blocks
  157     mov `40*$i+0-40*2`($inp),@inptr[$i]
  158     cmp $num,$one
  159     mov `40*$i+8-40*2`($inp),@outptr[$i]
  160     cmovg   $one,$num           # find maximum
  161     test    $one,$one
  162     movdqu  `40*$i+24-40*2`($inp),@out[$i]  # load IV
  163     mov $one,`32+4*$i`(%rsp)        # initialize counters
  164     cmovle  %rsp,@inptr[$i]         # cancel input
  165 ___
  166 }   # after the four copies $num holds the largest block count among the lanes
  167 $code.=<<___;   # skip the group if no lane has work; otherwise whiten the IVs and first inputs, then open the main loop with the first AES round and an input prefetch for each of the four lanes
  168     test    $num,$num
  169     jz  .Lenc4x_done
  170 
  171     movups  0x10-0x78($key),$rndkey1
  172      pxor   $zero,@out[0]
  173     movups  0x20-0x78($key),$rndkey0
  174      pxor   $zero,@out[1]
  175     mov 0xf0-0x78($key),$rounds
  176      pxor   $zero,@out[2]
  177     movdqu  (@inptr[0]),@inp[0]     # load inputs
  178      pxor   $zero,@out[3]
  179     movdqu  (@inptr[1]),@inp[1]
  180      pxor   @inp[0],@out[0]
  181     movdqu  (@inptr[2]),@inp[2]
  182      pxor   @inp[1],@out[1]
  183     movdqu  (@inptr[3]),@inp[3]
  184      pxor   @inp[2],@out[2]
  185      pxor   @inp[3],@out[3]
  186     movdqa  32(%rsp),$counters      # load counters
  187     xor $offset,$offset
  188     jmp .Loop_enc4x
  189 
  190 .align  32
  191 .Loop_enc4x:
  192     add \$16,$offset
  193     lea 16(%rsp),$sink          # sink pointer
  194     mov \$1,$one            # constant of 1
  195     sub $offset,$sink
  196 
  197     aesenc      $rndkey1,@out[0]
  198     prefetcht0  31(@inptr[0],$offset)   # prefetch input
  199     prefetcht0  31(@inptr[1],$offset)
  200     aesenc      $rndkey1,@out[1]
  201     prefetcht0  31(@inptr[2],$offset)
  202     prefetcht0  31(@inptr[3],$offset)
  203     aesenc      $rndkey1,@out[2]
  204     aesenc      $rndkey1,@out[3]
  205     movups      0x30-0x78($key),$rndkey1
  206 ___
  207 for($i=0;$i<4;$i++) {   # emits four AES rounds (round keys 0x20..0x50); the copy for round $i also updates lane $i's pointers
  208 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;  # alternate between the two round-key registers
  209 $code.=<<___;   # cmp yields 1 - counter: input goes to the sink when counter <= 1, output when counter < 1; the register just used is then refilled with the key two rounds ahead
  210      cmp        `32+4*$i`(%rsp),$one
  211     aesenc      $rndkey,@out[0]
  212     aesenc      $rndkey,@out[1]
  213     aesenc      $rndkey,@out[2]
  214      cmovge     $sink,@inptr[$i]    # cancel input
  215      cmovg      $sink,@outptr[$i]   # sink output
  216     aesenc      $rndkey,@out[3]
  217     movups      `0x40+16*$i-0x78`($key),$rndkey
  218 ___
  219 }
  220 $code.=<<___;
  221      movdqa     $counters,$mask
  222     aesenc      $rndkey0,@out[0]
  223     prefetcht0  15(@outptr[0],$offset)  # prefetch output
  224     prefetcht0  15(@outptr[1],$offset)
  225     aesenc      $rndkey0,@out[1]
  226     prefetcht0  15(@outptr[2],$offset)
  227     prefetcht0  15(@outptr[3],$offset)
  228     aesenc      $rndkey0,@out[2]
  229     aesenc      $rndkey0,@out[3]
  230     movups      0x80-0x78($key),$rndkey0
  231      pxor       $zero,$zero
  232 
  233     aesenc      $rndkey1,@out[0]
  234      pcmpgtd    $zero,$mask
  235      movdqu     -0x78($key),$zero   # reload 0-round key
  236     aesenc      $rndkey1,@out[1]
  237      paddd      $mask,$counters     # decrement counters
  238      movdqa     $counters,32(%rsp)  # update counters
  239     aesenc      $rndkey1,@out[2]
  240     aesenc      $rndkey1,@out[3]
  241     movups      0x90-0x78($key),$rndkey1
  242 
  243     cmp \$11,$rounds
  244 
  245     aesenc      $rndkey0,@out[0]
  246     aesenc      $rndkey0,@out[1]
  247     aesenc      $rndkey0,@out[2]
  248     aesenc      $rndkey0,@out[3]
  249     movups      0xa0-0x78($key),$rndkey0
  250 
  251     jb  .Lenc4x_tail
  252 
  253     aesenc      $rndkey1,@out[0]
  254     aesenc      $rndkey1,@out[1]
  255     aesenc      $rndkey1,@out[2]
  256     aesenc      $rndkey1,@out[3]
  257     movups      0xb0-0x78($key),$rndkey1
  258 
  259     aesenc      $rndkey0,@out[0]
  260     aesenc      $rndkey0,@out[1]
  261     aesenc      $rndkey0,@out[2]
  262     aesenc      $rndkey0,@out[3]
  263     movups      0xc0-0x78($key),$rndkey0
  264 
  265     je  .Lenc4x_tail
  266 
  267     aesenc      $rndkey1,@out[0]
  268     aesenc      $rndkey1,@out[1]
  269     aesenc      $rndkey1,@out[2]
  270     aesenc      $rndkey1,@out[3]
  271     movups      0xd0-0x78($key),$rndkey1
  272 
  273     aesenc      $rndkey0,@out[0]
  274     aesenc      $rndkey0,@out[1]
  275     aesenc      $rndkey0,@out[2]
  276     aesenc      $rndkey0,@out[3]
  277     movups      0xe0-0x78($key),$rndkey0
  278     jmp .Lenc4x_tail
  279 
  280 .align  32
  281 .Lenc4x_tail:
  282     aesenc      $rndkey1,@out[0]
  283     aesenc      $rndkey1,@out[1]
  284     aesenc      $rndkey1,@out[2]
  285     aesenc      $rndkey1,@out[3]
  286      movdqu     (@inptr[0],$offset),@inp[0]
  287     movdqu      0x10-0x78($key),$rndkey1
  288 
  289     aesenclast  $rndkey0,@out[0]
  290      movdqu     (@inptr[1],$offset),@inp[1]
  291      pxor       $zero,@inp[0]
  292     aesenclast  $rndkey0,@out[1]
  293      movdqu     (@inptr[2],$offset),@inp[2]
  294      pxor       $zero,@inp[1]
  295     aesenclast  $rndkey0,@out[2]
  296      movdqu     (@inptr[3],$offset),@inp[3]
  297      pxor       $zero,@inp[2]
  298     aesenclast  $rndkey0,@out[3]
  299     movdqu      0x20-0x78($key),$rndkey0
  300      pxor       $zero,@inp[3]
  301 
  302     movups      @out[0],-16(@outptr[0],$offset)
  303      pxor       @inp[0],@out[0]
  304     movups      @out[1],-16(@outptr[1],$offset) 
  305      pxor       @inp[1],@out[1]
  306     movups      @out[2],-16(@outptr[2],$offset) 
  307      pxor       @inp[2],@out[2]
  308     movups      @out[3],-16(@outptr[3],$offset)
  309      pxor       @inp[3],@out[3]
  310 
  311     dec $num
  312     jnz .Loop_enc4x
  313 
  314     mov 16(%rsp),%rax           # original %rsp
  315     mov 24(%rsp),$num
  316 
  317     #pxor   @inp[0],@out[0]
  318     #pxor   @inp[1],@out[1]
  319     #movdqu @out[0],`40*0+24-40*2`($inp)    # output iv FIX ME!
  320     #pxor   @inp[2],@out[2]
  321     #movdqu @out[1],`40*1+24-40*2`($inp)
  322     #pxor   @inp[3],@out[3]
  323     #movdqu @out[2],`40*2+24-40*2`($inp)    # won't fix, let caller
  324     #movdqu @out[3],`40*3+24-40*2`($inp)    # figure this out...
  325 
  326     lea `40*4`($inp),$inp
  327     dec $num
  328     jnz .Lenc4x_loop_grande
  329 
  330 .Lenc4x_done:
  331 ___
  332 $code.=<<___ if ($win64);
  333     movaps  -0xd8(%rax),%xmm6
  334     movaps  -0xc8(%rax),%xmm7
  335     movaps  -0xb8(%rax),%xmm8
  336     movaps  -0xa8(%rax),%xmm9
  337     movaps  -0x98(%rax),%xmm10
  338     movaps  -0x88(%rax),%xmm11
  339     movaps  -0x78(%rax),%xmm12
  340     #movaps -0x68(%rax),%xmm13
  341     #movaps -0x58(%rax),%xmm14
  342     #movaps -0x48(%rax),%xmm15
  343 ___
  344 $code.=<<___;
  345     mov -48(%rax),%r15
  346     mov -40(%rax),%r14
  347     mov -32(%rax),%r13
  348     mov -24(%rax),%r12
  349     mov -16(%rax),%rbp
  350     mov -8(%rax),%rbx
  351     lea (%rax),%rsp
  352 .Lenc4x_epilogue:
  353     ret
  354 .size   aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
  355 
  356 .globl  aesni_multi_cbc_decrypt
  357 .type   aesni_multi_cbc_decrypt,\@function,3
  358 .align  32
  359 aesni_multi_cbc_decrypt:
  360 ___
  361 $code.=<<___ if ($avx);
  362     cmp \$2,$num
  363     jb  .Ldec_non_avx
  364     mov OPENSSL_ia32cap_P+4(%rip),%ecx
  365     test    \$`1<<28`,%ecx          # AVX bit
  366     jnz _avx_cbc_dec_shortcut
  367     jmp .Ldec_non_avx
  368 .align  16
  369 .Ldec_non_avx:
  370 ___
  371 $code.=<<___;
  372     mov %rsp,%rax
  373     push    %rbx
  374     push    %rbp
  375     push    %r12
  376     push    %r13
  377     push    %r14
  378     push    %r15
  379 ___
  380 $code.=<<___ if ($win64);
  381     lea -0xa8(%rsp),%rsp
  382     movaps  %xmm6,(%rsp)
  383     movaps  %xmm7,0x10(%rsp)
  384     movaps  %xmm8,0x20(%rsp)
  385     movaps  %xmm9,0x30(%rsp)
  386     movaps  %xmm10,0x40(%rsp)
  387     movaps  %xmm11,0x50(%rsp)
  388     movaps  %xmm12,0x60(%rsp)
  389     movaps  %xmm13,-0x68(%rax)  # not used, saved to share se_handler 
  390     movaps  %xmm14,-0x58(%rax)
  391     movaps  %xmm15,-0x48(%rax)
  392 ___
  393 $code.=<<___;
  394     # stack layout
  395     #
  396     # +0    output sink
  397     # +16   input sink [original %rsp and $num]
  398     # +32   counters
  399 
  400     sub \$48,%rsp
  401     and \$-64,%rsp
  402     mov %rax,16(%rsp)           # original %rsp
  403 
  404 .Ldec4x_body:
  405     movdqu  ($key),$zero            # 0-round key
  406     lea 0x78($key),$key         # size optimization
  407     lea 40*2($inp),$inp
  408 
  409 .Ldec4x_loop_grande:
  410     mov $num,24(%rsp)           # original $num
  411     xor $num,$num
  412 ___
  413 for($i=0;$i<4;$i++) {   # emitted once per lane: load descriptor $i (pointers, block count, IV)
  414     $code.=<<___;       # offsets carry -40*2 because $inp was advanced by 40*2 before the outer loop; the IV goes to @inp[$i] here, not @out[$i]
  415     mov `40*$i+16-40*2`($inp),$one  # borrow $one for number of blocks
  416     mov `40*$i+0-40*2`($inp),@inptr[$i]
  417     cmp $num,$one
  418     mov `40*$i+8-40*2`($inp),@outptr[$i]
  419     cmovg   $one,$num           # find maximum
  420     test    $one,$one
  421     movdqu  `40*$i+24-40*2`($inp),@inp[$i]  # load IV
  422     mov $one,`32+4*$i`(%rsp)        # initialize counters
  423     cmovle  %rsp,@inptr[$i]         # cancel input
  424 ___
  425 }   # after the four copies $num holds the largest block count among the lanes
  426 $code.=<<___;
  427     test    $num,$num
  428     jz  .Ldec4x_done
  429 
  430     movups  0x10-0x78($key),$rndkey1
  431     movups  0x20-0x78($key),$rndkey0
  432     mov 0xf0-0x78($key),$rounds
  433     movdqu  (@inptr[0]),@out[0]     # load inputs
  434     movdqu  (@inptr[1]),@out[1]
  435      pxor   $zero,@out[0]
  436     movdqu  (@inptr[2]),@out[2]
  437      pxor   $zero,@out[1]
  438     movdqu  (@inptr[3]),@out[3]
  439      pxor   $zero,@out[2]
  440      pxor   $zero,@out[3]
  441     movdqa  32(%rsp),$counters      # load counters
  442     xor $offset,$offset
  443     jmp .Loop_dec4x
  444 
  445 .align  32
  446 .Loop_dec4x:
  447     add \$16,$offset
  448     lea 16(%rsp),$sink          # sink pointer
  449     mov \$1,$one            # constant of 1
  450     sub $offset,$sink
  451 
  452     aesdec      $rndkey1,@out[0]
  453     prefetcht0  31(@inptr[0],$offset)   # prefetch input
  454     prefetcht0  31(@inptr[1],$offset)
  455     aesdec      $rndkey1,@out[1]
  456     prefetcht0  31(@inptr[2],$offset)
  457     prefetcht0  31(@inptr[3],$offset)
  458     aesdec      $rndkey1,@out[2]
  459     aesdec      $rndkey1,@out[3]
  460     movups      0x30-0x78($key),$rndkey1
  461 ___
  462 for($i=0;$i<4;$i++) {   # emits four AES decryption rounds (round keys 0x20..0x50); the copy for round $i also updates lane $i's pointers
  463 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;  # alternate between the two round-key registers
  464 $code.=<<___;   # cmp yields 1 - counter: input goes to the sink when counter <= 1, output when counter < 1; the register just used is then refilled with the key two rounds ahead
  465      cmp        `32+4*$i`(%rsp),$one
  466     aesdec      $rndkey,@out[0]
  467     aesdec      $rndkey,@out[1]
  468     aesdec      $rndkey,@out[2]
  469      cmovge     $sink,@inptr[$i]    # cancel input
  470      cmovg      $sink,@outptr[$i]   # sink output
  471     aesdec      $rndkey,@out[3]
  472     movups      `0x40+16*$i-0x78`($key),$rndkey
  473 ___
  474 }
  475 $code.=<<___;
  476      movdqa     $counters,$mask
  477     aesdec      $rndkey0,@out[0]
  478     prefetcht0  15(@outptr[0],$offset)  # prefetch output
  479     prefetcht0  15(@outptr[1],$offset)
  480     aesdec      $rndkey0,@out[1]
  481     prefetcht0  15(@outptr[2],$offset)
  482     prefetcht0  15(@outptr[3],$offset)
  483     aesdec      $rndkey0,@out[2]
  484     aesdec      $rndkey0,@out[3]
  485     movups      0x80-0x78($key),$rndkey0
  486      pxor       $zero,$zero
  487 
  488     aesdec      $rndkey1,@out[0]
  489      pcmpgtd    $zero,$mask
  490      movdqu     -0x78($key),$zero   # reload 0-round key
  491     aesdec      $rndkey1,@out[1]
  492      paddd      $mask,$counters     # decrement counters
  493      movdqa     $counters,32(%rsp)  # update counters
  494     aesdec      $rndkey1,@out[2]
  495     aesdec      $rndkey1,@out[3]
  496     movups      0x90-0x78($key),$rndkey1
  497 
  498     cmp \$11,$rounds
  499 
  500     aesdec      $rndkey0,@out[0]
  501     aesdec      $rndkey0,@out[1]
  502     aesdec      $rndkey0,@out[2]
  503     aesdec      $rndkey0,@out[3]
  504     movups      0xa0-0x78($key),$rndkey0
  505 
  506     jb  .Ldec4x_tail
  507 
  508     aesdec      $rndkey1,@out[0]
  509     aesdec      $rndkey1,@out[1]
  510     aesdec      $rndkey1,@out[2]
  511     aesdec      $rndkey1,@out[3]
  512     movups      0xb0-0x78($key),$rndkey1
  513 
  514     aesdec      $rndkey0,@out[0]
  515     aesdec      $rndkey0,@out[1]
  516     aesdec      $rndkey0,@out[2]
  517     aesdec      $rndkey0,@out[3]
  518     movups      0xc0-0x78($key),$rndkey0
  519 
  520     je  .Ldec4x_tail
  521 
  522     aesdec      $rndkey1,@out[0]
  523     aesdec      $rndkey1,@out[1]
  524     aesdec      $rndkey1,@out[2]
  525     aesdec      $rndkey1,@out[3]
  526     movups      0xd0-0x78($key),$rndkey1
  527 
  528     aesdec      $rndkey0,@out[0]
  529     aesdec      $rndkey0,@out[1]
  530     aesdec      $rndkey0,@out[2]
  531     aesdec      $rndkey0,@out[3]
  532     movups      0xe0-0x78($key),$rndkey0
  533     jmp .Ldec4x_tail
  534 
  535 .align  32
  536 .Ldec4x_tail:
  537     aesdec      $rndkey1,@out[0]
  538     aesdec      $rndkey1,@out[1]
  539     aesdec      $rndkey1,@out[2]
  540      pxor       $rndkey0,@inp[0]
  541      pxor       $rndkey0,@inp[1]
  542     aesdec      $rndkey1,@out[3]
  543     movdqu      0x10-0x78($key),$rndkey1
  544      pxor       $rndkey0,@inp[2]
  545      pxor       $rndkey0,@inp[3]
  546     movdqu      0x20-0x78($key),$rndkey0
  547 
  548     aesdeclast  @inp[0],@out[0]
  549     aesdeclast  @inp[1],@out[1]
  550      movdqu     -16(@inptr[0],$offset),@inp[0]  # load next IV
  551      movdqu     -16(@inptr[1],$offset),@inp[1]
  552     aesdeclast  @inp[2],@out[2]
  553     aesdeclast  @inp[3],@out[3]
  554      movdqu     -16(@inptr[2],$offset),@inp[2]
  555      movdqu     -16(@inptr[3],$offset),@inp[3]
  556 
  557     movups      @out[0],-16(@outptr[0],$offset)
  558      movdqu     (@inptr[0],$offset),@out[0]
  559     movups      @out[1],-16(@outptr[1],$offset) 
  560      movdqu     (@inptr[1],$offset),@out[1]
  561      pxor       $zero,@out[0]
  562     movups      @out[2],-16(@outptr[2],$offset) 
  563      movdqu     (@inptr[2],$offset),@out[2]
  564      pxor       $zero,@out[1]
  565     movups      @out[3],-16(@outptr[3],$offset)
  566      movdqu     (@inptr[3],$offset),@out[3]
  567      pxor       $zero,@out[2]
  568      pxor       $zero,@out[3]
  569 
  570     dec $num
  571     jnz .Loop_dec4x
  572 
  573     mov 16(%rsp),%rax           # original %rsp
  574     mov 24(%rsp),$num
  575 
  576     lea `40*4`($inp),$inp
  577     dec $num
  578     jnz .Ldec4x_loop_grande
  579 
  580 .Ldec4x_done:
  581 ___
  582 $code.=<<___ if ($win64);
  583     movaps  -0xd8(%rax),%xmm6
  584     movaps  -0xc8(%rax),%xmm7
  585     movaps  -0xb8(%rax),%xmm8
  586     movaps  -0xa8(%rax),%xmm9
  587     movaps  -0x98(%rax),%xmm10
  588     movaps  -0x88(%rax),%xmm11
  589     movaps  -0x78(%rax),%xmm12
  590     #movaps -0x68(%rax),%xmm13
  591     #movaps -0x58(%rax),%xmm14
  592     #movaps -0x48(%rax),%xmm15
  593 ___
  594 $code.=<<___;
  595     mov -48(%rax),%r15
  596     mov -40(%rax),%r14
  597     mov -32(%rax),%r13
  598     mov -24(%rax),%r12
  599     mov -16(%rax),%rbp
  600     mov -8(%rax),%rbx
  601     lea (%rax),%rsp
  602 .Ldec4x_epilogue:
  603     ret
  604 .size   aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
  605 ___
  606 
  607                         if ($avx) {{{
  608 my @ptr=map("%r$_",(8..15));
  609 my $offload=$sink;
  610 
  611 my @out=map("%xmm$_",(2..9));
  612 my @inp=map("%xmm$_",(10..13));
  613 my ($counters,$zero)=("%xmm14","%xmm15");
  614 
  615 $code.=<<___;
  616 .type   aesni_multi_cbc_encrypt_avx,\@function,3
  617 .align  32
  618 aesni_multi_cbc_encrypt_avx:
  619 _avx_cbc_enc_shortcut:
  620     mov %rsp,%rax
  621     push    %rbx
  622     push    %rbp
  623     push    %r12
  624     push    %r13
  625     push    %r14
  626     push    %r15
  627 ___
  628 $code.=<<___ if ($win64);
  629     lea -0xa8(%rsp),%rsp
  630     movaps  %xmm6,(%rsp)
  631     movaps  %xmm7,0x10(%rsp)
  632     movaps  %xmm8,0x20(%rsp)
  633     movaps  %xmm9,0x30(%rsp)
  634     movaps  %xmm10,0x40(%rsp)
  635     movaps  %xmm11,0x50(%rsp)
  636     movaps  %xmm12,-0x78(%rax)
  637     movaps  %xmm13,-0x68(%rax)
  638     movaps  %xmm14,-0x58(%rax)
  639     movaps  %xmm15,-0x48(%rax)
  640 ___
  641 $code.=<<___;
  642     # stack layout
  643     #
  644     # +0    output sink
  645     # +16   input sink [original %rsp and $num]
  646     # +32   counters
  647     # +64   distances between inputs and outputs
  648     # +128  off-load area for @inp[0..3]
  649 
  650     sub \$192,%rsp
  651     and \$-128,%rsp
  652     mov %rax,16(%rsp)           # original %rsp
  653 
  654 .Lenc8x_body:
  655     vzeroupper
  656     vmovdqu ($key),$zero            # 0-round key
  657     lea 0x78($key),$key         # size optimization
  658     lea 40*4($inp),$inp
  659     shr \$1,$num
  660 
  661 .Lenc8x_loop_grande:
  662     #mov    $num,24(%rsp)           # original $num
  663     xor $num,$num
  664 ___
  665 for($i=0;$i<8;$i++) {   # emitted once per lane: load descriptor $i and record the input-to-output distance
  666   my $temp = $i ? $offload : $offset;   # lane 0 computes its distance in $offset, which the main loop uses without reloading; lanes 1-7 use $offload as scratch
  667     $code.=<<___;       # offsets carry -40*4 because $inp was advanced by 40*4 before the outer loop
  668     mov `40*$i+16-40*4`($inp),$one  # borrow $one for number of blocks
  669     mov `40*$i+0-40*4`($inp),@ptr[$i]   # input pointer
  670     cmp $num,$one
  671     mov `40*$i+8-40*4`($inp),$temp  # output pointer
  672     cmovg   $one,$num           # find maximum
  673     test    $one,$one
  674     vmovdqu `40*$i+24-40*4`($inp),@out[$i]  # load IV
  675     mov $one,`32+4*$i`(%rsp)        # initialize counters
  676     cmovle  %rsp,@ptr[$i]           # cancel input
  677     sub @ptr[$i],$temp          # distance between input and output
  678     mov $temp,`64+8*$i`(%rsp)       # initialize distances
  679 ___
  680 }   # after the eight copies $num holds the largest block count among the lanes
  681 $code.=<<___;
  682     test    $num,$num
  683     jz  .Lenc8x_done
  684 
  685     vmovups 0x10-0x78($key),$rndkey1
  686     vmovups 0x20-0x78($key),$rndkey0
  687     mov 0xf0-0x78($key),$rounds
  688 
  689     vpxor   (@ptr[0]),$zero,@inp[0]     # load inputs and xor with 0-round
  690      lea    128(%rsp),$offload      # offload area
  691     vpxor   (@ptr[1]),$zero,@inp[1]
  692     vpxor   (@ptr[2]),$zero,@inp[2]
  693     vpxor   (@ptr[3]),$zero,@inp[3]
  694      vpxor  @inp[0],@out[0],@out[0]
  695     vpxor   (@ptr[4]),$zero,@inp[0]
  696      vpxor  @inp[1],@out[1],@out[1]
  697     vpxor   (@ptr[5]),$zero,@inp[1]
  698      vpxor  @inp[2],@out[2],@out[2]
  699     vpxor   (@ptr[6]),$zero,@inp[2]
  700      vpxor  @inp[3],@out[3],@out[3]
  701     vpxor   (@ptr[7]),$zero,@inp[3]
  702      vpxor  @inp[0],@out[4],@out[4]
  703     mov \$1,$one            # constant of 1
  704      vpxor  @inp[1],@out[5],@out[5]
  705      vpxor  @inp[2],@out[6],@out[6]
  706      vpxor  @inp[3],@out[7],@out[7]
  707     jmp .Loop_enc8x
  708 
  709 .align  32
  710 .Loop_enc8x:
  711 ___
  712 for($i=0;$i<8;$i++) {   # emits eight AES rounds across all eight states; the copy for round $i interleaves lane $i's pointer bookkeeping
  713 my $rndkey=($i&1)?$rndkey0:$rndkey1;    # even rounds use $rndkey1, odd rounds $rndkey0
  714 $code.=<<___;   # cmp yields 1 - counter of lane $i; the flags are consumed by the cmovge/cmovg further down
  715     vaesenc     $rndkey,@out[0],@out[0]
  716      cmp        32+4*$i(%rsp),$one
  717 ___
  718 $code.=<<___ if ($i);   # lane 0 finds its distance already in $offset; the other lanes reload theirs from the stack
  719      mov        64+8*$i(%rsp),$offset
  720 ___
  721 $code.=<<___;
  722     vaesenc     $rndkey,@out[1],@out[1]
  723     prefetcht0  31(@ptr[$i])            # prefetch input
  724     vaesenc     $rndkey,@out[2],@out[2]
  725 ___
  726 $code.=<<___ if ($i>1); # @ptr[$i-2] was switched to its output pointer two iterations ago; lanes 6 and 7 are prefetched right after this loop
  727     prefetcht0  15(@ptr[$i-2])          # prefetch output
  728 ___
  729 $code.=<<___;   # form the output pointer, redirect finished lanes to %rsp, store the new distance, fetch the next input block and the next round key
  730     vaesenc     $rndkey,@out[3],@out[3]
  731      lea        (@ptr[$i],$offset),$offset
  732      cmovge     %rsp,@ptr[$i]           # cancel input
  733     vaesenc     $rndkey,@out[4],@out[4]
  734      cmovg      %rsp,$offset            # sink output
  735     vaesenc     $rndkey,@out[5],@out[5]
  736      sub        @ptr[$i],$offset
  737     vaesenc     $rndkey,@out[6],@out[6]
  738      vpxor      16(@ptr[$i]),$zero,@inp[$i%4]   # load input and xor with 0-round
  739      mov        $offset,64+8*$i(%rsp)
  740     vaesenc     $rndkey,@out[7],@out[7]
  741     vmovups     `16*(3+$i)-0x78`($key),$rndkey
  742      lea        16(@ptr[$i],$offset),@ptr[$i]   # switch to output
  743 ___
  744 $code.=<<___ if ($i<4)  # @inp has only four registers: lanes 0-3 are spilled to the off-load area so that lanes 4-7 can reuse them
  745      vmovdqu    @inp[$i%4],`16*$i`($offload)    # off-load
  746 ___
  747 }
  748 $code.=<<___;
  749      vmovdqu    32(%rsp),$counters
  750     prefetcht0  15(@ptr[$i-2])          # prefetch output
  751     prefetcht0  15(@ptr[$i-1])
  752     cmp \$11,$rounds
  753     jb  .Lenc8x_tail
  754 
  755     vaesenc     $rndkey1,@out[0],@out[0]
  756     vaesenc     $rndkey1,@out[1],@out[1]
  757     vaesenc     $rndkey1,@out[2],@out[2]
  758     vaesenc     $rndkey1,@out[3],@out[3]
  759     vaesenc     $rndkey1,@out[4],@out[4]
  760     vaesenc     $rndkey1,@out[5],@out[5]
  761     vaesenc     $rndkey1,@out[6],@out[6]
  762     vaesenc     $rndkey1,@out[7],@out[7]
  763     vmovups     0xb0-0x78($key),$rndkey1
  764 
  765     vaesenc     $rndkey0,@out[0],@out[0]
  766     vaesenc     $rndkey0,@out[1],@out[1]
  767     vaesenc     $rndkey0,@out[2],@out[2]
  768     vaesenc     $rndkey0,@out[3],@out[3]
  769     vaesenc     $rndkey0,@out[4],@out[4]
  770     vaesenc     $rndkey0,@out[5],@out[5]
  771     vaesenc     $rndkey0,@out[6],@out[6]
  772     vaesenc     $rndkey0,@out[7],@out[7]
  773     vmovups     0xc0-0x78($key),$rndkey0
  774     je  .Lenc8x_tail
  775 
  776     vaesenc     $rndkey1,@out[0],@out[0]
  777     vaesenc     $rndkey1,@out[1],@out[1]
  778     vaesenc     $rndkey1,@out[2],@out[2]
  779     vaesenc     $rndkey1,@out[3],@out[3]
  780     vaesenc     $rndkey1,@out[4],@out[4]
  781     vaesenc     $rndkey1,@out[5],@out[5]
  782     vaesenc     $rndkey1,@out[6],@out[6]
  783     vaesenc     $rndkey1,@out[7],@out[7]
  784     vmovups     0xd0-0x78($key),$rndkey1
  785 
  786     vaesenc     $rndkey0,@out[0],@out[0]
  787     vaesenc     $rndkey0,@out[1],@out[1]
  788     vaesenc     $rndkey0,@out[2],@out[2]
  789     vaesenc     $rndkey0,@out[3],@out[3]
  790     vaesenc     $rndkey0,@out[4],@out[4]
  791     vaesenc     $rndkey0,@out[5],@out[5]
  792     vaesenc     $rndkey0,@out[6],@out[6]
  793     vaesenc     $rndkey0,@out[7],@out[7]
  794     vmovups     0xe0-0x78($key),$rndkey0
  795 
  796 .Lenc8x_tail:
  797     vaesenc     $rndkey1,@out[0],@out[0]
  798      vpxor      $zero,$zero,$zero
  799     vaesenc     $rndkey1,@out[1],@out[1]
  800     vaesenc     $rndkey1,@out[2],@out[2]
  801      vpcmpgtd   $zero,$counters,$zero
  802     vaesenc     $rndkey1,@out[3],@out[3]
  803     vaesenc     $rndkey1,@out[4],@out[4]
  804      vpaddd     $counters,$zero,$zero       # decrement counters
  805      vmovdqu    48(%rsp),$counters
  806     vaesenc     $rndkey1,@out[5],@out[5]
  807      mov        64(%rsp),$offset        # pre-load 1st offset
  808     vaesenc     $rndkey1,@out[6],@out[6]
  809     vaesenc     $rndkey1,@out[7],@out[7]
  810     vmovups     0x10-0x78($key),$rndkey1
  811 
  812     vaesenclast $rndkey0,@out[0],@out[0]
  813      vmovdqa    $zero,32(%rsp)          # update counters
  814      vpxor      $zero,$zero,$zero
  815     vaesenclast $rndkey0,@out[1],@out[1]
  816     vaesenclast $rndkey0,@out[2],@out[2]
  817      vpcmpgtd   $zero,$counters,$zero
  818     vaesenclast $rndkey0,@out[3],@out[3]
  819     vaesenclast $rndkey0,@out[4],@out[4]
  820      vpaddd     $zero,$counters,$counters   # decrement counters
  821      vmovdqu    -0x78($key),$zero       # 0-round
  822     vaesenclast $rndkey0,@out[5],@out[5]
  823     vaesenclast $rndkey0,@out[6],@out[6]
  824      vmovdqa    $counters,48(%rsp)      # update counters
  825     vaesenclast $rndkey0,@out[7],@out[7]
  826     vmovups     0x20-0x78($key),$rndkey0
  827 
  828     vmovups     @out[0],-16(@ptr[0])        # write output
  829      sub        $offset,@ptr[0]         # switch to input
  830      vpxor      0x00($offload),@out[0],@out[0]
  831     vmovups     @out[1],-16(@ptr[1])    
  832      sub        `64+1*8`(%rsp),@ptr[1]
  833      vpxor      0x10($offload),@out[1],@out[1]
  834     vmovups     @out[2],-16(@ptr[2])    
  835      sub        `64+2*8`(%rsp),@ptr[2]
  836      vpxor      0x20($offload),@out[2],@out[2]
  837     vmovups     @out[3],-16(@ptr[3])
  838      sub        `64+3*8`(%rsp),@ptr[3]
  839      vpxor      0x30($offload),@out[3],@out[3]
  840     vmovups     @out[4],-16(@ptr[4])
  841      sub        `64+4*8`(%rsp),@ptr[4]
  842      vpxor      @inp[0],@out[4],@out[4]
  843     vmovups     @out[5],-16(@ptr[5])    
  844      sub        `64+5*8`(%rsp),@ptr[5]
  845      vpxor      @inp[1],@out[5],@out[5]
  846     vmovups     @out[6],-16(@ptr[6])    
  847      sub        `64+6*8`(%rsp),@ptr[6]
  848      vpxor      @inp[2],@out[6],@out[6]
  849     vmovups     @out[7],-16(@ptr[7])
  850      sub        `64+7*8`(%rsp),@ptr[7]
  851      vpxor      @inp[3],@out[7],@out[7]
  852 
  853     dec $num
  854     jnz .Loop_enc8x
  855 
  856     mov 16(%rsp),%rax           # original %rsp
  857     #mov    24(%rsp),$num
  858     #lea    `40*8`($inp),$inp
  859     #dec    $num
  860     #jnz    .Lenc8x_loop_grande
  861 
  862 .Lenc8x_done:
  863     vzeroupper
  864 ___
  865 $code.=<<___ if ($win64);
  866     movaps  -0xd8(%rax),%xmm6
  867     movaps  -0xc8(%rax),%xmm7
  868     movaps  -0xb8(%rax),%xmm8
  869     movaps  -0xa8(%rax),%xmm9
  870     movaps  -0x98(%rax),%xmm10
  871     movaps  -0x88(%rax),%xmm11
  872     movaps  -0x78(%rax),%xmm12
  873     movaps  -0x68(%rax),%xmm13
  874     movaps  -0x58(%rax),%xmm14
  875     movaps  -0x48(%rax),%xmm15
  876 ___
  # Tail of aesni_multi_cbc_encrypt_avx followed by the entry of
  # aesni_multi_cbc_decrypt_avx.
  #  - Epilogue: reload %r15..%rbx from -48(%rax)..-8(%rax), put the original
  #    stack pointer back into %rsp and return.
  #  - Prologue: keep the caller's %rsp in %rax, then push %rbx, %rbp and
  #    %r12-%r15.  The function is declared with three arguments, and
  #    _avx_cbc_dec_shortcut is a second label on the same entry point.
  877 $code.=<<___;
  878     mov -48(%rax),%r15
  879     mov -40(%rax),%r14
  880     mov -32(%rax),%r13
  881     mov -24(%rax),%r12
  882     mov -16(%rax),%rbp
  883     mov -8(%rax),%rbx
  884     lea (%rax),%rsp
  885 .Lenc8x_epilogue:
  886     ret
  887 .size   aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
  888 
  889 .type   aesni_multi_cbc_decrypt_avx,\@function,3
  890 .align  32
  891 aesni_multi_cbc_decrypt_avx:
  892 _avx_cbc_dec_shortcut:
  893     mov %rsp,%rax
  894     push    %rbx
  895     push    %rbp
  896     push    %r12
  897     push    %r13
  898     push    %r14
  899     push    %r15
  900 ___
  # Win64 only: reserve 0xa8 bytes and spill %xmm6-%xmm15.  The first six
  # stores are addressed from the new %rsp, the last four from %rax; together
  # they fill the slots from -0xd8(%rax) to -0x48(%rax), which are the ones
  # the exit path reloads.
  901 $code.=<<___ if ($win64);
  902     lea -0xa8(%rsp),%rsp
  903     movaps  %xmm6,(%rsp)
  904     movaps  %xmm7,0x10(%rsp)
  905     movaps  %xmm8,0x20(%rsp)
  906     movaps  %xmm9,0x30(%rsp)
  907     movaps  %xmm10,0x40(%rsp)
  908     movaps  %xmm11,0x50(%rsp)
  909     movaps  %xmm12,-0x78(%rax)
  910     movaps  %xmm13,-0x68(%rax)
  911     movaps  %xmm14,-0x58(%rax)
  912     movaps  %xmm15,-0x48(%rax)
  913 ___
  # Build the working frame and set up loop-invariant state.
  #  - sub 256 / and -256 / sub 192 leaves %rsp at 64 modulo 256, so 192(%rsp)
  #    is 256-byte aligned and the two 128-byte halves at 192(%rsp) and
  #    192+128(%rsp) differ only in bit 7; the loop code flips between them
  #    by XOR-ing $offload with 0x80.
  #  - The caller's %rsp (still in %rax) is parked at 16(%rsp); the exit path
  #    and se_handler both read it back from there.
  #  - $zero receives the round-0 key, $key is biased by 0x78 (round-key
  #    displacements are then written as N-0x78), $inp is advanced by 40*4
  #    (descriptor fields are then addressed as N-40*4) and $num is halved.
  #  - At .Ldec8x_loop_grande $num is cleared; the descriptor loop that
  #    follows accumulates the largest block count in it.
  914 $code.=<<___;
  915     # stack layout
  916     #
  917     # +0    output sink
  918     # +16   input sink [original %rsp and $num]
  919     # +32   counters
  920     # +64   distances between inputs and outputs
  921     # +128  off-load area for @inp[0..3]
  922     # +192  IV/input offload
  923 
  924     sub \$256,%rsp
  925     and \$-256,%rsp
  926     sub \$192,%rsp
  927     mov %rax,16(%rsp)           # original %rsp
  928 
  929 .Ldec8x_body:
  930     vzeroupper
  931     vmovdqu ($key),$zero            # 0-round key
  932     lea 0x78($key),$key         # size optimization
  933     lea 40*4($inp),$inp
  934     shr \$1,$num
  935 
  936 .Ldec8x_loop_grande:
  937     #mov    $num,24(%rsp)           # original $num
  938     xor $num,$num
  939 ___
  # Emit the per-lane setup for the eight buffer descriptors.  Descriptors are
  # 40 bytes apart: input pointer at +0, output pointer at +8, block count at
  # +16 and IV at +24 (all addressed relative to the pre-biased $inp).
  # For lane $i the generated code
  #  - folds the block count into the running maximum kept in $num,
  #  - stores the count at 32+4*$i(%rsp),
  #  - points the input at %rsp when the count is <= 0,
  #  - stores (output - input) at 64+8*$i(%rsp), and
  #  - stores the IV at 192+16*$i(%rsp).
  940 for($i=0;$i<8;$i++) {
  # Lane 0 computes its distance in $offset, which the round loop then uses
  # without reloading it; the other lanes borrow $offload as scratch because
  # it is only given its real value after this loop.
  941   my $temp = $i ? $offload : $offset;
  942     $code.=<<___;
  943     mov `40*$i+16-40*4`($inp),$one  # borrow $one for number of blocks
  944     mov `40*$i+0-40*4`($inp),@ptr[$i]   # input pointer
  945     cmp $num,$one
  946     mov `40*$i+8-40*4`($inp),$temp  # output pointer
  947     cmovg   $one,$num           # find maximum
  948     test    $one,$one
  949     vmovdqu `40*$i+24-40*4`($inp),@out[$i]  # load IV
  950     mov $one,`32+4*$i`(%rsp)        # initialize counters
  951     cmovle  %rsp,@ptr[$i]           # cancel input
  952     sub @ptr[$i],$temp          # distance between input and output
  953     mov $temp,`64+8*$i`(%rsp)       # initialize distances
  954     vmovdqu @out[$i],`192+16*$i`(%rsp)  # offload IV
  955 ___
  956 }
  # Prime the pipeline before entering .Loop_dec8x.
  #  - Leave through .Ldec8x_done when the maximum block count in $num is zero.
  #  - Load round keys 1 and 2 into $rndkey1/$rndkey0 and the round count
  #    from offset 0xf0 of the key schedule ($key is biased by 0x78).
  #  - Point $offload at 192+128(%rsp), load the first block of every lane,
  #    save the unmodified input there (it is XOR-ed into the following
  #    block's result one iteration later) and XOR the working copy with the
  #    round-0 key.
  #  - Flip $offload to the other half (192(%rsp), where the IVs were
  #    stored) and set $one to the constant 1 used by the counter compares.
  957 $code.=<<___;
  958     test    $num,$num
  959     jz  .Ldec8x_done
  960 
  961     vmovups 0x10-0x78($key),$rndkey1
  962     vmovups 0x20-0x78($key),$rndkey0
  963     mov 0xf0-0x78($key),$rounds
  964      lea    192+128(%rsp),$offload      # offload area
  965 
  966     vmovdqu (@ptr[0]),@out[0]       # load inputs
  967     vmovdqu (@ptr[1]),@out[1]
  968     vmovdqu (@ptr[2]),@out[2]
  969     vmovdqu (@ptr[3]),@out[3]
  970     vmovdqu (@ptr[4]),@out[4]
  971     vmovdqu (@ptr[5]),@out[5]
  972     vmovdqu (@ptr[6]),@out[6]
  973     vmovdqu (@ptr[7]),@out[7]
  974     vmovdqu @out[0],0x00($offload)      # offload inputs
  975     vpxor   $zero,@out[0],@out[0]       # xor inputs with 0-round
  976     vmovdqu @out[1],0x10($offload)
  977     vpxor   $zero,@out[1],@out[1]
  978     vmovdqu @out[2],0x20($offload)
  979     vpxor   $zero,@out[2],@out[2]
  980     vmovdqu @out[3],0x30($offload)
  981     vpxor   $zero,@out[3],@out[3]
  982     vmovdqu @out[4],0x40($offload)
  983     vpxor   $zero,@out[4],@out[4]
  984     vmovdqu @out[5],0x50($offload)
  985     vpxor   $zero,@out[5],@out[5]
  986     vmovdqu @out[6],0x60($offload)
  987     vpxor   $zero,@out[6],@out[6]
  988     vmovdqu @out[7],0x70($offload)
  989     vpxor   $zero,@out[7],@out[7]
  990     xor \$0x80,$offload
  991     mov \$1,$one            # constant of 1
  992     jmp .Loop_dec8x
  993 
  994 .align  32
  995 .Loop_dec8x:
  996 ___
  # Main loop body, unrolled over $i = 0..7.  Every pass emits one vaesdec
  # round across all eight @out registers and interleaves the pointer
  # bookkeeping for lane $i between the AES instructions:
  #  - compare the lane's counter with $one (1): a counter <= 1 redirects the
  #    input pointer to %rsp, a counter < 1 redirects the output pointer too,
  #  - load the lane's next input block from 16(@ptr[$i]) into @inp[$i%4]
  #    and write the recomputed distance back to 64+8*$i(%rsp),
  #  - leave @ptr[$i] pointing 16 bytes past the output position.
  # $i is not declared in the loop header, so it stays visible afterwards:
  # the heredoc that follows the loop interpolates it with the value 8.
  997 for($i=0;$i<8;$i++) {
 # Even passes use $rndkey1, odd passes $rndkey0; the register just consumed
 # is reloaded with the key 16*(3+$i) bytes into the schedule.
  998 my $rndkey=($i&1)?$rndkey0:$rndkey1;
  999 $code.=<<___;
 1000     vaesdec     $rndkey,@out[0],@out[0]
 1001      cmp        32+4*$i(%rsp),$one
 1002 ___
 # Lane 0's distance is already in $offset; only lanes 1..7 reload it.
 1003 $code.=<<___ if ($i);
 1004      mov        64+8*$i(%rsp),$offset
 1005 ___
 1006 $code.=<<___;
 1007     vaesdec     $rndkey,@out[1],@out[1]
 1008     prefetcht0  31(@ptr[$i])            # prefetch input
 1009     vaesdec     $rndkey,@out[2],@out[2]
 1010 ___
 # The output prefetch trails two lanes behind, i.e. it targets a pointer
 # that an earlier pass has already switched to its output position.
 1011 $code.=<<___ if ($i>1);
 1012     prefetcht0  15(@ptr[$i-2])          # prefetch output
 1013 ___
 1014 $code.=<<___;
 1015     vaesdec     $rndkey,@out[3],@out[3]
 1016      lea        (@ptr[$i],$offset),$offset
 1017      cmovge     %rsp,@ptr[$i]           # cancel input
 1018     vaesdec     $rndkey,@out[4],@out[4]
 1019      cmovg      %rsp,$offset            # sink output
 1020     vaesdec     $rndkey,@out[5],@out[5]
 1021      sub        @ptr[$i],$offset
 1022     vaesdec     $rndkey,@out[6],@out[6]
 1023      vmovdqu    16(@ptr[$i]),@inp[$i%4]     # load input
 1024      mov        $offset,64+8*$i(%rsp)
 1025     vaesdec     $rndkey,@out[7],@out[7]
 1026     vmovups     `16*(3+$i)-0x78`($key),$rndkey
 1027      lea        16(@ptr[$i],$offset),@ptr[$i]   # switch to output
 1028 ___
 # @inp is indexed modulo 4, so the blocks fetched for lanes 0-3 are spilled
 # to 128+16*$i(%rsp) before lanes 4-7 overwrite the same registers.
 1029 $code.=<<___ if ($i<4);
 1030      vmovdqu    @inp[$i%4],`128+16*$i`(%rsp)    # off-load
 1031 ___
 1032 }
 # Remaining rounds, output and loop control.
 #  - $i is 8 here, so the two prefetches name @ptr[6] and @ptr[7].
 #  - The compare of $rounds against 11 picks the number of extra round
 #    pairs: none when $rounds < 11 (jb), one when it equals 11 (je), two
 #    otherwise.  The flags are still valid at the je because the AVX
 #    instructions in between do not write EFLAGS.
 #  - .Ldec8x_tail issues the last vaesdec and the vaesdeclast.  Interleaved
 #    with them, both counter vectors (32(%rsp) and 48(%rsp)) are decremented
 #    where positive: vpcmpgtd against zero yields -1 for such elements and
 #    that mask is added to the counters.
 #  - Results are XOR-ed with the chaining values at $offload, written to
 #    -16(@ptr[..]), and each pointer is switched back to its input by
 #    subtracting the stored distance.
 #  - The blocks fetched during the loop (lanes 0-3 from 128(%rsp) onwards,
 #    lanes 4-7 still in @inp) are saved at $offload as the next chaining
 #    values and XOR-ed with the reloaded round-0 key.
 #  - $offload is flipped to the other half and the loop repeats while $num
 #    is non-zero; afterwards %rax receives the original %rsp from 16(%rsp).
 1033 $code.=<<___;
 1034      vmovdqu    32(%rsp),$counters
 1035     prefetcht0  15(@ptr[$i-2])          # prefetch output
 1036     prefetcht0  15(@ptr[$i-1])
 1037     cmp \$11,$rounds
 1038     jb  .Ldec8x_tail
 1039 
 1040     vaesdec     $rndkey1,@out[0],@out[0]
 1041     vaesdec     $rndkey1,@out[1],@out[1]
 1042     vaesdec     $rndkey1,@out[2],@out[2]
 1043     vaesdec     $rndkey1,@out[3],@out[3]
 1044     vaesdec     $rndkey1,@out[4],@out[4]
 1045     vaesdec     $rndkey1,@out[5],@out[5]
 1046     vaesdec     $rndkey1,@out[6],@out[6]
 1047     vaesdec     $rndkey1,@out[7],@out[7]
 1048     vmovups     0xb0-0x78($key),$rndkey1
 1049 
 1050     vaesdec     $rndkey0,@out[0],@out[0]
 1051     vaesdec     $rndkey0,@out[1],@out[1]
 1052     vaesdec     $rndkey0,@out[2],@out[2]
 1053     vaesdec     $rndkey0,@out[3],@out[3]
 1054     vaesdec     $rndkey0,@out[4],@out[4]
 1055     vaesdec     $rndkey0,@out[5],@out[5]
 1056     vaesdec     $rndkey0,@out[6],@out[6]
 1057     vaesdec     $rndkey0,@out[7],@out[7]
 1058     vmovups     0xc0-0x78($key),$rndkey0
 1059     je  .Ldec8x_tail
 1060 
 1061     vaesdec     $rndkey1,@out[0],@out[0]
 1062     vaesdec     $rndkey1,@out[1],@out[1]
 1063     vaesdec     $rndkey1,@out[2],@out[2]
 1064     vaesdec     $rndkey1,@out[3],@out[3]
 1065     vaesdec     $rndkey1,@out[4],@out[4]
 1066     vaesdec     $rndkey1,@out[5],@out[5]
 1067     vaesdec     $rndkey1,@out[6],@out[6]
 1068     vaesdec     $rndkey1,@out[7],@out[7]
 1069     vmovups     0xd0-0x78($key),$rndkey1
 1070 
 1071     vaesdec     $rndkey0,@out[0],@out[0]
 1072     vaesdec     $rndkey0,@out[1],@out[1]
 1073     vaesdec     $rndkey0,@out[2],@out[2]
 1074     vaesdec     $rndkey0,@out[3],@out[3]
 1075     vaesdec     $rndkey0,@out[4],@out[4]
 1076     vaesdec     $rndkey0,@out[5],@out[5]
 1077     vaesdec     $rndkey0,@out[6],@out[6]
 1078     vaesdec     $rndkey0,@out[7],@out[7]
 1079     vmovups     0xe0-0x78($key),$rndkey0
 1080 
 1081 .Ldec8x_tail:
 1082     vaesdec     $rndkey1,@out[0],@out[0]
 1083      vpxor      $zero,$zero,$zero
 1084     vaesdec     $rndkey1,@out[1],@out[1]
 1085     vaesdec     $rndkey1,@out[2],@out[2]
 1086      vpcmpgtd   $zero,$counters,$zero
 1087     vaesdec     $rndkey1,@out[3],@out[3]
 1088     vaesdec     $rndkey1,@out[4],@out[4]
 1089      vpaddd     $counters,$zero,$zero       # decrement counters
 1090      vmovdqu    48(%rsp),$counters
 1091     vaesdec     $rndkey1,@out[5],@out[5]
 1092      mov        64(%rsp),$offset        # pre-load 1st offset
 1093     vaesdec     $rndkey1,@out[6],@out[6]
 1094     vaesdec     $rndkey1,@out[7],@out[7]
 1095     vmovups     0x10-0x78($key),$rndkey1
 1096 
 1097     vaesdeclast $rndkey0,@out[0],@out[0]
 1098      vmovdqa    $zero,32(%rsp)          # update counters
 1099      vpxor      $zero,$zero,$zero
 1100     vaesdeclast $rndkey0,@out[1],@out[1]
 1101     vpxor       0x00($offload),@out[0],@out[0]  # xor with IV
 1102     vaesdeclast $rndkey0,@out[2],@out[2]
 1103     vpxor       0x10($offload),@out[1],@out[1]
 1104      vpcmpgtd   $zero,$counters,$zero
 1105     vaesdeclast $rndkey0,@out[3],@out[3]
 1106     vpxor       0x20($offload),@out[2],@out[2]
 1107     vaesdeclast $rndkey0,@out[4],@out[4]
 1108     vpxor       0x30($offload),@out[3],@out[3]
 1109      vpaddd     $zero,$counters,$counters   # decrement counters
 1110      vmovdqu    -0x78($key),$zero       # 0-round
 1111     vaesdeclast $rndkey0,@out[5],@out[5]
 1112     vpxor       0x40($offload),@out[4],@out[4]
 1113     vaesdeclast $rndkey0,@out[6],@out[6]
 1114     vpxor       0x50($offload),@out[5],@out[5]
 1115      vmovdqa    $counters,48(%rsp)      # update counters
 1116     vaesdeclast $rndkey0,@out[7],@out[7]
 1117     vpxor       0x60($offload),@out[6],@out[6]
 1118     vmovups     0x20-0x78($key),$rndkey0
 1119 
 1120     vmovups     @out[0],-16(@ptr[0])        # write output
 1121      sub        $offset,@ptr[0]         # switch to input
 1122      vmovdqu    128+0(%rsp),@out[0]
 1123     vpxor       0x70($offload),@out[7],@out[7]
 1124     vmovups     @out[1],-16(@ptr[1])    
 1125      sub        `64+1*8`(%rsp),@ptr[1]
 1126      vmovdqu    @out[0],0x00($offload)
 1127      vpxor      $zero,@out[0],@out[0]
 1128      vmovdqu    128+16(%rsp),@out[1]
 1129     vmovups     @out[2],-16(@ptr[2])    
 1130      sub        `64+2*8`(%rsp),@ptr[2]
 1131      vmovdqu    @out[1],0x10($offload)
 1132      vpxor      $zero,@out[1],@out[1]
 1133      vmovdqu    128+32(%rsp),@out[2]
 1134     vmovups     @out[3],-16(@ptr[3])
 1135      sub        `64+3*8`(%rsp),@ptr[3]
 1136      vmovdqu    @out[2],0x20($offload)
 1137      vpxor      $zero,@out[2],@out[2]
 1138      vmovdqu    128+48(%rsp),@out[3]
 1139     vmovups     @out[4],-16(@ptr[4])
 1140      sub        `64+4*8`(%rsp),@ptr[4]
 1141      vmovdqu    @out[3],0x30($offload)
 1142      vpxor      $zero,@out[3],@out[3]
 1143      vmovdqu    @inp[0],0x40($offload)
 1144      vpxor      @inp[0],$zero,@out[4]
 1145     vmovups     @out[5],-16(@ptr[5])    
 1146      sub        `64+5*8`(%rsp),@ptr[5]
 1147      vmovdqu    @inp[1],0x50($offload)
 1148      vpxor      @inp[1],$zero,@out[5]
 1149     vmovups     @out[6],-16(@ptr[6])    
 1150      sub        `64+6*8`(%rsp),@ptr[6]
 1151      vmovdqu    @inp[2],0x60($offload)
 1152      vpxor      @inp[2],$zero,@out[6]
 1153     vmovups     @out[7],-16(@ptr[7])
 1154      sub        `64+7*8`(%rsp),@ptr[7]
 1155      vmovdqu    @inp[3],0x70($offload)
 1156      vpxor      @inp[3],$zero,@out[7]
 1157 
 1158     xor \$128,$offload
 1159     dec $num
 1160     jnz .Loop_dec8x
 1161 
 1162     mov 16(%rsp),%rax           # original %rsp
 1163     #mov    24(%rsp),$num
 1164     #lea    `40*8`($inp),$inp
 1165     #dec    $num
 1166     #jnz    .Ldec8x_loop_grande
 1167 
 1168 .Ldec8x_done:
 1169     vzeroupper
 1170 ___
 # Win64 only: reload %xmm6-%xmm15 from the slots the prologue filled
 # (-0xd8(%rax) .. -0x48(%rax), with %rax holding the original %rsp).
 1171 $code.=<<___ if ($win64);
 1172     movaps  -0xd8(%rax),%xmm6
 1173     movaps  -0xc8(%rax),%xmm7
 1174     movaps  -0xb8(%rax),%xmm8
 1175     movaps  -0xa8(%rax),%xmm9
 1176     movaps  -0x98(%rax),%xmm10
 1177     movaps  -0x88(%rax),%xmm11
 1178     movaps  -0x78(%rax),%xmm12
 1179     movaps  -0x68(%rax),%xmm13
 1180     movaps  -0x58(%rax),%xmm14
 1181     movaps  -0x48(%rax),%xmm15
 1182 ___
 # Common epilogue of aesni_multi_cbc_decrypt_avx: reload the six pushed
 # general-purpose registers relative to %rax, restore %rsp and return.
 # .Ldec8x_epilogue is the label se_handler receives as the end of the
 # range in which the frame is fully set up.
 1183 $code.=<<___;
 1184     mov -48(%rax),%r15
 1185     mov -40(%rax),%r14
 1186     mov -32(%rax),%r13
 1187     mov -24(%rax),%r12
 1188     mov -16(%rax),%rbp
 1189     mov -8(%rax),%rbx
 1190     lea (%rax),%rsp
 1191 .Ldec8x_epilogue:
 1192     ret
 1193 .size   aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
 1194 ___
 1195                         }}}
 1196 
 # Win64 structured-exception support, emitted only when $win64 is set: the
 # se_handler routine plus the .pdata/.xdata records that attach it to the
 # multi-buffer functions.
 1197 if ($win64) {
 1198 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 1199 #       CONTEXT *context,DISPATCHER_CONTEXT *disp)
 # Registers in which the four handler arguments arrive.
 1200 $rec="%rcx";
 1201 $frame="%rdx";
 1202 $context="%r8";
 1203 $disp="%r9";
 1204 
 # se_handler rebuilds the interrupted function's caller state for the
 # unwinder.
 #  - HandlerData[0] and HandlerData[1] are image-relative addresses of the
 #    function's body and epilogue labels (supplied by the .xdata records
 #    further down).
 #  - Rip before the body label: context->Rax is taken as the frame value
 #    (aesni_multi_cbc_decrypt_avx, for one, copies %rsp to %rax as its first
 #    instruction).
 #  - Rip at or past the epilogue label: context->Rsp is taken as is.
 #  - Otherwise the original stack pointer is fetched from 16(context->Rsp),
 #    %rbx/%rbp/%r12-%r15 are copied back into the CONTEXT, and 20 quadwords
 #    starting at -56-10*16 from that pointer are copied to context.Xmm6.
 #  - Common tail (.Lin_prologue): Rsp is set to the frame value and Rdi/Rsi
 #    are taken from 8 and 16 bytes above it, the CONTEXT is copied into
 #    disp->ContextRecord (154 quadwords), RtlVirtualUnwind is called and
 #    ExceptionContinueSearch (1) is returned.
 # The .pdata section then lists begin/end/info triples for the two baseline
 # functions.
 1205 $code.=<<___;
 1206 .extern __imp_RtlVirtualUnwind
 1207 .type   se_handler,\@abi-omnipotent
 1208 .align  16
 1209 se_handler:
 1210     push    %rsi
 1211     push    %rdi
 1212     push    %rbx
 1213     push    %rbp
 1214     push    %r12
 1215     push    %r13
 1216     push    %r14
 1217     push    %r15
 1218     pushfq
 1219     sub \$64,%rsp
 1220 
 1221     mov 120($context),%rax  # pull context->Rax
 1222     mov 248($context),%rbx  # pull context->Rip
 1223 
 1224     mov 8($disp),%rsi       # disp->ImageBase
 1225     mov 56($disp),%r11      # disp->HandlerData
 1226 
 1227     mov 0(%r11),%r10d       # HandlerData[0]
 1228     lea (%rsi,%r10),%r10    # prologue label
 1229     cmp %r10,%rbx       # context->Rip<.Lprologue
 1230     jb  .Lin_prologue
 1231 
 1232     mov 152($context),%rax  # pull context->Rsp
 1233 
 1234     mov 4(%r11),%r10d       # HandlerData[1]
 1235     lea (%rsi,%r10),%r10    # epilogue label
 1236     cmp %r10,%rbx       # context->Rip>=.Lepilogue
 1237     jae .Lin_prologue
 1238 
 1239     mov 16(%rax),%rax       # pull saved stack pointer
 1240 
 1241     mov -8(%rax),%rbx
 1242     mov -16(%rax),%rbp
 1243     mov -24(%rax),%r12
 1244     mov -32(%rax),%r13
 1245     mov -40(%rax),%r14
 1246     mov -48(%rax),%r15
 1247     mov %rbx,144($context)  # restore context->Rbx
 1248     mov %rbp,160($context)  # restore context->Rbp
 1249     mov %r12,216($context)  # restore cotnext->R12
 1250     mov %r13,224($context)  # restore cotnext->R13
 1251     mov %r14,232($context)  # restore cotnext->R14
 1252     mov %r15,240($context)  # restore cotnext->R15
 1253 
 1254     lea -56-10*16(%rax),%rsi
 1255     lea 512($context),%rdi  # &context.Xmm6
 1256     mov \$20,%ecx
 1257     .long   0xa548f3fc      # cld; rep movsq
 1258 
 1259 .Lin_prologue:
 1260     mov 8(%rax),%rdi
 1261     mov 16(%rax),%rsi
 1262     mov %rax,152($context)  # restore context->Rsp
 1263     mov %rsi,168($context)  # restore context->Rsi
 1264     mov %rdi,176($context)  # restore context->Rdi
 1265 
 1266     mov 40($disp),%rdi      # disp->ContextRecord
 1267     mov $context,%rsi       # context
 1268     mov \$154,%ecx      # sizeof(CONTEXT)
 1269     .long   0xa548f3fc      # cld; rep movsq
 1270 
 1271     mov $disp,%rsi
 1272     xor %rcx,%rcx       # arg1, UNW_FLAG_NHANDLER
 1273     mov 8(%rsi),%rdx        # arg2, disp->ImageBase
 1274     mov 0(%rsi),%r8     # arg3, disp->ControlPc
 1275     mov 16(%rsi),%r9        # arg4, disp->FunctionEntry
 1276     mov 40(%rsi),%r10       # disp->ContextRecord
 1277     lea 56(%rsi),%r11       # &disp->HandlerData
 1278     lea 24(%rsi),%r12       # &disp->EstablisherFrame
 1279     mov %r10,32(%rsp)       # arg5
 1280     mov %r11,40(%rsp)       # arg6
 1281     mov %r12,48(%rsp)       # arg7
 1282     mov %rcx,56(%rsp)       # arg8, (NULL)
 1283     call    *__imp_RtlVirtualUnwind(%rip)
 1284 
 1285     mov \$1,%eax        # ExceptionContinueSearch
 1286     add \$64,%rsp
 1287     popfq
 1288     pop %r15
 1289     pop %r14
 1290     pop %r13
 1291     pop %r12
 1292     pop %rbp
 1293     pop %rbx
 1294     pop %rdi
 1295     pop %rsi
 1296     ret
 1297 .size   se_handler,.-se_handler
 1298 
 1299 .section    .pdata
 1300 .align  4
 1301     .rva    .LSEH_begin_aesni_multi_cbc_encrypt
 1302     .rva    .LSEH_end_aesni_multi_cbc_encrypt
 1303     .rva    .LSEH_info_aesni_multi_cbc_encrypt
 1304     .rva    .LSEH_begin_aesni_multi_cbc_decrypt
 1305     .rva    .LSEH_end_aesni_multi_cbc_decrypt
 1306     .rva    .LSEH_info_aesni_multi_cbc_decrypt
 1307 ___
 # The AVX variants get .pdata entries only when $avx is set.
 1308 $code.=<<___ if ($avx);
 1309     .rva    .LSEH_begin_aesni_multi_cbc_encrypt_avx
 1310     .rva    .LSEH_end_aesni_multi_cbc_encrypt_avx
 1311     .rva    .LSEH_info_aesni_multi_cbc_encrypt_avx
 1312     .rva    .LSEH_begin_aesni_multi_cbc_decrypt_avx
 1313     .rva    .LSEH_end_aesni_multi_cbc_decrypt_avx
 1314     .rva    .LSEH_info_aesni_multi_cbc_decrypt_avx
 1315 ___
 # .xdata: one record per baseline function, each naming se_handler and the
 # body/epilogue label pair that the handler reads as HandlerData[0..1].
 1316 $code.=<<___;
 1317 .section    .xdata
 1318 .align  8
 1319 .LSEH_info_aesni_multi_cbc_encrypt:
 1320     .byte   9,0,0,0
 1321     .rva    se_handler
 1322     .rva    .Lenc4x_body,.Lenc4x_epilogue       # HandlerData[]
 1323 .LSEH_info_aesni_multi_cbc_decrypt:
 1324     .byte   9,0,0,0
 1325     .rva    se_handler
 1326     .rva    .Ldec4x_body,.Ldec4x_epilogue       # HandlerData[]
 1327 ___
 # Matching .xdata records for the AVX variants, again only when $avx is set.
 1328 $code.=<<___ if ($avx);
 1329 .LSEH_info_aesni_multi_cbc_encrypt_avx:
 1330     .byte   9,0,0,0
 1331     .rva    se_handler
 1332     .rva    .Lenc8x_body,.Lenc8x_epilogue       # HandlerData[]
 1333 .LSEH_info_aesni_multi_cbc_decrypt_avx:
 1334     .byte   9,0,0,0
 1335     .rva    se_handler
 1336     .rva    .Ldec8x_body,.Ldec8x_epilogue       # HandlerData[]
 1337 ___
 1338 }
 1339 ####################################################################
 1340 
 1341 sub rex {
 1342   local *opcode=shift;
 1343   my ($dst,$src)=@_;
 1344   my $rex=0;
 1345 
 1346     $rex|=0x04          if($dst>=8);
 1347     $rex|=0x01          if($src>=8);
 1348     push @opcode,$rex|0x40  if($rex);
 1349 }
 1350 
 1351 sub aesni {
 1352   my $line=shift;
 1353   my @opcode=(0x66);
 1354 
 1355     if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 1356     rex(\@opcode,$4,$3);
 1357     push @opcode,0x0f,0x3a,0xdf;
 1358     push @opcode,0xc0|($3&7)|(($4&7)<<3);   # ModR/M
 1359     my $c=$2;
 1360     push @opcode,$c=~/^0/?oct($c):$c;
 1361     return ".byte\t".join(',',@opcode);
 1362     }
 1363     elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 1364     my %opcodelet = (
 1365         "aesimc" => 0xdb,
 1366         "aesenc" => 0xdc,   "aesenclast" => 0xdd,
 1367         "aesdec" => 0xde,   "aesdeclast" => 0xdf
 1368     );
 1369     return undef if (!defined($opcodelet{$1}));
 1370     rex(\@opcode,$3,$2);
 1371     push @opcode,0x0f,0x38,$opcodelet{$1};
 1372     push @opcode,0xc0|($2&7)|(($3&7)<<3);   # ModR/M
 1373     return ".byte\t".join(',',@opcode);
 1374     }
 1375     elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
 1376     my %opcodelet = (
 1377         "aesenc" => 0xdc,   "aesenclast" => 0xdd,
 1378         "aesdec" => 0xde,   "aesdeclast" => 0xdf
 1379     );
 1380     return undef if (!defined($opcodelet{$1}));
 1381     my $off = $2;
 1382     push @opcode,0x44 if ($3>=8);
 1383     push @opcode,0x0f,0x38,$opcodelet{$1};
 1384     push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
 1385     push @opcode,($off=~/^0/?oct($off):$off)&0xff;
 1386     return ".byte\t".join(',',@opcode);
 1387     }
 1388     return $line;
 1389 }
 1390 
 1391 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 1392 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
 1393 
 1394 print $code;
 1395 close STDOUT;