"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aesni-x86.pl" (20 Nov 2018, 75812 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aesni-x86.pl" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 #
   10 # This module implements support for Intel AES-NI extension. In
   11 # OpenSSL context it's used with Intel engine, but can also be used as
   12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
   13 # details].
   14 #
   15 # Performance.
   16 #
   17 # To start with see corresponding paragraph in aesni-x86_64.pl...
   18 # Instead of filling table similar to one found there I've chosen to
   19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
   20 # The simplified table below represents 32-bit performance relative
   21 # to 64-bit one in every given point. Ratios vary for different
   22 # encryption modes, therefore interval values.
   23 #
   24 #   16-byte     64-byte     256-byte    1-KB        8-KB
   25 #   53-67%      67-84%      91-94%      95-98%      97-99.5%
   26 #
   27 # Lower ratios for smaller block sizes are perfectly understandable,
   28 # because function call overhead is higher in 32-bit mode. Largest
   29 # 8-KB block performance is virtually same: 32-bit code is less than
   30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
   31 
   32 # January 2011
   33 #
   34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
   35 # interleaves at most 6 aes[enc|dec] instructions, because there are
   36 # not enough registers for 8x interleave [which should be optimal for
   37 # Sandy Bridge]. Actually, performance results for 6x interleave
   38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
   39 # module.
   40 
   41 # April 2011
   42 #
   43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
   44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
   45 
   46 ######################################################################
   47 # Current large-block performance in cycles per byte processed with
   48 # 128-bit key (less is better).
   49 #
   50 #       CBC en-/decrypt CTR XTS ECB
   51 # Westmere  3.77/1.37   1.37    1.52    1.27
   52 # * Bridge  5.07/0.98   0.99    1.09    0.91
   53 # Haswell   4.44/0.80   0.97    1.03    0.72
   54 # Silvermont    5.77/3.56   3.67    4.03    3.46
   55 # Bulldozer 5.80/0.98   1.05    1.24    0.93
   56 
   57 $PREFIX="aesni";    # if $PREFIX is set to "AES", the script
   58             # generates drop-in replacement for
   59             # crypto/aes/asm/aes-586.pl:-)
   60 $inline=1;      # inline _aesni_[en|de]crypt
   61 
   62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   63 push(@INC,"${dir}","${dir}../../perlasm");
   64 require "x86asm.pl";
   65 
   66 &asm_init($ARGV[0],$0);
   67 
   68 &external_label("OPENSSL_ia32cap_P");
   69 &static_label("key_const");
   70 
   71 if ($PREFIX eq "aesni") { $movekey=\&movups; }
   72 else            { $movekey=\&movups; }
   73 
   74 $len="eax";
   75 $rounds="ecx";
   76 $key="edx";
   77 $inp="esi";
   78 $out="edi";
   79 $rounds_="ebx"; # backup copy for $rounds
   80 $key_="ebp";    # backup copy for $key
   81 
   82 $rndkey0="xmm0";
   83 $rndkey1="xmm1";
   84 $inout0="xmm2";
   85 $inout1="xmm3";
   86 $inout2="xmm4";
   87 $inout3="xmm5"; $in1="xmm5";
   88 $inout4="xmm6"; $in0="xmm6";
   89 $inout5="xmm7"; $ivec="xmm7";
   90 
   91 # AESNI extension
   92 sub aeskeygenassist
   93 { my($dst,$src,$imm)=@_;
   94     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
   95     {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }
   96 }
   97 sub aescommon
   98 { my($opcodelet,$dst,$src)=@_;
   99     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
  100     {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
  101 }
  102 sub aesimc  { aescommon(0xdb,@_); }
  103 sub aesenc  { aescommon(0xdc,@_); }
  104 sub aesenclast  { aescommon(0xdd,@_); }
  105 sub aesdec  { aescommon(0xde,@_); }
  106 sub aesdeclast  { aescommon(0xdf,@_); }
  107 
  108 # Inline version of internal aesni_[en|de]crypt1
  109 { my $sn;
  110 sub aesni_inline_generate1
  111 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  112   $sn++;
  113 
  114     &$movekey       ($rndkey0,&QWP(0,$key));
  115     &$movekey       ($rndkey1,&QWP(16,$key));
  116     &xorps      ($ivec,$rndkey0)    if (defined($ivec));
  117     &lea        ($key,&DWP(32,$key));
  118     &xorps      ($inout,$ivec)      if (defined($ivec));
  119     &xorps      ($inout,$rndkey0)   if (!defined($ivec));
  120     &set_label("${p}1_loop_$sn");
  121     eval"&aes${p}   ($inout,$rndkey1)";
  122     &dec        ($rounds);
  123     &$movekey   ($rndkey1,&QWP(0,$key));
  124     &lea        ($key,&DWP(16,$key));
  125     &jnz        (&label("${p}1_loop_$sn"));
  126     eval"&aes${p}last   ($inout,$rndkey1)";
  127 }}
  128 
  129 sub aesni_generate1 # fully unrolled loop
  130 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  131 
  132     &function_begin_B("_aesni_${p}rypt1");
  133     &movups     ($rndkey0,&QWP(0,$key));
  134     &$movekey   ($rndkey1,&QWP(0x10,$key));
  135     &xorps      ($inout,$rndkey0);
  136     &$movekey   ($rndkey0,&QWP(0x20,$key));
  137     &lea        ($key,&DWP(0x30,$key));
  138     &cmp        ($rounds,11);
  139     &jb     (&label("${p}128"));
  140     &lea        ($key,&DWP(0x20,$key));
  141     &je     (&label("${p}192"));
  142     &lea        ($key,&DWP(0x20,$key));
  143     eval"&aes${p}   ($inout,$rndkey1)";
  144     &$movekey   ($rndkey1,&QWP(-0x40,$key));
  145     eval"&aes${p}   ($inout,$rndkey0)";
  146     &$movekey   ($rndkey0,&QWP(-0x30,$key));
  147     &set_label("${p}192");
  148     eval"&aes${p}   ($inout,$rndkey1)";
  149     &$movekey   ($rndkey1,&QWP(-0x20,$key));
  150     eval"&aes${p}   ($inout,$rndkey0)";
  151     &$movekey   ($rndkey0,&QWP(-0x10,$key));
  152     &set_label("${p}128");
  153     eval"&aes${p}   ($inout,$rndkey1)";
  154     &$movekey   ($rndkey1,&QWP(0,$key));
  155     eval"&aes${p}   ($inout,$rndkey0)";
  156     &$movekey   ($rndkey0,&QWP(0x10,$key));
  157     eval"&aes${p}   ($inout,$rndkey1)";
  158     &$movekey   ($rndkey1,&QWP(0x20,$key));
  159     eval"&aes${p}   ($inout,$rndkey0)";
  160     &$movekey   ($rndkey0,&QWP(0x30,$key));
  161     eval"&aes${p}   ($inout,$rndkey1)";
  162     &$movekey   ($rndkey1,&QWP(0x40,$key));
  163     eval"&aes${p}   ($inout,$rndkey0)";
  164     &$movekey   ($rndkey0,&QWP(0x50,$key));
  165     eval"&aes${p}   ($inout,$rndkey1)";
  166     &$movekey   ($rndkey1,&QWP(0x60,$key));
  167     eval"&aes${p}   ($inout,$rndkey0)";
  168     &$movekey   ($rndkey0,&QWP(0x70,$key));
  169     eval"&aes${p}   ($inout,$rndkey1)";
  170     eval"&aes${p}last   ($inout,$rndkey0)";
  171     &ret();
  172     &function_end_B("_aesni_${p}rypt1");
  173 }
  174 
  175 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
  176 &aesni_generate1("enc") if (!$inline);
  177 &function_begin_B("${PREFIX}_encrypt");
  178     &mov    ("eax",&wparam(0));
  179     &mov    ($key,&wparam(2));
  180     &movups ($inout0,&QWP(0,"eax"));
  181     &mov    ($rounds,&DWP(240,$key));
  182     &mov    ("eax",&wparam(1));
  183     if ($inline)
  184     {   &aesni_inline_generate1("enc"); }
  185     else
  186     {   &call   ("_aesni_encrypt1");    }
  187     &pxor   ($rndkey0,$rndkey0);        # clear register bank
  188     &pxor   ($rndkey1,$rndkey1);
  189     &movups (&QWP(0,"eax"),$inout0);
  190     &pxor   ($inout0,$inout0);
  191     &ret    ();
  192 &function_end_B("${PREFIX}_encrypt");
  193 
  194 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
  195 &aesni_generate1("dec") if(!$inline);
  196 &function_begin_B("${PREFIX}_decrypt");
  197     &mov    ("eax",&wparam(0));
  198     &mov    ($key,&wparam(2));
  199     &movups ($inout0,&QWP(0,"eax"));
  200     &mov    ($rounds,&DWP(240,$key));
  201     &mov    ("eax",&wparam(1));
  202     if ($inline)
  203     {   &aesni_inline_generate1("dec"); }
  204     else
  205     {   &call   ("_aesni_decrypt1");    }
  206     &pxor   ($rndkey0,$rndkey0);        # clear register bank
  207     &pxor   ($rndkey1,$rndkey1);
  208     &movups (&QWP(0,"eax"),$inout0);
  209     &pxor   ($inout0,$inout0);
  210     &ret    ();
  211 &function_end_B("${PREFIX}_decrypt");
  212 
  213 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
  214 # factor. Why were 3x subroutines originally used in loops? Even though
  215 # aes[enc|dec] latency was originally 6, it could be scheduled only
  216 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
  217 # utilization, i.e. when subroutine's throughput is virtually same as
  218 # of non-interleaved subroutine [for number of input blocks up to 3].
  219 # This is why it originally made no sense to implement 2x subroutine.
  220 # But times change and it became appropriate to spend extra 192 bytes
  221 # on 2x subroutine on account of Atom Silvermont. For processors that
  222 # can schedule aes[enc|dec] every cycle optimal interleave factor
  223 # equals to corresponding instructions latency. 8x is optimal for
  224 # * Bridge, but it's unfeasible to accommodate such implementation
  225 # in XMM registers addressable in 32-bit mode and therefore maximum
  226 # of 6x is used instead...
  227 
  228 sub aesni_generate2
  229 { my $p=shift;
  230 
  231     &function_begin_B("_aesni_${p}rypt2");
  232     &$movekey   ($rndkey0,&QWP(0,$key));
  233     &shl        ($rounds,4);
  234     &$movekey   ($rndkey1,&QWP(16,$key));
  235     &xorps      ($inout0,$rndkey0);
  236     &pxor       ($inout1,$rndkey0);
  237     &$movekey   ($rndkey0,&QWP(32,$key));
  238     &lea        ($key,&DWP(32,$key,$rounds));
  239     &neg        ($rounds);
  240     &add        ($rounds,16);
  241 
  242     &set_label("${p}2_loop");
  243     eval"&aes${p}   ($inout0,$rndkey1)";
  244     eval"&aes${p}   ($inout1,$rndkey1)";
  245     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  246     &add        ($rounds,32);
  247     eval"&aes${p}   ($inout0,$rndkey0)";
  248     eval"&aes${p}   ($inout1,$rndkey0)";
  249     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
  250     &jnz        (&label("${p}2_loop"));
  251     eval"&aes${p}   ($inout0,$rndkey1)";
  252     eval"&aes${p}   ($inout1,$rndkey1)";
  253     eval"&aes${p}last   ($inout0,$rndkey0)";
  254     eval"&aes${p}last   ($inout1,$rndkey0)";
  255     &ret();
  256     &function_end_B("_aesni_${p}rypt2");
  257 }
  258 
  259 sub aesni_generate3
  260 { my $p=shift;
  261 
  262     &function_begin_B("_aesni_${p}rypt3");
  263     &$movekey   ($rndkey0,&QWP(0,$key));
  264     &shl        ($rounds,4);
  265     &$movekey   ($rndkey1,&QWP(16,$key));
  266     &xorps      ($inout0,$rndkey0);
  267     &pxor       ($inout1,$rndkey0);
  268     &pxor       ($inout2,$rndkey0);
  269     &$movekey   ($rndkey0,&QWP(32,$key));
  270     &lea        ($key,&DWP(32,$key,$rounds));
  271     &neg        ($rounds);
  272     &add        ($rounds,16);
  273 
  274     &set_label("${p}3_loop");
  275     eval"&aes${p}   ($inout0,$rndkey1)";
  276     eval"&aes${p}   ($inout1,$rndkey1)";
  277     eval"&aes${p}   ($inout2,$rndkey1)";
  278     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  279     &add        ($rounds,32);
  280     eval"&aes${p}   ($inout0,$rndkey0)";
  281     eval"&aes${p}   ($inout1,$rndkey0)";
  282     eval"&aes${p}   ($inout2,$rndkey0)";
  283     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
  284     &jnz        (&label("${p}3_loop"));
  285     eval"&aes${p}   ($inout0,$rndkey1)";
  286     eval"&aes${p}   ($inout1,$rndkey1)";
  287     eval"&aes${p}   ($inout2,$rndkey1)";
  288     eval"&aes${p}last   ($inout0,$rndkey0)";
  289     eval"&aes${p}last   ($inout1,$rndkey0)";
  290     eval"&aes${p}last   ($inout2,$rndkey0)";
  291     &ret();
  292     &function_end_B("_aesni_${p}rypt3");
  293 }
  294 
  295 # 4x interleave is implemented to improve small block performance,
  296 # most notably [and naturally] 4 block by ~30%. One can argue that one
  297 # should have implemented 5x as well, but improvement would be <20%,
  298 # so it's not worth it...
  299 sub aesni_generate4
  300 { my $p=shift;
  301 
  302     &function_begin_B("_aesni_${p}rypt4");
  303     &$movekey   ($rndkey0,&QWP(0,$key));
  304     &$movekey   ($rndkey1,&QWP(16,$key));
  305     &shl        ($rounds,4);
  306     &xorps      ($inout0,$rndkey0);
  307     &pxor       ($inout1,$rndkey0);
  308     &pxor       ($inout2,$rndkey0);
  309     &pxor       ($inout3,$rndkey0);
  310     &$movekey   ($rndkey0,&QWP(32,$key));
  311     &lea        ($key,&DWP(32,$key,$rounds));
  312     &neg        ($rounds);
  313     &data_byte  (0x0f,0x1f,0x40,0x00);
  314     &add        ($rounds,16);
  315 
  316     &set_label("${p}4_loop");
  317     eval"&aes${p}   ($inout0,$rndkey1)";
  318     eval"&aes${p}   ($inout1,$rndkey1)";
  319     eval"&aes${p}   ($inout2,$rndkey1)";
  320     eval"&aes${p}   ($inout3,$rndkey1)";
  321     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  322     &add        ($rounds,32);
  323     eval"&aes${p}   ($inout0,$rndkey0)";
  324     eval"&aes${p}   ($inout1,$rndkey0)";
  325     eval"&aes${p}   ($inout2,$rndkey0)";
  326     eval"&aes${p}   ($inout3,$rndkey0)";
  327     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
  328     &jnz        (&label("${p}4_loop"));
  329 
  330     eval"&aes${p}   ($inout0,$rndkey1)";
  331     eval"&aes${p}   ($inout1,$rndkey1)";
  332     eval"&aes${p}   ($inout2,$rndkey1)";
  333     eval"&aes${p}   ($inout3,$rndkey1)";
  334     eval"&aes${p}last   ($inout0,$rndkey0)";
  335     eval"&aes${p}last   ($inout1,$rndkey0)";
  336     eval"&aes${p}last   ($inout2,$rndkey0)";
  337     eval"&aes${p}last   ($inout3,$rndkey0)";
  338     &ret();
  339     &function_end_B("_aesni_${p}rypt4");
  340 }
  341 
  342 sub aesni_generate6
  343 { my $p=shift;
  344 
  345     &function_begin_B("_aesni_${p}rypt6");
  346     &static_label("_aesni_${p}rypt6_enter");
  347     &$movekey   ($rndkey0,&QWP(0,$key));
  348     &shl        ($rounds,4);
  349     &$movekey   ($rndkey1,&QWP(16,$key));
  350     &xorps      ($inout0,$rndkey0);
  351     &pxor       ($inout1,$rndkey0); # pxor does better here
  352     &pxor       ($inout2,$rndkey0);
  353     eval"&aes${p}   ($inout0,$rndkey1)";
  354     &pxor       ($inout3,$rndkey0);
  355     &pxor       ($inout4,$rndkey0);
  356     eval"&aes${p}   ($inout1,$rndkey1)";
  357     &lea        ($key,&DWP(32,$key,$rounds));
  358     &neg        ($rounds);
  359     eval"&aes${p}   ($inout2,$rndkey1)";
  360     &pxor       ($inout5,$rndkey0);
  361     &$movekey   ($rndkey0,&QWP(0,$key,$rounds));
  362     &add        ($rounds,16);
  363     &jmp        (&label("_aesni_${p}rypt6_inner"));
  364 
  365     &set_label("${p}6_loop",16);
  366     eval"&aes${p}   ($inout0,$rndkey1)";
  367     eval"&aes${p}   ($inout1,$rndkey1)";
  368     eval"&aes${p}   ($inout2,$rndkey1)";
  369     &set_label("_aesni_${p}rypt6_inner");
  370     eval"&aes${p}   ($inout3,$rndkey1)";
  371     eval"&aes${p}   ($inout4,$rndkey1)";
  372     eval"&aes${p}   ($inout5,$rndkey1)";
  373     &set_label("_aesni_${p}rypt6_enter");
  374     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  375     &add        ($rounds,32);
  376     eval"&aes${p}   ($inout0,$rndkey0)";
  377     eval"&aes${p}   ($inout1,$rndkey0)";
  378     eval"&aes${p}   ($inout2,$rndkey0)";
  379     eval"&aes${p}   ($inout3,$rndkey0)";
  380     eval"&aes${p}   ($inout4,$rndkey0)";
  381     eval"&aes${p}   ($inout5,$rndkey0)";
  382     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
  383     &jnz        (&label("${p}6_loop"));
  384 
  385     eval"&aes${p}   ($inout0,$rndkey1)";
  386     eval"&aes${p}   ($inout1,$rndkey1)";
  387     eval"&aes${p}   ($inout2,$rndkey1)";
  388     eval"&aes${p}   ($inout3,$rndkey1)";
  389     eval"&aes${p}   ($inout4,$rndkey1)";
  390     eval"&aes${p}   ($inout5,$rndkey1)";
  391     eval"&aes${p}last   ($inout0,$rndkey0)";
  392     eval"&aes${p}last   ($inout1,$rndkey0)";
  393     eval"&aes${p}last   ($inout2,$rndkey0)";
  394     eval"&aes${p}last   ($inout3,$rndkey0)";
  395     eval"&aes${p}last   ($inout4,$rndkey0)";
  396     eval"&aes${p}last   ($inout5,$rndkey0)";
  397     &ret();
  398     &function_end_B("_aesni_${p}rypt6");
  399 }
  400 &aesni_generate2("enc") if ($PREFIX eq "aesni");
  401 &aesni_generate2("dec");
  402 &aesni_generate3("enc") if ($PREFIX eq "aesni");
  403 &aesni_generate3("dec");
  404 &aesni_generate4("enc") if ($PREFIX eq "aesni");
  405 &aesni_generate4("dec");
  406 &aesni_generate6("enc") if ($PREFIX eq "aesni");
  407 &aesni_generate6("dec");
  408 
  409 if ($PREFIX eq "aesni") {
  410 ######################################################################
  411 # void aesni_ecb_encrypt (const void *in, void *out,
  412 #                         size_t length, const AES_KEY *key,
  413 #                         int enc);
  414 &function_begin("aesni_ecb_encrypt");
  415     &mov    ($inp,&wparam(0));
  416     &mov    ($out,&wparam(1));
  417     &mov    ($len,&wparam(2));
  418     &mov    ($key,&wparam(3));
  419     &mov    ($rounds_,&wparam(4));
  420     &and    ($len,-16);
  421     &jz (&label("ecb_ret"));
  422     &mov    ($rounds,&DWP(240,$key));
  423     &test   ($rounds_,$rounds_);
  424     &jz (&label("ecb_decrypt"));
  425 
  426     &mov    ($key_,$key);       # backup $key
  427     &mov    ($rounds_,$rounds); # backup $rounds
  428     &cmp    ($len,0x60);
  429     &jb (&label("ecb_enc_tail"));
  430 
  431     &movdqu ($inout0,&QWP(0,$inp));
  432     &movdqu ($inout1,&QWP(0x10,$inp));
  433     &movdqu ($inout2,&QWP(0x20,$inp));
  434     &movdqu ($inout3,&QWP(0x30,$inp));
  435     &movdqu ($inout4,&QWP(0x40,$inp));
  436     &movdqu ($inout5,&QWP(0x50,$inp));
  437     &lea    ($inp,&DWP(0x60,$inp));
  438     &sub    ($len,0x60);
  439     &jmp    (&label("ecb_enc_loop6_enter"));
  440 
  441 &set_label("ecb_enc_loop6",16);
  442     &movups (&QWP(0,$out),$inout0);
  443     &movdqu ($inout0,&QWP(0,$inp));
  444     &movups (&QWP(0x10,$out),$inout1);
  445     &movdqu ($inout1,&QWP(0x10,$inp));
  446     &movups (&QWP(0x20,$out),$inout2);
  447     &movdqu ($inout2,&QWP(0x20,$inp));
  448     &movups (&QWP(0x30,$out),$inout3);
  449     &movdqu ($inout3,&QWP(0x30,$inp));
  450     &movups (&QWP(0x40,$out),$inout4);
  451     &movdqu ($inout4,&QWP(0x40,$inp));
  452     &movups (&QWP(0x50,$out),$inout5);
  453     &lea    ($out,&DWP(0x60,$out));
  454     &movdqu ($inout5,&QWP(0x50,$inp));
  455     &lea    ($inp,&DWP(0x60,$inp));
  456 &set_label("ecb_enc_loop6_enter");
  457 
  458     &call   ("_aesni_encrypt6");
  459 
  460     &mov    ($key,$key_);       # restore $key
  461     &mov    ($rounds,$rounds_); # restore $rounds
  462     &sub    ($len,0x60);
  463     &jnc    (&label("ecb_enc_loop6"));
  464 
  465     &movups (&QWP(0,$out),$inout0);
  466     &movups (&QWP(0x10,$out),$inout1);
  467     &movups (&QWP(0x20,$out),$inout2);
  468     &movups (&QWP(0x30,$out),$inout3);
  469     &movups (&QWP(0x40,$out),$inout4);
  470     &movups (&QWP(0x50,$out),$inout5);
  471     &lea    ($out,&DWP(0x60,$out));
  472     &add    ($len,0x60);
  473     &jz (&label("ecb_ret"));
  474 
  475 &set_label("ecb_enc_tail");
  476     &movups ($inout0,&QWP(0,$inp));
  477     &cmp    ($len,0x20);
  478     &jb (&label("ecb_enc_one"));
  479     &movups ($inout1,&QWP(0x10,$inp));
  480     &je (&label("ecb_enc_two"));
  481     &movups ($inout2,&QWP(0x20,$inp));
  482     &cmp    ($len,0x40);
  483     &jb (&label("ecb_enc_three"));
  484     &movups ($inout3,&QWP(0x30,$inp));
  485     &je (&label("ecb_enc_four"));
  486     &movups ($inout4,&QWP(0x40,$inp));
  487     &xorps  ($inout5,$inout5);
  488     &call   ("_aesni_encrypt6");
  489     &movups (&QWP(0,$out),$inout0);
  490     &movups (&QWP(0x10,$out),$inout1);
  491     &movups (&QWP(0x20,$out),$inout2);
  492     &movups (&QWP(0x30,$out),$inout3);
  493     &movups (&QWP(0x40,$out),$inout4);
  494     jmp (&label("ecb_ret"));
  495 
  496 &set_label("ecb_enc_one",16);
  497     if ($inline)
  498     {   &aesni_inline_generate1("enc"); }
  499     else
  500     {   &call   ("_aesni_encrypt1");    }
  501     &movups (&QWP(0,$out),$inout0);
  502     &jmp    (&label("ecb_ret"));
  503 
  504 &set_label("ecb_enc_two",16);
  505     &call   ("_aesni_encrypt2");
  506     &movups (&QWP(0,$out),$inout0);
  507     &movups (&QWP(0x10,$out),$inout1);
  508     &jmp    (&label("ecb_ret"));
  509 
  510 &set_label("ecb_enc_three",16);
  511     &call   ("_aesni_encrypt3");
  512     &movups (&QWP(0,$out),$inout0);
  513     &movups (&QWP(0x10,$out),$inout1);
  514     &movups (&QWP(0x20,$out),$inout2);
  515     &jmp    (&label("ecb_ret"));
  516 
  517 &set_label("ecb_enc_four",16);
  518     &call   ("_aesni_encrypt4");
  519     &movups (&QWP(0,$out),$inout0);
  520     &movups (&QWP(0x10,$out),$inout1);
  521     &movups (&QWP(0x20,$out),$inout2);
  522     &movups (&QWP(0x30,$out),$inout3);
  523     &jmp    (&label("ecb_ret"));
  524 ######################################################################
  525 &set_label("ecb_decrypt",16);
  526     &mov    ($key_,$key);       # backup $key
  527     &mov    ($rounds_,$rounds); # backup $rounds
  528     &cmp    ($len,0x60);
  529     &jb (&label("ecb_dec_tail"));
  530 
  531     &movdqu ($inout0,&QWP(0,$inp));
  532     &movdqu ($inout1,&QWP(0x10,$inp));
  533     &movdqu ($inout2,&QWP(0x20,$inp));
  534     &movdqu ($inout3,&QWP(0x30,$inp));
  535     &movdqu ($inout4,&QWP(0x40,$inp));
  536     &movdqu ($inout5,&QWP(0x50,$inp));
  537     &lea    ($inp,&DWP(0x60,$inp));
  538     &sub    ($len,0x60);
  539     &jmp    (&label("ecb_dec_loop6_enter"));
  540 
  541 &set_label("ecb_dec_loop6",16);
  542     &movups (&QWP(0,$out),$inout0);
  543     &movdqu ($inout0,&QWP(0,$inp));
  544     &movups (&QWP(0x10,$out),$inout1);
  545     &movdqu ($inout1,&QWP(0x10,$inp));
  546     &movups (&QWP(0x20,$out),$inout2);
  547     &movdqu ($inout2,&QWP(0x20,$inp));
  548     &movups (&QWP(0x30,$out),$inout3);
  549     &movdqu ($inout3,&QWP(0x30,$inp));
  550     &movups (&QWP(0x40,$out),$inout4);
  551     &movdqu ($inout4,&QWP(0x40,$inp));
  552     &movups (&QWP(0x50,$out),$inout5);
  553     &lea    ($out,&DWP(0x60,$out));
  554     &movdqu ($inout5,&QWP(0x50,$inp));
  555     &lea    ($inp,&DWP(0x60,$inp));
  556 &set_label("ecb_dec_loop6_enter");
  557 
  558     &call   ("_aesni_decrypt6");
  559 
  560     &mov    ($key,$key_);       # restore $key
  561     &mov    ($rounds,$rounds_); # restore $rounds
  562     &sub    ($len,0x60);
  563     &jnc    (&label("ecb_dec_loop6"));
  564 
  565     &movups (&QWP(0,$out),$inout0);
  566     &movups (&QWP(0x10,$out),$inout1);
  567     &movups (&QWP(0x20,$out),$inout2);
  568     &movups (&QWP(0x30,$out),$inout3);
  569     &movups (&QWP(0x40,$out),$inout4);
  570     &movups (&QWP(0x50,$out),$inout5);
  571     &lea    ($out,&DWP(0x60,$out));
  572     &add    ($len,0x60);
  573     &jz (&label("ecb_ret"));
  574 
  575 &set_label("ecb_dec_tail");
  576     &movups ($inout0,&QWP(0,$inp));
  577     &cmp    ($len,0x20);
  578     &jb (&label("ecb_dec_one"));
  579     &movups ($inout1,&QWP(0x10,$inp));
  580     &je (&label("ecb_dec_two"));
  581     &movups ($inout2,&QWP(0x20,$inp));
  582     &cmp    ($len,0x40);
  583     &jb (&label("ecb_dec_three"));
  584     &movups ($inout3,&QWP(0x30,$inp));
  585     &je (&label("ecb_dec_four"));
  586     &movups ($inout4,&QWP(0x40,$inp));
  587     &xorps  ($inout5,$inout5);
  588     &call   ("_aesni_decrypt6");
  589     &movups (&QWP(0,$out),$inout0);
  590     &movups (&QWP(0x10,$out),$inout1);
  591     &movups (&QWP(0x20,$out),$inout2);
  592     &movups (&QWP(0x30,$out),$inout3);
  593     &movups (&QWP(0x40,$out),$inout4);
  594     &jmp    (&label("ecb_ret"));
  595 
  596 &set_label("ecb_dec_one",16);
  597     if ($inline)
  598     {   &aesni_inline_generate1("dec"); }
  599     else
  600     {   &call   ("_aesni_decrypt1");    }
  601     &movups (&QWP(0,$out),$inout0);
  602     &jmp    (&label("ecb_ret"));
  603 
  604 &set_label("ecb_dec_two",16);
  605     &call   ("_aesni_decrypt2");
  606     &movups (&QWP(0,$out),$inout0);
  607     &movups (&QWP(0x10,$out),$inout1);
  608     &jmp    (&label("ecb_ret"));
  609 
  610 &set_label("ecb_dec_three",16);
  611     &call   ("_aesni_decrypt3");
  612     &movups (&QWP(0,$out),$inout0);
  613     &movups (&QWP(0x10,$out),$inout1);
  614     &movups (&QWP(0x20,$out),$inout2);
  615     &jmp    (&label("ecb_ret"));
  616 
  617 &set_label("ecb_dec_four",16);
  618     &call   ("_aesni_decrypt4");
  619     &movups (&QWP(0,$out),$inout0);
  620     &movups (&QWP(0x10,$out),$inout1);
  621     &movups (&QWP(0x20,$out),$inout2);
  622     &movups (&QWP(0x30,$out),$inout3);
  623 
  624 &set_label("ecb_ret");
  625     &pxor   ("xmm0","xmm0");        # clear register bank
  626     &pxor   ("xmm1","xmm1");
  627     &pxor   ("xmm2","xmm2");
  628     &pxor   ("xmm3","xmm3");
  629     &pxor   ("xmm4","xmm4");
  630     &pxor   ("xmm5","xmm5");
  631     &pxor   ("xmm6","xmm6");
  632     &pxor   ("xmm7","xmm7");
  633 &function_end("aesni_ecb_encrypt");
  634 
  635 ######################################################################
  636 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
  637 #                         size_t blocks, const AES_KEY *key,
  638 #                         const char *ivec,char *cmac);
  639 #
  640 # Handles only complete blocks, operates on 64-bit counter and
  641 # does not update *ivec! Nor does it finalize CMAC value
  642 # (see engine/eng_aesni.c for details)
  643 #
  644 { my $cmac=$inout1;
  645 &function_begin("aesni_ccm64_encrypt_blocks");
  646     &mov    ($inp,&wparam(0));
  647     &mov    ($out,&wparam(1));
  648     &mov    ($len,&wparam(2));
  649     &mov    ($key,&wparam(3));
  650     &mov    ($rounds_,&wparam(4));
  651     &mov    ($rounds,&wparam(5));
  652     &mov    ($key_,"esp");
  653     &sub    ("esp",60);
  654     &and    ("esp",-16);            # align stack
  655     &mov    (&DWP(48,"esp"),$key_);
  656 
  657     &movdqu ($ivec,&QWP(0,$rounds_));   # load ivec
  658     &movdqu ($cmac,&QWP(0,$rounds));    # load cmac
  659     &mov    ($rounds,&DWP(240,$key));
  660 
  661     # compose byte-swap control mask for pshufb on stack
  662     &mov    (&DWP(0,"esp"),0x0c0d0e0f);
  663     &mov    (&DWP(4,"esp"),0x08090a0b);
  664     &mov    (&DWP(8,"esp"),0x04050607);
  665     &mov    (&DWP(12,"esp"),0x00010203);
  666 
  667     # compose counter increment vector on stack
  668     &mov    ($rounds_,1);
  669     &xor    ($key_,$key_);
  670     &mov    (&DWP(16,"esp"),$rounds_);
  671     &mov    (&DWP(20,"esp"),$key_);
  672     &mov    (&DWP(24,"esp"),$key_);
  673     &mov    (&DWP(28,"esp"),$key_);
  674 
  675     &shl    ($rounds,4);
  676     &mov    ($rounds_,16);
  677     &lea    ($key_,&DWP(0,$key));
  678     &movdqa ($inout3,&QWP(0,"esp"));
  679     &movdqa ($inout0,$ivec);
  680     &lea    ($key,&DWP(32,$key,$rounds));
  681     &sub    ($rounds_,$rounds);
  682     &pshufb ($ivec,$inout3);
  683 
  684 &set_label("ccm64_enc_outer");
  685     &$movekey   ($rndkey0,&QWP(0,$key_));
  686     &mov        ($rounds,$rounds_);
  687     &movups     ($in0,&QWP(0,$inp));
  688 
  689     &xorps      ($inout0,$rndkey0);
  690     &$movekey   ($rndkey1,&QWP(16,$key_));
  691     &xorps      ($rndkey0,$in0);
  692     &xorps      ($cmac,$rndkey0);       # cmac^=inp
  693     &$movekey   ($rndkey0,&QWP(32,$key_));
  694 
  695 &set_label("ccm64_enc2_loop");
  696     &aesenc     ($inout0,$rndkey1);
  697     &aesenc     ($cmac,$rndkey1);
  698     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  699     &add        ($rounds,32);
  700     &aesenc     ($inout0,$rndkey0);
  701     &aesenc     ($cmac,$rndkey0);
  702     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
  703     &jnz        (&label("ccm64_enc2_loop"));
  704     &aesenc     ($inout0,$rndkey1);
  705     &aesenc     ($cmac,$rndkey1);
  706     &paddq      ($ivec,&QWP(16,"esp"));
  707     &dec        ($len);
  708     &aesenclast ($inout0,$rndkey0);
  709     &aesenclast ($cmac,$rndkey0);
  710 
  711     &lea    ($inp,&DWP(16,$inp));
  712     &xorps  ($in0,$inout0);         # inp^=E(ivec)
  713     &movdqa ($inout0,$ivec);
  714     &movups (&QWP(0,$out),$in0);        # save output
  715     &pshufb ($inout0,$inout3);
  716     &lea    ($out,&DWP(16,$out));
  717     &jnz    (&label("ccm64_enc_outer"));
  718 
  719     &mov    ("esp",&DWP(48,"esp"));
  720     &mov    ($out,&wparam(5));
  721     &movups (&QWP(0,$out),$cmac);
  722 
  723     &pxor   ("xmm0","xmm0");        # clear register bank
  724     &pxor   ("xmm1","xmm1");
  725     &pxor   ("xmm2","xmm2");
  726     &pxor   ("xmm3","xmm3");
  727     &pxor   ("xmm4","xmm4");
  728     &pxor   ("xmm5","xmm5");
  729     &pxor   ("xmm6","xmm6");
  730     &pxor   ("xmm7","xmm7");
  731 &function_end("aesni_ccm64_encrypt_blocks");
  732 
  733 &function_begin("aesni_ccm64_decrypt_blocks");
      # Emits the CCM decrypt routine: wparam(2) 16-byte blocks are read
      # from wparam(0), XOR-ed with the encrypted counter block and written
      # to wparam(1); every recovered block is also folded into the running
      # MAC ($cmac).  wparam(3) is the key schedule (round count at offset
      # 240), wparam(4) points to the counter block and wparam(5) to the
      # 16-byte MAC, which is loaded on entry and stored back on exit.
      # The counter block itself is never written back.
      #
      # stack layout (after 16-byte alignment):
      #   0   pshufb byte-swap mask
      #   16  counter increment vector (low dword = 1)
      #   48  saved %esp
  734     &mov    ($inp,&wparam(0));
  735     &mov    ($out,&wparam(1));
  736     &mov    ($len,&wparam(2));
  737     &mov    ($key,&wparam(3));
  738     &mov    ($rounds_,&wparam(4));   # pointer to ivec (temporary use)
  739     &mov    ($rounds,&wparam(5));    # pointer to cmac (temporary use)
  740     &mov    ($key_,"esp");           # remember caller's %esp
  741     &sub    ("esp",60);
  742     &and    ("esp",-16);            # align stack
  743     &mov    (&DWP(48,"esp"),$key_);
  744 
  745     &movdqu ($ivec,&QWP(0,$rounds_));   # load ivec
  746     &movdqu ($cmac,&QWP(0,$rounds));    # load cmac
  747     &mov    ($rounds,&DWP(240,$key));
  748 
  749     # compose byte-swap control mask for pshufb on stack
  750     &mov    (&DWP(0,"esp"),0x0c0d0e0f);
  751     &mov    (&DWP(4,"esp"),0x08090a0b);
  752     &mov    (&DWP(8,"esp"),0x04050607);
  753     &mov    (&DWP(12,"esp"),0x00010203);
  754 
  755     # compose counter increment vector on stack
  756     &mov    ($rounds_,1);
  757     &xor    ($key_,$key_);
  758     &mov    (&DWP(16,"esp"),$rounds_);
  759     &mov    (&DWP(20,"esp"),$key_);
  760     &mov    (&DWP(24,"esp"),$key_);
  761     &mov    (&DWP(28,"esp"),$key_);
  762 
  763     &movdqa ($inout3,&QWP(0,"esp"));    # bswap mask
  764     &movdqa ($inout0,$ivec);            # first counter block, as supplied
  765 
  766     &mov    ($key_,$key);               # backup $key
  767     &mov    ($rounds_,$rounds);         # backup $rounds
  768 
          # keep $ivec byte-reversed from here on so that paddq can
          # increment the counter; it is swapped back before each use
  769     &pshufb ($ivec,$inout3);
          # encrypt the first counter block in $inout0
  770     if ($inline)
  771     {   &aesni_inline_generate1("enc"); }
  772     else
  773     {   &call   ("_aesni_encrypt1");    }
          # Switch to "twisted" round indexing for the 2x interleaved loop:
          # $rounds becomes 16-16*rounds, a negative byte offset, and $key is
          # advanced to 32+16*rounds bytes past the start of the schedule, so
          # ($key,$rounds) walks the remaining round keys as $rounds climbs
          # towards zero.
  774     &shl    ($rounds_,4);
  775     &mov    ($rounds,16);
  776     &movups ($in0,&QWP(0,$inp));        # load inp
  777     &paddq  ($ivec,&QWP(16,"esp"));     # advance counter
  778     &lea    ($inp,&QWP(16,$inp));
  779     &sub    ($rounds,$rounds_);
  780     &lea    ($key,&DWP(32,$key_,$rounds_));
  781     &mov    ($rounds_,$rounds);         # backup twisted $rounds
  782     &jmp    (&label("ccm64_dec_outer"));
  783 
  784 &set_label("ccm64_dec_outer",16);
  785     &xorps  ($in0,$inout0);         # inp ^= E(ivec)
  786     &movdqa ($inout0,$ivec);        # next counter block (still byte-reversed)
  787     &movups (&QWP(0,$out),$in0);        # save output
  788     &lea    ($out,&DWP(16,$out));
  789     &pshufb ($inout0,$inout3);      # swap next counter block back
  790 
  791     &sub    ($len,1);
  792     &jz (&label("ccm64_dec_break"));    # last block: only the MAC remains
  793 
          # Round 0 for both lanes.  $in0 (the block just output) is first
          # XOR-ed with key[0] and then into $cmac, which applies the MAC
          # update and the round-0 key addition in one step.
  794     &$movekey   ($rndkey0,&QWP(0,$key_));
  795     &mov        ($rounds,$rounds_);
  796     &$movekey   ($rndkey1,&QWP(16,$key_));
  797     &xorps      ($in0,$rndkey0);
  798     &xorps      ($inout0,$rndkey0);
  799     &xorps      ($cmac,$in0);       # cmac^=out
  800     &$movekey   ($rndkey0,&QWP(32,$key_));
  801 
          # two rounds per iteration, counter block and MAC interleaved
  802 &set_label("ccm64_dec2_loop");
  803     &aesenc     ($inout0,$rndkey1);
  804     &aesenc     ($cmac,$rndkey1);
  805     &$movekey   ($rndkey1,&QWP(0,$key,$rounds));
  806     &add        ($rounds,32);
  807     &aesenc     ($inout0,$rndkey0);
  808     &aesenc     ($cmac,$rndkey0);
  809     &$movekey   ($rndkey0,&QWP(-16,$key,$rounds));
          # the branch consumes the flags set by the add above
  810     &jnz        (&label("ccm64_dec2_loop"));
  811     &movups     ($in0,&QWP(0,$inp));    # load inp
  812     &paddq      ($ivec,&QWP(16,"esp"));
  813     &aesenc     ($inout0,$rndkey1);
  814     &aesenc     ($cmac,$rndkey1);
  815     &aesenclast ($inout0,$rndkey0);
  816     &aesenclast ($cmac,$rndkey0);
  817     &lea        ($inp,&QWP(16,$inp));
  818     &jmp    (&label("ccm64_dec_outer"));
  819 
  820 &set_label("ccm64_dec_break",16);
          # leave twisted indexing: reload the plain round count and key pointer
  821     &mov    ($rounds,&DWP(240,$key_));
  822     &mov    ($key,$key_);
          # Final MAC step over the last recovered block held in $in0.
          # NOTE(review): the inline variant is handed both $cmac and $in0,
          # whereas the non-inline branch only passes $cmac as an extra
          # argument to &call, unlike every other _aesni_encrypt1 call site
          # here.  Confirm that this branch really operates on $cmac and
          # folds in $in0 before relying on it.
  823     if ($inline)
  824     {   &aesni_inline_generate1("enc",$cmac,$in0);  }
  825     else
  826     {   &call   ("_aesni_encrypt1",$cmac);  }
  827 
  828     &mov    ("esp",&DWP(48,"esp"));     # restore caller's %esp
  829     &mov    ($out,&wparam(5));
  830     &movups (&QWP(0,$out),$cmac);       # store updated cmac
  831 
  832     &pxor   ("xmm0","xmm0");        # clear register bank
  833     &pxor   ("xmm1","xmm1");
  834     &pxor   ("xmm2","xmm2");
  835     &pxor   ("xmm3","xmm3");
  836     &pxor   ("xmm4","xmm4");
  837     &pxor   ("xmm5","xmm5");
  838     &pxor   ("xmm6","xmm6");
  839     &pxor   ("xmm7","xmm7");
  840 &function_end("aesni_ccm64_decrypt_blocks");
  841 }
  842 
  843 ######################################################################
  844 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
  845 #                         size_t blocks, const AES_KEY *key,
  846 #                         const char *ivec);
  847 #
  848 # Handles only complete blocks, operates on 32-bit counter and
  849 # does not update *ivec! (see crypto/modes/ctr128.c for details)
  850 #
  851 # stack layout:
  852 #   0   pshufb mask
  853 #   16  vector addend: 0,6,6,6
  854 #   32  counter-less ivec
  855 #   48  1st triplet of counter vector
  856 #   64  2nd triplet of counter vector
  857 #   80  saved %esp
  858 
  859 &function_begin("aesni_ctr32_encrypt_blocks");
      # Emits the CTR routine with a 32-bit counter: wparam(0)=in,
      # wparam(1)=out, wparam(2)=number of blocks, wparam(3)=key schedule,
      # wparam(4)=ivec.  A single block takes a shortcut; otherwise blocks
      # are processed six at a time with a 1..5 block tail.
  860     &mov    ($inp,&wparam(0));
  861     &mov    ($out,&wparam(1));
  862     &mov    ($len,&wparam(2));
  863     &mov    ($key,&wparam(3));
  864     &mov    ($rounds_,&wparam(4));   # pointer to ivec (temporary use)
  865     &mov    ($key_,"esp");           # remember caller's %esp
  866     &sub    ("esp",88);
  867     &and    ("esp",-16);            # align stack
  868     &mov    (&DWP(80,"esp"),$key_);
  869 
  870     &cmp    ($len,1);
  871     &je (&label("ctr32_one_shortcut"));
  872 
  873     &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
  874 
  875     # compose byte-swap control mask for pshufb on stack
  876     &mov    (&DWP(0,"esp"),0x0c0d0e0f);
  877     &mov    (&DWP(4,"esp"),0x08090a0b);
  878     &mov    (&DWP(8,"esp"),0x04050607);
  879     &mov    (&DWP(12,"esp"),0x00010203);
  880 
  881     # compose counter increment vector on stack
  882     &mov    ($rounds,6);
  883     &xor    ($key_,$key_);
  884     &mov    (&DWP(16,"esp"),$rounds);
  885     &mov    (&DWP(20,"esp"),$rounds);
  886     &mov    (&DWP(24,"esp"),$rounds);
  887     &mov    (&DWP(28,"esp"),$key_);
  888 
  889     &pextrd ($rounds_,$inout5,3);       # pull 32-bit counter
  890     &pinsrd ($inout5,$key_,3);      # wipe 32-bit counter
  891 
  892     &mov    ($rounds,&DWP(240,$key));   # key->rounds
  893 
  894     # compose 2 vectors of 3x32-bit counters
          # $rndkey0 = (ctr, ctr+1, ctr+2, 0), $rndkey1 = (ctr+3, ctr+4, ctr+5, 0),
          # with $rounds_ and $key_ serving as the two running counters
  895     &bswap  ($rounds_);
  896     &pxor   ($rndkey0,$rndkey0);
  897     &pxor   ($rndkey1,$rndkey1);
  898     &movdqa ($inout0,&QWP(0,"esp"));    # load byte-swap mask
  899     &pinsrd ($rndkey0,$rounds_,0);
  900     &lea    ($key_,&DWP(3,$rounds_));
  901     &pinsrd ($rndkey1,$key_,0);
  902     &inc    ($rounds_);
  903     &pinsrd ($rndkey0,$rounds_,1);
  904     &inc    ($key_);
  905     &pinsrd ($rndkey1,$key_,1);
  906     &inc    ($rounds_);
  907     &pinsrd ($rndkey0,$rounds_,2);
  908     &inc    ($key_);
  909     &pinsrd ($rndkey1,$key_,2);
  910     &movdqa (&QWP(48,"esp"),$rndkey0);  # save 1st triplet
  911     &pshufb ($rndkey0,$inout0);     # byte swap
  912     &movdqu ($inout4,&QWP(0,$key));     # key[0]
  913     &movdqa (&QWP(64,"esp"),$rndkey1);  # save 2nd triplet
  914     &pshufb ($rndkey1,$inout0);     # byte swap
  915 
  916     &pshufd ($inout0,$rndkey0,3<<6);    # place counter to upper dword
  917     &pshufd ($inout1,$rndkey0,2<<6);
  918     &cmp    ($len,6);
  919     &jb (&label("ctr32_tail"));
  920     &pxor   ($inout5,$inout4);      # counter-less ivec^key[0]
          # twisted round indexing for _aesni_encrypt6_enter: $rounds_ keeps
          # 16-16*rounds and $key points 32+16*rounds bytes into the schedule
  921     &shl    ($rounds,4);
  922     &mov    ($rounds_,16);
  923     &movdqa (&QWP(32,"esp"),$inout5);   # save counter-less ivec^key[0]
  924     &mov    ($key_,$key);           # backup $key
  925     &sub    ($rounds_,$rounds);     # backup twisted $rounds
  926     &lea    ($key,&DWP(32,$key,$rounds));
  927     &sub    ($len,6);
  928     &jmp    (&label("ctr32_loop6"));
  929 
  930 &set_label("ctr32_loop6",16);
  931     # inlining _aesni_encrypt6's prologue gives ~6% improvement...
          # Spread the two byte-swapped triplets into six counter blocks and
          # XOR in the saved (counter-less ivec ^ key[0]); that both merges
          # the ivec and performs the round-0 key addition.  The first
          # aesenc round is issued here as well, before entering the helper.
  932     &pshufd ($inout2,$rndkey0,1<<6);
  933     &movdqa ($rndkey0,&QWP(32,"esp"));  # pull counter-less ivec
  934     &pshufd ($inout3,$rndkey1,3<<6);
  935     &pxor       ($inout0,$rndkey0); # merge counter-less ivec
  936     &pshufd ($inout4,$rndkey1,2<<6);
  937     &pxor       ($inout1,$rndkey0);
  938     &pshufd ($inout5,$rndkey1,1<<6);
  939     &$movekey   ($rndkey1,&QWP(16,$key_));
  940     &pxor       ($inout2,$rndkey0);
  941     &pxor       ($inout3,$rndkey0);
  942     &aesenc     ($inout0,$rndkey1);
  943     &pxor       ($inout4,$rndkey0);
  944     &pxor       ($inout5,$rndkey0);
  945     &aesenc     ($inout1,$rndkey1);
  946     &$movekey   ($rndkey0,&QWP(32,$key_));
  947     &mov        ($rounds,$rounds_);
  948     &aesenc     ($inout2,$rndkey1);
  949     &aesenc     ($inout3,$rndkey1);
  950     &aesenc     ($inout4,$rndkey1);
  951     &aesenc     ($inout5,$rndkey1);
  952 
  953     &call       (&label("_aesni_encrypt6_enter"));
  954 
          # XOR the key stream into six input blocks and store them, while
          # advancing both counter triplets by 6 and pre-building the first
          # two counter blocks of the next iteration
  955     &movups ($rndkey1,&QWP(0,$inp));
  956     &movups ($rndkey0,&QWP(0x10,$inp));
  957     &xorps  ($inout0,$rndkey1);
  958     &movups ($rndkey1,&QWP(0x20,$inp));
  959     &xorps  ($inout1,$rndkey0);
  960     &movups (&QWP(0,$out),$inout0);
  961     &movdqa ($rndkey0,&QWP(16,"esp"));  # load increment
  962     &xorps  ($inout2,$rndkey1);
  963     &movdqa ($rndkey1,&QWP(64,"esp"));  # load 2nd triplet
  964     &movups (&QWP(0x10,$out),$inout1);
  965     &movups (&QWP(0x20,$out),$inout2);
  966 
  967     &paddd  ($rndkey1,$rndkey0);        # 2nd triplet increment
  968     &paddd  ($rndkey0,&QWP(48,"esp"));  # 1st triplet increment
  969     &movdqa ($inout0,&QWP(0,"esp"));    # load byte swap mask
  970 
  971     &movups ($inout1,&QWP(0x30,$inp));
  972     &movups ($inout2,&QWP(0x40,$inp));
  973     &xorps  ($inout3,$inout1);
  974     &movups ($inout1,&QWP(0x50,$inp));
  975     &lea    ($inp,&DWP(0x60,$inp));
  976     &movdqa (&QWP(48,"esp"),$rndkey0);  # save 1st triplet
  977     &pshufb ($rndkey0,$inout0);     # byte swap
  978     &xorps  ($inout4,$inout2);
  979     &movups (&QWP(0x30,$out),$inout3);
  980     &xorps  ($inout5,$inout1);
  981     &movdqa (&QWP(64,"esp"),$rndkey1);  # save 2nd triplet
  982     &pshufb ($rndkey1,$inout0);     # byte swap
  983     &movups (&QWP(0x40,$out),$inout4);
  984     &pshufd ($inout0,$rndkey0,3<<6);
  985     &movups (&QWP(0x50,$out),$inout5);
  986     &lea    ($out,&DWP(0x60,$out));
  987 
  988     &pshufd ($inout1,$rndkey0,2<<6);
  989     &sub    ($len,6);
  990     &jnc    (&label("ctr32_loop6"));    # loop while at least 6 blocks remained
  991 
  992     &add    ($len,6);               # undo the last subtraction
  993     &jz (&label("ctr32_ret"));
          # leave twisted indexing and strip key[0] from the saved ivec again
  994     &movdqu ($inout5,&QWP(0,$key_));
  995     &mov    ($key,$key_);
  996     &pxor   ($inout5,&QWP(32,"esp"));   # restore counter-less ivec
  997     &mov    ($rounds,&DWP(240,$key_));  # restore $rounds
  998 
      # 1..5 remaining blocks; $inout5 holds the counter-less ivec and
      # $inout0/$inout1 already hold the first two counters
  999 &set_label("ctr32_tail");
 1000     &por    ($inout0,$inout5);
 1001     &cmp    ($len,2);
 1002     &jb (&label("ctr32_one"));
 1003 
          # the je below still tests the result of the cmp against 2
 1004     &pshufd ($inout2,$rndkey0,1<<6);
 1005     &por    ($inout1,$inout5);
 1006     &je (&label("ctr32_two"));
 1007 
 1008     &pshufd ($inout3,$rndkey1,3<<6);
 1009     &por    ($inout2,$inout5);
 1010     &cmp    ($len,4);
 1011     &jb (&label("ctr32_three"));
 1012 
          # the je below still tests the result of the cmp against 4
 1013     &pshufd ($inout4,$rndkey1,2<<6);
 1014     &por    ($inout3,$inout5);
 1015     &je (&label("ctr32_four"));
 1016 
          # five blocks: run the 6-way helper, only five results are stored
 1017     &por    ($inout4,$inout5);
 1018     &call   ("_aesni_encrypt6");
 1019     &movups ($rndkey1,&QWP(0,$inp));
 1020     &movups ($rndkey0,&QWP(0x10,$inp));
 1021     &xorps  ($inout0,$rndkey1);
 1022     &movups ($rndkey1,&QWP(0x20,$inp));
 1023     &xorps  ($inout1,$rndkey0);
 1024     &movups ($rndkey0,&QWP(0x30,$inp));
 1025     &xorps  ($inout2,$rndkey1);
 1026     &movups ($rndkey1,&QWP(0x40,$inp));
 1027     &xorps  ($inout3,$rndkey0);
 1028     &movups (&QWP(0,$out),$inout0);
 1029     &xorps  ($inout4,$rndkey1);
 1030     &movups (&QWP(0x10,$out),$inout1);
 1031     &movups (&QWP(0x20,$out),$inout2);
 1032     &movups (&QWP(0x30,$out),$inout3);
 1033     &movups (&QWP(0x40,$out),$inout4);
 1034     &jmp    (&label("ctr32_ret"));
 1035 
      # exactly one block requested: encrypt the ivec as supplied, without
      # building the mask and counter vectors on the stack
 1036 &set_label("ctr32_one_shortcut",16);
 1037     &movups ($inout0,&QWP(0,$rounds_)); # load ivec
 1038     &mov    ($rounds,&DWP(240,$key));
 1039     
 1040 &set_label("ctr32_one");
 1041     if ($inline)
 1042     {   &aesni_inline_generate1("enc"); }
 1043     else
 1044     {   &call   ("_aesni_encrypt1");    }
 1045     &movups ($in0,&QWP(0,$inp));
 1046     &xorps  ($in0,$inout0);
 1047     &movups (&QWP(0,$out),$in0);
 1048     &jmp    (&label("ctr32_ret"));
 1049 
 1050 &set_label("ctr32_two",16);
 1051     &call   ("_aesni_encrypt2");
 1052     &movups ($inout3,&QWP(0,$inp));
 1053     &movups ($inout4,&QWP(0x10,$inp));
 1054     &xorps  ($inout0,$inout3);
 1055     &xorps  ($inout1,$inout4);
 1056     &movups (&QWP(0,$out),$inout0);
 1057     &movups (&QWP(0x10,$out),$inout1);
 1058     &jmp    (&label("ctr32_ret"));
 1059 
 1060 &set_label("ctr32_three",16);
 1061     &call   ("_aesni_encrypt3");
 1062     &movups ($inout3,&QWP(0,$inp));
 1063     &movups ($inout4,&QWP(0x10,$inp));
 1064     &xorps  ($inout0,$inout3);
 1065     &movups ($inout5,&QWP(0x20,$inp));
 1066     &xorps  ($inout1,$inout4);
 1067     &movups (&QWP(0,$out),$inout0);
 1068     &xorps  ($inout2,$inout5);
 1069     &movups (&QWP(0x10,$out),$inout1);
 1070     &movups (&QWP(0x20,$out),$inout2);
 1071     &jmp    (&label("ctr32_ret"));
 1072 
 1073 &set_label("ctr32_four",16);
 1074     &call   ("_aesni_encrypt4");
 1075     &movups ($inout4,&QWP(0,$inp));
 1076     &movups ($inout5,&QWP(0x10,$inp));
 1077     &movups ($rndkey1,&QWP(0x20,$inp));
 1078     &xorps  ($inout0,$inout4);
 1079     &movups ($rndkey0,&QWP(0x30,$inp));
 1080     &xorps  ($inout1,$inout5);
 1081     &movups (&QWP(0,$out),$inout0);
 1082     &xorps  ($inout2,$rndkey1);
 1083     &movups (&QWP(0x10,$out),$inout1);
 1084     &xorps  ($inout3,$rndkey0);
 1085     &movups (&QWP(0x20,$out),$inout2);
 1086     &movups (&QWP(0x30,$out),$inout3);
 1087 
      # common exit: wipe the xmm registers and stack slots 32..79 (saved
      # ivec and counter triplets), then restore the caller's %esp
 1088 &set_label("ctr32_ret");
 1089     &pxor   ("xmm0","xmm0");        # clear register bank
 1090     &pxor   ("xmm1","xmm1");
 1091     &pxor   ("xmm2","xmm2");
 1092     &pxor   ("xmm3","xmm3");
 1093     &pxor   ("xmm4","xmm4");
 1094     &movdqa (&QWP(32,"esp"),"xmm0");    # clear stack
 1095     &pxor   ("xmm5","xmm5");
 1096     &movdqa (&QWP(48,"esp"),"xmm0");
 1097     &pxor   ("xmm6","xmm6");
 1098     &movdqa (&QWP(64,"esp"),"xmm0");
 1099     &pxor   ("xmm7","xmm7");
 1100     &mov    ("esp",&DWP(80,"esp"));
 1101 &function_end("aesni_ctr32_encrypt_blocks");
 1102 
 1103 ######################################################################
 1104 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
 1105 #   const AES_KEY *key1, const AES_KEY *key2,
 1106 #   const unsigned char iv[16]);
 1107 #
 1108 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
 1109 
 1110 &function_begin("aesni_xts_encrypt");
      # Emits the XTS encrypt routine.  The 16-byte block at wparam(5) is
      # first encrypted under key2 (wparam(4)) to obtain the initial tweak;
      # data from wparam(0) is then encrypted under key1 (wparam(3)) into
      # wparam(1), six blocks per iteration with a 1..5 block tail.  If
      # wparam(2) (byte length) is not a multiple of 16, the trailing bytes
      # are handled by swapping them with the last full output block and
      # re-encrypting that block (xts_enc_steal).
      #
      # stack layout (after 16-byte alignment):
      #   16*0..16*5  per-block tweaks
      #   16*6        magic constant (dwords 0x87,0,1,0)
      #   16*7+0      original $len, later $len%16
      #   16*7+4      saved %esp
 1111     &mov    ($key,&wparam(4));      # key2
 1112     &mov    ($inp,&wparam(5));      # clear-text tweak
 1113 
 1114     &mov    ($rounds,&DWP(240,$key));   # key2->rounds
 1115     &movups ($inout0,&QWP(0,$inp));
 1116     if ($inline)
 1117     {   &aesni_inline_generate1("enc"); }
 1118     else
 1119     {   &call   ("_aesni_encrypt1");    }
 1120 
 1121     &mov    ($inp,&wparam(0));
 1122     &mov    ($out,&wparam(1));
 1123     &mov    ($len,&wparam(2));
 1124     &mov    ($key,&wparam(3));      # key1
 1125 
 1126     &mov    ($key_,"esp");          # remember caller's %esp
 1127     &sub    ("esp",16*7+8);
 1128     &mov    ($rounds,&DWP(240,$key));   # key1->rounds
 1129     &and    ("esp",-16);            # align stack
 1130 
 1131     &mov    (&DWP(16*6+0,"esp"),0x87);  # compose the magic constant
 1132     &mov    (&DWP(16*6+4,"esp"),0);
 1133     &mov    (&DWP(16*6+8,"esp"),1);
 1134     &mov    (&DWP(16*6+12,"esp"),0);
 1135     &mov    (&DWP(16*7+0,"esp"),$len);  # save original $len
 1136     &mov    (&DWP(16*7+4,"esp"),$key_); # save original %esp
 1137 
          # $tweak = encrypted iv; $twtmp receives the per-dword sign bits
          # of $tweak, from which the carries of the next doubling are taken
 1138     &movdqa ($tweak,$inout0);
 1139     &pxor   ($twtmp,$twtmp);
 1140     &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
 1141     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1142 
 1143     &and    ($len,-16);             # whole blocks only
 1144     &mov    ($key_,$key);           # backup $key
 1145     &mov    ($rounds_,$rounds);     # backup $rounds
 1146     &sub    ($len,16*6);
 1147     &jc (&label("xts_enc_short"));  # fewer than six blocks
 1148 
          # twisted round indexing for _aesni_encrypt6_enter: $rounds_ keeps
          # 16-16*rounds and $key points 32+16*rounds bytes into the schedule
 1149     &shl    ($rounds,4);
 1150     &mov    ($rounds_,16);
 1151     &sub    ($rounds_,$rounds);
 1152     &lea    ($key,&DWP(32,$key,$rounds));
 1153     &jmp    (&label("xts_enc_loop6"));
 1154 
 1155 &set_label("xts_enc_loop6",16);
          # Generate six consecutive tweaks.  Each step stores the current
          # tweak, doubles it with paddq and XORs back the carry/residue
          # selected through $twmask.  The first five go to stack slots
          # 16*0..16*4; the sixth is built in $inout5 and saved at 16*5.
 1156     for ($i=0;$i<4;$i++) {
 1157         &pshufd ($twres,$twtmp,0x13);
 1158         &pxor   ($twtmp,$twtmp);
 1159         &movdqa (&QWP(16*$i,"esp"),$tweak);
 1160         &paddq  ($tweak,$tweak);    # &psllq($tweak,1);
 1161         &pand   ($twres,$twmask);   # isolate carry and residue
 1162         &pcmpgtd    ($twtmp,$tweak);    # broadcast upper bits
 1163         &pxor   ($tweak,$twres);
 1164     }
 1165     &pshufd ($inout5,$twtmp,0x13);
 1166     &movdqa (&QWP(16*$i++,"esp"),$tweak);
 1167     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1168      &$movekey  ($rndkey0,&QWP(0,$key_));
 1169     &pand   ($inout5,$twmask);      # isolate carry and residue
 1170      &movups    ($inout0,&QWP(0,$inp)); # load input
 1171     &pxor   ($inout5,$tweak);
 1172 
 1173     # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
 1174     &mov    ($rounds,$rounds_);     # restore $rounds
 1175     &movdqu ($inout1,&QWP(16*1,$inp));
 1176      &xorps     ($inout0,$rndkey0); # input^=rndkey[0]
 1177     &movdqu ($inout2,&QWP(16*2,$inp));
 1178      &pxor      ($inout1,$rndkey0);
 1179     &movdqu ($inout3,&QWP(16*3,$inp));
 1180      &pxor      ($inout2,$rndkey0);
 1181     &movdqu ($inout4,&QWP(16*4,$inp));
 1182      &pxor      ($inout3,$rndkey0);
 1183     &movdqu ($rndkey1,&QWP(16*5,$inp));
 1184      &pxor      ($inout4,$rndkey0);
 1185     &lea    ($inp,&DWP(16*6,$inp));
 1186     &pxor   ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1187     &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
 1188     &pxor   ($inout5,$rndkey1);     # 6th input ^= 6th tweak
 1189 
 1190      &$movekey  ($rndkey1,&QWP(16,$key_));
 1191     &pxor   ($inout1,&QWP(16*1,"esp"));
 1192     &pxor   ($inout2,&QWP(16*2,"esp"));
 1193      &aesenc    ($inout0,$rndkey1);
 1194     &pxor   ($inout3,&QWP(16*3,"esp"));
 1195     &pxor   ($inout4,&QWP(16*4,"esp"));
 1196      &aesenc    ($inout1,$rndkey1);
 1197     &pxor       ($inout5,$rndkey0);
 1198      &$movekey  ($rndkey0,&QWP(32,$key_));
 1199      &aesenc    ($inout2,$rndkey1);
 1200      &aesenc    ($inout3,$rndkey1);
 1201      &aesenc    ($inout4,$rndkey1);
 1202      &aesenc    ($inout5,$rndkey1);
 1203     &call       (&label("_aesni_encrypt6_enter"));
 1204 
          # XOR the outputs with their tweaks and store them; interleaved
          # with that, the tweak after the sixth one is prepared for the
          # next iteration (or for the tail code)
 1205     &movdqa ($tweak,&QWP(16*5,"esp"));  # last tweak
 1206        &pxor    ($twtmp,$twtmp);
 1207     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1208        &pcmpgtd ($twtmp,$tweak);        # broadcast upper bits
 1209     &xorps  ($inout1,&QWP(16*1,"esp"));
 1210     &movups (&QWP(16*0,$out),$inout0);  # write output
 1211     &xorps  ($inout2,&QWP(16*2,"esp"));
 1212     &movups (&QWP(16*1,$out),$inout1);
 1213     &xorps  ($inout3,&QWP(16*3,"esp"));
 1214     &movups (&QWP(16*2,$out),$inout2);
 1215     &xorps  ($inout4,&QWP(16*4,"esp"));
 1216     &movups (&QWP(16*3,$out),$inout3);
 1217     &xorps  ($inout5,$tweak);
 1218     &movups (&QWP(16*4,$out),$inout4);
 1219        &pshufd  ($twres,$twtmp,0x13);
 1220     &movups (&QWP(16*5,$out),$inout5);
 1221     &lea    ($out,&DWP(16*6,$out));
 1222        &movdqa  ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
 1223 
 1224     &pxor   ($twtmp,$twtmp);
 1225     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1226     &pand   ($twres,$twmask);       # isolate carry and residue
 1227     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1228     &pxor   ($tweak,$twres);
 1229 
 1230     &sub    ($len,16*6);
 1231     &jnc    (&label("xts_enc_loop6"));
 1232 
          # leave twisted indexing before entering the tail code
 1233     &mov    ($rounds,&DWP(240,$key_));  # restore $rounds
 1234     &mov    ($key,$key_);           # restore $key
 1235     &mov    ($rounds_,$rounds);
 1236 
      # 0..5 whole blocks remain; $tweak is the tweak for the next block
 1237 &set_label("xts_enc_short");
 1238     &add    ($len,16*6);            # undo the last subtraction
 1239     &jz (&label("xts_enc_done6x"));
 1240 
 1241     &movdqa ($inout3,$tweak);       # put aside previous tweak
 1242     &cmp    ($len,0x20);
 1243     &jb (&label("xts_enc_one"));
 1244 
          # the je after this tweak update still tests the cmp against 0x20
 1245     &pshufd ($twres,$twtmp,0x13);
 1246     &pxor   ($twtmp,$twtmp);
 1247     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1248     &pand   ($twres,$twmask);       # isolate carry and residue
 1249     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1250     &pxor   ($tweak,$twres);
 1251     &je (&label("xts_enc_two"));
 1252 
 1253     &pshufd ($twres,$twtmp,0x13);
 1254     &pxor   ($twtmp,$twtmp);
 1255     &movdqa ($inout4,$tweak);       # put aside previous tweak
 1256     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1257     &pand   ($twres,$twmask);       # isolate carry and residue
 1258     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1259     &pxor   ($tweak,$twres);
 1260     &cmp    ($len,0x40);
 1261     &jb (&label("xts_enc_three"));
 1262 
          # the je after the two stores still tests the cmp against 0x40
 1263     &pshufd ($twres,$twtmp,0x13);
 1264     &pxor   ($twtmp,$twtmp);
 1265     &movdqa ($inout5,$tweak);       # put aside previous tweak
 1266     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1267     &pand   ($twres,$twmask);       # isolate carry and residue
 1268     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1269     &pxor   ($tweak,$twres);
 1270     &movdqa (&QWP(16*0,"esp"),$inout3);
 1271     &movdqa (&QWP(16*1,"esp"),$inout4);
 1272     &je (&label("xts_enc_four"));
 1273 
          # five blocks: tweaks 1..4 live at 16*0..16*3, the fifth is built
          # in $inout5 and saved at 16*4
 1274     &movdqa (&QWP(16*2,"esp"),$inout5);
 1275     &pshufd ($inout5,$twtmp,0x13);
 1276     &movdqa (&QWP(16*3,"esp"),$tweak);
 1277     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1278     &pand   ($inout5,$twmask);      # isolate carry and residue
 1279     &pxor   ($inout5,$tweak);
 1280 
 1281     &movdqu ($inout0,&QWP(16*0,$inp));  # load input
 1282     &movdqu ($inout1,&QWP(16*1,$inp));
 1283     &movdqu ($inout2,&QWP(16*2,$inp));
 1284     &pxor   ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1285     &movdqu ($inout3,&QWP(16*3,$inp));
 1286     &pxor   ($inout1,&QWP(16*1,"esp"));
 1287     &movdqu ($inout4,&QWP(16*4,$inp));
 1288     &pxor   ($inout2,&QWP(16*2,"esp"));
 1289     &lea    ($inp,&DWP(16*5,$inp));
 1290     &pxor   ($inout3,&QWP(16*3,"esp"));
 1291     &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
 1292     &pxor   ($inout4,$inout5);
 1293 
          # 6-way helper; only the first five results are stored below
 1294     &call   ("_aesni_encrypt6");
 1295 
 1296     &movaps ($tweak,&QWP(16*4,"esp"));  # last tweak
 1297     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1298     &xorps  ($inout1,&QWP(16*1,"esp"));
 1299     &xorps  ($inout2,&QWP(16*2,"esp"));
 1300     &movups (&QWP(16*0,$out),$inout0);  # write output
 1301     &xorps  ($inout3,&QWP(16*3,"esp"));
 1302     &movups (&QWP(16*1,$out),$inout1);
 1303     &xorps  ($inout4,$tweak);
 1304     &movups (&QWP(16*2,$out),$inout2);
 1305     &movups (&QWP(16*3,$out),$inout3);
 1306     &movups (&QWP(16*4,$out),$inout4);
 1307     &lea    ($out,&DWP(16*5,$out));
 1308     &jmp    (&label("xts_enc_done"));
 1309 
 1310 &set_label("xts_enc_one",16);
 1311     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1312     &lea    ($inp,&DWP(16*1,$inp));
 1313     &xorps  ($inout0,$inout3);      # input^=tweak
 1314     if ($inline)
 1315     {   &aesni_inline_generate1("enc"); }
 1316     else
 1317     {   &call   ("_aesni_encrypt1");    }
 1318     &xorps  ($inout0,$inout3);      # output^=tweak
 1319     &movups (&QWP(16*0,$out),$inout0);  # write output
 1320     &lea    ($out,&DWP(16*1,$out));
 1321 
 1322     &movdqa ($tweak,$inout3);       # last tweak
 1323     &jmp    (&label("xts_enc_done"));
 1324 
 1325 &set_label("xts_enc_two",16);
 1326     &movaps ($inout4,$tweak);       # put aside last tweak
 1327 
 1328     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1329     &movups ($inout1,&QWP(16*1,$inp));
 1330     &lea    ($inp,&DWP(16*2,$inp));
 1331     &xorps  ($inout0,$inout3);      # input^=tweak
 1332     &xorps  ($inout1,$inout4);
 1333 
 1334     &call   ("_aesni_encrypt2");
 1335 
 1336     &xorps  ($inout0,$inout3);      # output^=tweak
 1337     &xorps  ($inout1,$inout4);
 1338     &movups (&QWP(16*0,$out),$inout0);  # write output
 1339     &movups (&QWP(16*1,$out),$inout1);
 1340     &lea    ($out,&DWP(16*2,$out));
 1341 
 1342     &movdqa ($tweak,$inout4);       # last tweak
 1343     &jmp    (&label("xts_enc_done"));
 1344 
 1345 &set_label("xts_enc_three",16);
 1346     &movaps ($inout5,$tweak);       # put aside last tweak
 1347     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1348     &movups ($inout1,&QWP(16*1,$inp));
 1349     &movups ($inout2,&QWP(16*2,$inp));
 1350     &lea    ($inp,&DWP(16*3,$inp));
 1351     &xorps  ($inout0,$inout3);      # input^=tweak
 1352     &xorps  ($inout1,$inout4);
 1353     &xorps  ($inout2,$inout5);
 1354 
 1355     &call   ("_aesni_encrypt3");
 1356 
 1357     &xorps  ($inout0,$inout3);      # output^=tweak
 1358     &xorps  ($inout1,$inout4);
 1359     &xorps  ($inout2,$inout5);
 1360     &movups (&QWP(16*0,$out),$inout0);  # write output
 1361     &movups (&QWP(16*1,$out),$inout1);
 1362     &movups (&QWP(16*2,$out),$inout2);
 1363     &lea    ($out,&DWP(16*3,$out));
 1364 
 1365     &movdqa ($tweak,$inout5);       # last tweak
 1366     &jmp    (&label("xts_enc_done"));
 1367 
      # four blocks: $inout3 is needed for data here, so tweaks 1 and 2 are
      # taken from the stack copies at 16*0 and 16*1; tweak 3 is in $inout5
      # and tweak 4 is moved into $inout4
 1368 &set_label("xts_enc_four",16);
 1369     &movaps ($inout4,$tweak);       # put aside last tweak
 1370 
 1371     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1372     &movups ($inout1,&QWP(16*1,$inp));
 1373     &movups ($inout2,&QWP(16*2,$inp));
 1374     &xorps  ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1375     &movups ($inout3,&QWP(16*3,$inp));
 1376     &lea    ($inp,&DWP(16*4,$inp));
 1377     &xorps  ($inout1,&QWP(16*1,"esp"));
 1378     &xorps  ($inout2,$inout5);
 1379     &xorps  ($inout3,$inout4);
 1380 
 1381     &call   ("_aesni_encrypt4");
 1382 
 1383     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1384     &xorps  ($inout1,&QWP(16*1,"esp"));
 1385     &xorps  ($inout2,$inout5);
 1386     &movups (&QWP(16*0,$out),$inout0);  # write output
 1387     &xorps  ($inout3,$inout4);
 1388     &movups (&QWP(16*1,$out),$inout1);
 1389     &movups (&QWP(16*2,$out),$inout2);
 1390     &movups (&QWP(16*3,$out),$inout3);
 1391     &lea    ($out,&DWP(16*4,$out));
 1392 
 1393     &movdqa ($tweak,$inout4);       # last tweak
 1394     &jmp    (&label("xts_enc_done"));
 1395 
 1396 &set_label("xts_enc_done6x",16);        # $tweak is pre-calculated
 1397     &mov    ($len,&DWP(16*7+0,"esp"));  # restore original $len
 1398     &and    ($len,15);
 1399     &jz (&label("xts_enc_ret"));
 1400     &movdqa ($inout3,$tweak);       # tweak for the stolen block
 1401     &mov    (&DWP(16*7+0,"esp"),$len);  # save $len%16
 1402     &jmp    (&label("xts_enc_steal"));
 1403 
      # reached from the 1..5 block tails, where $tweak is the tweak of the
      # last block written; advance it once more if a partial block follows
 1404 &set_label("xts_enc_done",16);
 1405     &mov    ($len,&DWP(16*7+0,"esp"));  # restore original $len
 1406     &pxor   ($twtmp,$twtmp);
 1407     &and    ($len,15);
 1408     &jz (&label("xts_enc_ret"));
 1409 
 1410     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1411     &mov    (&DWP(16*7+0,"esp"),$len);  # save $len%16
 1412     &pshufd ($inout3,$twtmp,0x13);
 1413     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1414     &pand   ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
 1415     &pxor   ($inout3,$tweak);
 1416 
      # Byte-wise swap for the partial block: each trailing input byte
      # replaces a byte of the last full output block (at $out-16), and the
      # displaced byte becomes the corresponding trailing output byte.
      # $rounds and $key serve as byte scratch registers here, which is why
      # both are restored afterwards.
 1417 &set_label("xts_enc_steal");
 1418     &movz   ($rounds,&BP(0,$inp));
 1419     &movz   ($key,&BP(-16,$out));
 1420     &lea    ($inp,&DWP(1,$inp));
 1421     &mov    (&BP(-16,$out),&LB($rounds));
 1422     &mov    (&BP(0,$out),&LB($key));
 1423     &lea    ($out,&DWP(1,$out));
 1424     &sub    ($len,1);
 1425     &jnz    (&label("xts_enc_steal"));
 1426 
 1427     &sub    ($out,&DWP(16*7+0,"esp"));  # rewind $out
 1428     &mov    ($key,$key_);           # restore $key
 1429     &mov    ($rounds,$rounds_);     # restore $rounds
 1430 
          # re-encrypt the patched block in place with the tweak in $inout3
 1431     &movups ($inout0,&QWP(-16,$out));   # load input
 1432     &xorps  ($inout0,$inout3);      # input^=tweak
 1433     if ($inline)
 1434     {   &aesni_inline_generate1("enc"); }
 1435     else
 1436     {   &call   ("_aesni_encrypt1");    }
 1437     &xorps  ($inout0,$inout3);      # output^=tweak
 1438     &movups (&QWP(-16,$out),$inout0);   # write output
 1439 
      # common exit: wipe the xmm registers and the six tweak slots on the
      # stack, then restore the caller's %esp
 1440 &set_label("xts_enc_ret");
 1441     &pxor   ("xmm0","xmm0");        # clear register bank
 1442     &pxor   ("xmm1","xmm1");
 1443     &pxor   ("xmm2","xmm2");
 1444     &movdqa (&QWP(16*0,"esp"),"xmm0");  # clear stack
 1445     &pxor   ("xmm3","xmm3");
 1446     &movdqa (&QWP(16*1,"esp"),"xmm0");
 1447     &pxor   ("xmm4","xmm4");
 1448     &movdqa (&QWP(16*2,"esp"),"xmm0");
 1449     &pxor   ("xmm5","xmm5");
 1450     &movdqa (&QWP(16*3,"esp"),"xmm0");
 1451     &pxor   ("xmm6","xmm6");
 1452     &movdqa (&QWP(16*4,"esp"),"xmm0");
 1453     &pxor   ("xmm7","xmm7");
 1454     &movdqa (&QWP(16*5,"esp"),"xmm0");
 1455     &mov    ("esp",&DWP(16*7+4,"esp")); # restore %esp
 1456 &function_end("aesni_xts_encrypt");
 1457 
 1458 &function_begin("aesni_xts_decrypt");
 1459     &mov    ($key,&wparam(4));      # key2
 1460     &mov    ($inp,&wparam(5));      # clear-text tweak
 1461 
 1462     &mov    ($rounds,&DWP(240,$key));   # key2->rounds
 1463     &movups ($inout0,&QWP(0,$inp));
 1464     if ($inline)
 1465     {   &aesni_inline_generate1("enc"); }
 1466     else
 1467     {   &call   ("_aesni_encrypt1");    }
 1468 
 1469     &mov    ($inp,&wparam(0));
 1470     &mov    ($out,&wparam(1));
 1471     &mov    ($len,&wparam(2));
 1472     &mov    ($key,&wparam(3));      # key1
 1473 
 1474     &mov    ($key_,"esp");
 1475     &sub    ("esp",16*7+8);
 1476     &and    ("esp",-16);            # align stack
 1477 
 1478     &xor    ($rounds_,$rounds_);        # if(len%16) len-=16;
 1479     &test   ($len,15);
 1480     &setnz  (&LB($rounds_));
 1481     &shl    ($rounds_,4);
 1482     &sub    ($len,$rounds_);
 1483 
 1484     &mov    (&DWP(16*6+0,"esp"),0x87);  # compose the magic constant
 1485     &mov    (&DWP(16*6+4,"esp"),0);
 1486     &mov    (&DWP(16*6+8,"esp"),1);
 1487     &mov    (&DWP(16*6+12,"esp"),0);
 1488     &mov    (&DWP(16*7+0,"esp"),$len);  # save original $len
 1489     &mov    (&DWP(16*7+4,"esp"),$key_); # save original %esp
 1490 
 1491     &mov    ($rounds,&DWP(240,$key));   # key1->rounds
 1492     &mov    ($key_,$key);           # backup $key
 1493     &mov    ($rounds_,$rounds);     # backup $rounds
 1494 
 1495     &movdqa ($tweak,$inout0);
 1496     &pxor   ($twtmp,$twtmp);
 1497     &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
 1498     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1499 
 1500     &and    ($len,-16);
 1501     &sub    ($len,16*6);
 1502     &jc (&label("xts_dec_short"));
 1503 
 1504     &shl    ($rounds,4);
 1505     &mov    ($rounds_,16);
 1506     &sub    ($rounds_,$rounds);
 1507     &lea    ($key,&DWP(32,$key,$rounds));
 1508     &jmp    (&label("xts_dec_loop6"));
 1509 
 1510 &set_label("xts_dec_loop6",16);
 1511     for ($i=0;$i<4;$i++) {
 1512         &pshufd ($twres,$twtmp,0x13);
 1513         &pxor   ($twtmp,$twtmp);
 1514         &movdqa (&QWP(16*$i,"esp"),$tweak);
 1515         &paddq  ($tweak,$tweak);    # &psllq($tweak,1);
 1516         &pand   ($twres,$twmask);   # isolate carry and residue
 1517         &pcmpgtd    ($twtmp,$tweak);    # broadcast upper bits
 1518         &pxor   ($tweak,$twres);
 1519     }
 1520     &pshufd ($inout5,$twtmp,0x13);
 1521     &movdqa (&QWP(16*$i++,"esp"),$tweak);
 1522     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1523      &$movekey  ($rndkey0,&QWP(0,$key_));
 1524     &pand   ($inout5,$twmask);      # isolate carry and residue
 1525      &movups    ($inout0,&QWP(0,$inp)); # load input
 1526     &pxor   ($inout5,$tweak);
 1527 
 1528     # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
 1529     &mov    ($rounds,$rounds_);
 1530     &movdqu ($inout1,&QWP(16*1,$inp));
 1531      &xorps     ($inout0,$rndkey0); # input^=rndkey[0]
 1532     &movdqu ($inout2,&QWP(16*2,$inp));
 1533      &pxor      ($inout1,$rndkey0);
 1534     &movdqu ($inout3,&QWP(16*3,$inp));
 1535      &pxor      ($inout2,$rndkey0);
 1536     &movdqu ($inout4,&QWP(16*4,$inp));
 1537      &pxor      ($inout3,$rndkey0);
 1538     &movdqu ($rndkey1,&QWP(16*5,$inp));
 1539      &pxor      ($inout4,$rndkey0);
 1540     &lea    ($inp,&DWP(16*6,$inp));
 1541     &pxor   ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1542     &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
 1543     &pxor   ($inout5,$rndkey1);
 1544 
 1545      &$movekey  ($rndkey1,&QWP(16,$key_));
 1546     &pxor   ($inout1,&QWP(16*1,"esp"));
 1547     &pxor   ($inout2,&QWP(16*2,"esp"));
 1548      &aesdec    ($inout0,$rndkey1);
 1549     &pxor   ($inout3,&QWP(16*3,"esp"));
 1550     &pxor   ($inout4,&QWP(16*4,"esp"));
 1551      &aesdec    ($inout1,$rndkey1);
 1552     &pxor       ($inout5,$rndkey0);
 1553      &$movekey  ($rndkey0,&QWP(32,$key_));
 1554      &aesdec    ($inout2,$rndkey1);
 1555      &aesdec    ($inout3,$rndkey1);
 1556      &aesdec    ($inout4,$rndkey1);
 1557      &aesdec    ($inout5,$rndkey1);
 1558     &call       (&label("_aesni_decrypt6_enter"));
 1559 
 1560     &movdqa ($tweak,&QWP(16*5,"esp"));  # last tweak
 1561        &pxor    ($twtmp,$twtmp);
 1562     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1563        &pcmpgtd ($twtmp,$tweak);        # broadcast upper bits
 1564     &xorps  ($inout1,&QWP(16*1,"esp"));
 1565     &movups (&QWP(16*0,$out),$inout0);  # write output
 1566     &xorps  ($inout2,&QWP(16*2,"esp"));
 1567     &movups (&QWP(16*1,$out),$inout1);
 1568     &xorps  ($inout3,&QWP(16*3,"esp"));
 1569     &movups (&QWP(16*2,$out),$inout2);
 1570     &xorps  ($inout4,&QWP(16*4,"esp"));
 1571     &movups (&QWP(16*3,$out),$inout3);
 1572     &xorps  ($inout5,$tweak);
 1573     &movups (&QWP(16*4,$out),$inout4);
 1574        &pshufd  ($twres,$twtmp,0x13);
 1575     &movups (&QWP(16*5,$out),$inout5);
 1576     &lea    ($out,&DWP(16*6,$out));
 1577        &movdqa  ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
 1578 
 1579     &pxor   ($twtmp,$twtmp);
 1580     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1581     &pand   ($twres,$twmask);       # isolate carry and residue
 1582     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1583     &pxor   ($tweak,$twres);
 1584 
 1585     &sub    ($len,16*6);
 1586     &jnc    (&label("xts_dec_loop6"));
 1587 
 1588     &mov    ($rounds,&DWP(240,$key_));  # restore $rounds
 1589     &mov    ($key,$key_);           # restore $key
 1590     &mov    ($rounds_,$rounds);
 1591 
 1592 &set_label("xts_dec_short");
 1593     &add    ($len,16*6);
 1594     &jz (&label("xts_dec_done6x"));
 1595 
 1596     &movdqa ($inout3,$tweak);       # put aside previous tweak
 1597     &cmp    ($len,0x20);
 1598     &jb (&label("xts_dec_one"));
 1599 
 1600     &pshufd ($twres,$twtmp,0x13);
 1601     &pxor   ($twtmp,$twtmp);
 1602     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1603     &pand   ($twres,$twmask);       # isolate carry and residue
 1604     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1605     &pxor   ($tweak,$twres);
 1606     &je (&label("xts_dec_two"));
 1607 
 1608     &pshufd ($twres,$twtmp,0x13);
 1609     &pxor   ($twtmp,$twtmp);
 1610     &movdqa ($inout4,$tweak);       # put aside previous tweak
 1611     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1612     &pand   ($twres,$twmask);       # isolate carry and residue
 1613     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1614     &pxor   ($tweak,$twres);
 1615     &cmp    ($len,0x40);
 1616     &jb (&label("xts_dec_three"));
 1617 
 1618     &pshufd ($twres,$twtmp,0x13);
 1619     &pxor   ($twtmp,$twtmp);
 1620     &movdqa ($inout5,$tweak);       # put aside previous tweak
 1621     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1622     &pand   ($twres,$twmask);       # isolate carry and residue
 1623     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1624     &pxor   ($tweak,$twres);
 1625     &movdqa (&QWP(16*0,"esp"),$inout3);
 1626     &movdqa (&QWP(16*1,"esp"),$inout4);
 1627     &je (&label("xts_dec_four"));
 1628 
 1629     &movdqa (&QWP(16*2,"esp"),$inout5);
 1630     &pshufd ($inout5,$twtmp,0x13);
 1631     &movdqa (&QWP(16*3,"esp"),$tweak);
 1632     &paddq  ($tweak,$tweak);        # &psllq($inout0,1);
 1633     &pand   ($inout5,$twmask);      # isolate carry and residue
 1634     &pxor   ($inout5,$tweak);
 1635 
 1636     &movdqu ($inout0,&QWP(16*0,$inp));  # load input
 1637     &movdqu ($inout1,&QWP(16*1,$inp));
 1638     &movdqu ($inout2,&QWP(16*2,$inp));
 1639     &pxor   ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1640     &movdqu ($inout3,&QWP(16*3,$inp));
 1641     &pxor   ($inout1,&QWP(16*1,"esp"));
 1642     &movdqu ($inout4,&QWP(16*4,$inp));
 1643     &pxor   ($inout2,&QWP(16*2,"esp"));
 1644     &lea    ($inp,&DWP(16*5,$inp));
 1645     &pxor   ($inout3,&QWP(16*3,"esp"));
 1646     &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
 1647     &pxor   ($inout4,$inout5);
 1648 
 1649     &call   ("_aesni_decrypt6");
 1650 
 1651     &movaps ($tweak,&QWP(16*4,"esp"));  # last tweak
 1652     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1653     &xorps  ($inout1,&QWP(16*1,"esp"));
 1654     &xorps  ($inout2,&QWP(16*2,"esp"));
 1655     &movups (&QWP(16*0,$out),$inout0);  # write output
 1656     &xorps  ($inout3,&QWP(16*3,"esp"));
 1657     &movups (&QWP(16*1,$out),$inout1);
 1658     &xorps  ($inout4,$tweak);
 1659     &movups (&QWP(16*2,$out),$inout2);
 1660     &movups (&QWP(16*3,$out),$inout3);
 1661     &movups (&QWP(16*4,$out),$inout4);
 1662     &lea    ($out,&DWP(16*5,$out));
 1663     &jmp    (&label("xts_dec_done"));
 1664 
 1665 &set_label("xts_dec_one",16);
 1666     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1667     &lea    ($inp,&DWP(16*1,$inp));
 1668     &xorps  ($inout0,$inout3);      # input^=tweak
 1669     if ($inline)
 1670     {   &aesni_inline_generate1("dec"); }
 1671     else
 1672     {   &call   ("_aesni_decrypt1");    }
 1673     &xorps  ($inout0,$inout3);      # output^=tweak
 1674     &movups (&QWP(16*0,$out),$inout0);  # write output
 1675     &lea    ($out,&DWP(16*1,$out));
 1676 
 1677     &movdqa ($tweak,$inout3);       # last tweak
 1678     &jmp    (&label("xts_dec_done"));
 1679 
 1680 &set_label("xts_dec_two",16);
 1681     &movaps ($inout4,$tweak);       # put aside last tweak
 1682 
 1683     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1684     &movups ($inout1,&QWP(16*1,$inp));
 1685     &lea    ($inp,&DWP(16*2,$inp));
 1686     &xorps  ($inout0,$inout3);      # input^=tweak
 1687     &xorps  ($inout1,$inout4);
 1688 
 1689     &call   ("_aesni_decrypt2");
 1690 
 1691     &xorps  ($inout0,$inout3);      # output^=tweak
 1692     &xorps  ($inout1,$inout4);
 1693     &movups (&QWP(16*0,$out),$inout0);  # write output
 1694     &movups (&QWP(16*1,$out),$inout1);
 1695     &lea    ($out,&DWP(16*2,$out));
 1696 
 1697     &movdqa ($tweak,$inout4);       # last tweak
 1698     &jmp    (&label("xts_dec_done"));
 1699 
 1700 &set_label("xts_dec_three",16);
 1701     &movaps ($inout5,$tweak);       # put aside last tweak
 1702     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1703     &movups ($inout1,&QWP(16*1,$inp));
 1704     &movups ($inout2,&QWP(16*2,$inp));
 1705     &lea    ($inp,&DWP(16*3,$inp));
 1706     &xorps  ($inout0,$inout3);      # input^=tweak
 1707     &xorps  ($inout1,$inout4);
 1708     &xorps  ($inout2,$inout5);
 1709 
 1710     &call   ("_aesni_decrypt3");
 1711 
 1712     &xorps  ($inout0,$inout3);      # output^=tweak
 1713     &xorps  ($inout1,$inout4);
 1714     &xorps  ($inout2,$inout5);
 1715     &movups (&QWP(16*0,$out),$inout0);  # write output
 1716     &movups (&QWP(16*1,$out),$inout1);
 1717     &movups (&QWP(16*2,$out),$inout2);
 1718     &lea    ($out,&DWP(16*3,$out));
 1719 
 1720     &movdqa ($tweak,$inout5);       # last tweak
 1721     &jmp    (&label("xts_dec_done"));
 1722 
 1723 &set_label("xts_dec_four",16);
 1724     &movaps ($inout4,$tweak);       # put aside last tweak
 1725 
 1726     &movups ($inout0,&QWP(16*0,$inp));  # load input
 1727     &movups ($inout1,&QWP(16*1,$inp));
 1728     &movups ($inout2,&QWP(16*2,$inp));
 1729     &xorps  ($inout0,&QWP(16*0,"esp")); # input^=tweak
 1730     &movups ($inout3,&QWP(16*3,$inp));
 1731     &lea    ($inp,&DWP(16*4,$inp));
 1732     &xorps  ($inout1,&QWP(16*1,"esp"));
 1733     &xorps  ($inout2,$inout5);
 1734     &xorps  ($inout3,$inout4);
 1735 
 1736     &call   ("_aesni_decrypt4");
 1737 
 1738     &xorps  ($inout0,&QWP(16*0,"esp")); # output^=tweak
 1739     &xorps  ($inout1,&QWP(16*1,"esp"));
 1740     &xorps  ($inout2,$inout5);
 1741     &movups (&QWP(16*0,$out),$inout0);  # write output
 1742     &xorps  ($inout3,$inout4);
 1743     &movups (&QWP(16*1,$out),$inout1);
 1744     &movups (&QWP(16*2,$out),$inout2);
 1745     &movups (&QWP(16*3,$out),$inout3);
 1746     &lea    ($out,&DWP(16*4,$out));
 1747 
 1748     &movdqa ($tweak,$inout4);       # last tweak
 1749     &jmp    (&label("xts_dec_done"));
 1750 
 1751 &set_label("xts_dec_done6x",16);        # $tweak is pre-calculated
 1752     &mov    ($len,&DWP(16*7+0,"esp"));  # restore original $len
 1753     &and    ($len,15);
 1754     &jz (&label("xts_dec_ret"));
 1755     &mov    (&DWP(16*7+0,"esp"),$len);  # save $len%16
 1756     &jmp    (&label("xts_dec_only_one_more"));
 1757 
 1758 &set_label("xts_dec_done",16);
 1759     &mov    ($len,&DWP(16*7+0,"esp"));  # restore original $len
 1760     &pxor   ($twtmp,$twtmp);
 1761     &and    ($len,15);
 1762     &jz (&label("xts_dec_ret"));
 1763 
 1764     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1765     &mov    (&DWP(16*7+0,"esp"),$len);  # save $len%16
 1766     &pshufd ($twres,$twtmp,0x13);
 1767     &pxor   ($twtmp,$twtmp);
 1768     &movdqa ($twmask,&QWP(16*6,"esp"));
 1769     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1770     &pand   ($twres,$twmask);       # isolate carry and residue
 1771     &pcmpgtd($twtmp,$tweak);        # broadcast upper bits
 1772     &pxor   ($tweak,$twres);
 1773 
 1774 &set_label("xts_dec_only_one_more");
 1775     &pshufd ($inout3,$twtmp,0x13);
 1776     &movdqa ($inout4,$tweak);       # put aside previous tweak
 1777     &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
 1778     &pand   ($inout3,$twmask);      # isolate carry and residue
 1779     &pxor   ($inout3,$tweak);
 1780 
 1781     &mov    ($key,$key_);           # restore $key
 1782     &mov    ($rounds,$rounds_);     # restore $rounds
 1783 
 1784     &movups ($inout0,&QWP(0,$inp));     # load input
 1785     &xorps  ($inout0,$inout3);      # input^=tweak
 1786     if ($inline)
 1787     {   &aesni_inline_generate1("dec"); }
 1788     else
 1789     {   &call   ("_aesni_decrypt1");    }
 1790     &xorps  ($inout0,$inout3);      # output^=tweak
 1791     &movups (&QWP(0,$out),$inout0);     # write output
 1792 
 1793 &set_label("xts_dec_steal");
 1794     &movz   ($rounds,&BP(16,$inp));
 1795     &movz   ($key,&BP(0,$out));
 1796     &lea    ($inp,&DWP(1,$inp));
 1797     &mov    (&BP(0,$out),&LB($rounds));
 1798     &mov    (&BP(16,$out),&LB($key));
 1799     &lea    ($out,&DWP(1,$out));
 1800     &sub    ($len,1);
 1801     &jnz    (&label("xts_dec_steal"));
 1802 
 1803     &sub    ($out,&DWP(16*7+0,"esp"));  # rewind $out
 1804     &mov    ($key,$key_);           # restore $key
 1805     &mov    ($rounds,$rounds_);     # restore $rounds
 1806 
 1807     &movups ($inout0,&QWP(0,$out));     # load input
 1808     &xorps  ($inout0,$inout4);      # input^=tweak
 1809     if ($inline)
 1810     {   &aesni_inline_generate1("dec"); }
 1811     else
 1812     {   &call   ("_aesni_decrypt1");    }
 1813     &xorps  ($inout0,$inout4);      # output^=tweak
 1814     &movups (&QWP(0,$out),$inout0);     # write output
 1815 
 1816 &set_label("xts_dec_ret");
 1817     &pxor   ("xmm0","xmm0");        # clear register bank
 1818     &pxor   ("xmm1","xmm1");
 1819     &pxor   ("xmm2","xmm2");
 1820     &movdqa (&QWP(16*0,"esp"),"xmm0");  # clear stack
 1821     &pxor   ("xmm3","xmm3");
 1822     &movdqa (&QWP(16*1,"esp"),"xmm0");
 1823     &pxor   ("xmm4","xmm4");
 1824     &movdqa (&QWP(16*2,"esp"),"xmm0");
 1825     &pxor   ("xmm5","xmm5");
 1826     &movdqa (&QWP(16*3,"esp"),"xmm0");
 1827     &pxor   ("xmm6","xmm6");
 1828     &movdqa (&QWP(16*4,"esp"),"xmm0");
 1829     &pxor   ("xmm7","xmm7");
 1830     &movdqa (&QWP(16*5,"esp"),"xmm0");
 1831     &mov    ("esp",&DWP(16*7+4,"esp")); # restore %esp
 1832 &function_end("aesni_xts_decrypt");
 1833 }
 1834 }
 1835 
 1836 ######################################################################
 1837 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 1838 #                           size_t length, const AES_KEY *key,
 1839 #                           unsigned char *ivp,const int enc);
 1840 &function_begin("${PREFIX}_cbc_encrypt");
      # Emits the CBC routine whose C prototype is given in the comment just
      # above: the arguments are fetched with &wparam(0..5) as inp, out,
      # length, key, ivp and enc, in that order.
      #
      #  - length == 0 : jumps straight to cbc_abort; the IV is not written.
      #  - enc != 0    : encrypt path, one block per iteration, chained
      #                  through $inout0 (cbc_enc_loop / cbc_enc_tail).
      #  - enc == 0    : decrypt path (cbc_decrypt), six blocks per iteration
      #                  followed by a 1..5 block tail.
      #
      # Stack frame built below: %esp is lowered by at least 24 bytes and
      # rounded down to a 16-byte boundary.  0(%esp) is a 16-byte scratch
      # slot (saved IV / staging for a partial block) and 16(%esp) holds the
      # caller's %esp.  Once the arguments have been read, $key_ and $rounds_
      # serve as backups of $key and $rounds; the "restore" moves after each
      # cipher call reload them from those backups.
 1841     &mov    ($inp,&wparam(0));      # inp
 1842     &mov    ($rounds_,"esp");       # start computing the new frame pointer
 1843     &mov    ($out,&wparam(1));      # out
 1844     &sub    ($rounds_,24);          # room for scratch slot + saved %esp
 1845     &mov    ($len,&wparam(2));      # length
 1846     &and    ($rounds_,-16);         # 16-byte align (slot is accessed with movaps/movdqa)
 1847     &mov    ($key,&wparam(3));      # key
 1848     &mov    ($key_,&wparam(4));     # ivp, held in $key_ only until the IV is loaded
 1849     &test   ($len,$len);
 1850     &jz (&label("cbc_abort"));      # nothing to process
 1851 
 1852     &cmp    (&wparam(5),0);         # enc == 0 ?  result is consumed by the je below;
          # only moves/xchg are issued in between.
          # NOTE(review): presumably &wparam is %esp-relative, which would be why
          # this read happens before the xchg -- confirm in the perlasm helper.
 1853     &xchg   ($rounds_,"esp");       # alloca
 1854     &movups ($ivec,&QWP(0,$key_));      # load IV
 1855     &mov    ($rounds,&DWP(240,$key));   # dword at byte offset 240 of the key structure
 1856     &mov    ($key_,$key);           # backup $key
 1857     &mov    (&DWP(16,"esp"),$rounds_);  # save original %esp
 1858     &mov    ($rounds_,$rounds);     # backup $rounds
 1859     &je (&label("cbc_decrypt"));
 1860 
          # ---- encrypt path: $inout0 carries the chaining value ----
 1861     &movaps ($inout0,$ivec);
 1862     &cmp    ($len,16);
 1863     &jb (&label("cbc_enc_tail"));   # less than one full block: pad it first
 1864     &sub    ($len,16);              # pre-decrement so the loop can test the borrow
 1865     &jmp    (&label("cbc_enc_loop"));
 1866 
 1867 &set_label("cbc_enc_loop",16);
 1868     &movups ($ivec,&QWP(0,$inp));       # input actually
 1869     &lea    ($inp,&DWP(16,$inp));
          # Either branch encrypts ($inout0 ^ input block) in place in $inout0.
          # NOTE(review): the inline generator receives $inout0 and $ivec and
          # presumably performs the same xor itself -- helper is not in view.
 1870     if ($inline)
 1871     {   &aesni_inline_generate1("enc",$inout0,$ivec);   }
 1872     else
 1873     {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
 1874     &mov    ($rounds,$rounds_); # restore $rounds
 1875     &mov    ($key,$key_);       # restore $key
 1876     &movups (&QWP(0,$out),$inout0); # store output
 1877     &lea    ($out,&DWP(16,$out));
 1878     &sub    ($len,16);
 1879     &jnc    (&label("cbc_enc_loop"));   # no borrow: another full block remains
 1880     &add    ($len,16);                  # undo the last subtraction
 1881     &jnz    (&label("cbc_enc_tail"));   # 1..15 bytes left over
 1882     &movaps ($ivec,$inout0);            # last ciphertext block becomes the output IV
 1883     &pxor   ($inout0,$inout0);          # zero the register
 1884     &jmp    (&label("cbc_ret"));
 1885 
      # Partial final block: copy the $len leftover bytes to the output,
      # zero-fill up to 16 bytes, then run the loop once more with $inp
      # pointing at that padded block in the output buffer.
 1886 &set_label("cbc_enc_tail");
 1887     &mov    ("ecx",$len);       # zaps $rounds
 1888     &data_word(0xA4F3F689);     # rep movsb
 1889     &mov    ("ecx",16);     # zero tail
 1890     &sub    ("ecx",$len);       # ecx = 16 - $len bytes of padding
 1891     &xor    ("eax","eax");      # zaps $len
 1892     &data_word(0xAAF3F689);     # rep stosb
 1893     &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
 1894     &mov    ($rounds,$rounds_); # restore $rounds
 1895     &mov    ($inp,$out);        # $inp and $out are the same
 1896     &mov    ($key,$key_);       # restore $key
 1897     &jmp    (&label("cbc_enc_loop"));   # $len is 0 now, so exactly one more pass
 1898 ######################################################################
      # ---- decrypt path ----
 1899 &set_label("cbc_decrypt",16);
 1900     &cmp    ($len,0x50);
 1901     &jbe    (&label("cbc_dec_tail"));   # at most 0x50 bytes: tail code only
 1902     &movaps (&QWP(0,"esp"),$ivec);      # save IV
 1903     &sub    ($len,0x50);                # bias $len for the ja/jle tests below
 1904     &jmp    (&label("cbc_dec_loop6_enter"));
 1905 
      # Six blocks per iteration.  The sixth output block ($inout5) of an
      # iteration is stored here, at the top of the next one.
 1906 &set_label("cbc_dec_loop6",16);
 1907     &movaps (&QWP(0,"esp"),$rndkey0);   # save IV
 1908     &movups (&QWP(0,$out),$inout5);
 1909     &lea    ($out,&DWP(0x10,$out));
 1910 &set_label("cbc_dec_loop6_enter");
 1911     &movdqu ($inout0,&QWP(0,$inp));     # load six input blocks
 1912     &movdqu ($inout1,&QWP(0x10,$inp));
 1913     &movdqu ($inout2,&QWP(0x20,$inp));
 1914     &movdqu ($inout3,&QWP(0x30,$inp));
 1915     &movdqu ($inout4,&QWP(0x40,$inp));
 1916     &movdqu ($inout5,&QWP(0x50,$inp));
 1917 
 1918     &call   ("_aesni_decrypt6");
 1919 
          # Re-read the input blocks as chaining values: block 0 is xor-ed with
          # the saved IV, block i with input block i-1.  $rndkey0/$rndkey1 are
          # used as temporaries for those reloads.
 1920     &movups ($rndkey1,&QWP(0,$inp));
 1921     &movups ($rndkey0,&QWP(0x10,$inp));
 1922     &xorps  ($inout0,&QWP(0,"esp"));    # ^=IV
 1923     &xorps  ($inout1,$rndkey1);
 1924     &movups ($rndkey1,&QWP(0x20,$inp));
 1925     &xorps  ($inout2,$rndkey0);
 1926     &movups ($rndkey0,&QWP(0x30,$inp));
 1927     &xorps  ($inout3,$rndkey1);
 1928     &movups ($rndkey1,&QWP(0x40,$inp));
 1929     &xorps  ($inout4,$rndkey0);
 1930     &movups ($rndkey0,&QWP(0x50,$inp)); # IV
 1931     &xorps  ($inout5,$rndkey1);
 1932     &movups (&QWP(0,$out),$inout0);     # store five of the six results now
 1933     &movups (&QWP(0x10,$out),$inout1);
 1934     &lea    ($inp,&DWP(0x60,$inp));
 1935     &movups (&QWP(0x20,$out),$inout2);
 1936     &mov    ($rounds,$rounds_);     # restore $rounds
 1937     &movups (&QWP(0x30,$out),$inout3);
 1938     &mov    ($key,$key_);           # restore $key
 1939     &movups (&QWP(0x40,$out),$inout4);
 1940     &lea    ($out,&DWP(0x50,$out));     # $out advances by five blocks only
 1941     &sub    ($len,0x60);
 1942     &ja (&label("cbc_dec_loop6"));      # more than 0x50 bytes still remain
 1943 
 1944     &movaps ($inout0,$inout5);          # pending sixth block moves to $inout0
 1945     &movaps ($ivec,$rndkey0);           # last input block read is the running IV
 1946     &add    ($len,0x50);                # remove the bias applied before the loop
 1947     &jle    (&label("cbc_dec_clear_tail_collected"));   # no input left
 1948     &movups (&QWP(0,$out),$inout0);     # store the pending block, then do the tail
 1949     &lea    ($out,&DWP(0x10,$out));
      # Tail: up to 0x50 bytes remain.  Blocks are loaded one at a time and
      # $len is compared against 0x10/0x20/0x30/0x40 to pick the 1..4 block
      # handlers; anything larger falls through to the five-block code.
      # $in0/$in1 keep copies of the first two input blocks for chaining.
 1950 &set_label("cbc_dec_tail");
 1951     &movups ($inout0,&QWP(0,$inp));
 1952     &movaps ($in0,$inout0);
 1953     &cmp    ($len,0x10);
 1954     &jbe    (&label("cbc_dec_one"));
 1955 
 1956     &movups ($inout1,&QWP(0x10,$inp));
 1957     &movaps ($in1,$inout1);
 1958     &cmp    ($len,0x20);
 1959     &jbe    (&label("cbc_dec_two"));
 1960 
 1961     &movups ($inout2,&QWP(0x20,$inp));
 1962     &cmp    ($len,0x30);
 1963     &jbe    (&label("cbc_dec_three"));
 1964 
 1965     &movups ($inout3,&QWP(0x30,$inp));
 1966     &cmp    ($len,0x40);
 1967     &jbe    (&label("cbc_dec_four"));
 1968 
          # Five blocks: reuse the six-block routine with a zeroed sixth lane.
 1969     &movups ($inout4,&QWP(0x40,$inp));
 1970     &movaps (&QWP(0,"esp"),$ivec);      # save IV
 1971     &movups ($inout0,&QWP(0,$inp));     # same load as at the top of cbc_dec_tail
 1972     &xorps  ($inout5,$inout5);          # unused sixth lane
 1973     &call   ("_aesni_decrypt6");
 1974     &movups ($rndkey1,&QWP(0,$inp));    # reload input blocks as chaining values
 1975     &movups ($rndkey0,&QWP(0x10,$inp));
 1976     &xorps  ($inout0,&QWP(0,"esp"));    # ^= IV
 1977     &xorps  ($inout1,$rndkey1);
 1978     &movups ($rndkey1,&QWP(0x20,$inp));
 1979     &xorps  ($inout2,$rndkey0);
 1980     &movups ($rndkey0,&QWP(0x30,$inp));
 1981     &xorps  ($inout3,$rndkey1);
 1982     &movups ($ivec,&QWP(0x40,$inp));    # IV
 1983     &xorps  ($inout4,$rndkey0);
 1984     &movups (&QWP(0,$out),$inout0);     # store the first four results
 1985     &movups (&QWP(0x10,$out),$inout1);
 1986     &pxor   ($inout1,$inout1);          # zero each register once it is stored
 1987     &movups (&QWP(0x20,$out),$inout2);
 1988     &pxor   ($inout2,$inout2);
 1989     &movups (&QWP(0x30,$out),$inout3);
 1990     &pxor   ($inout3,$inout3);
 1991     &lea    ($out,&DWP(0x40,$out));
 1992     &movaps ($inout0,$inout4);          # last block is handed over in $inout0
 1993     &pxor   ($inout4,$inout4);
 1994     &sub    ($len,0x50);
 1995     &jmp    (&label("cbc_dec_tail_collected"));
 1996 
      # Each handler below leaves the final output block in $inout0 (not yet
      # stored), the next IV in $ivec, and subtracts the handled byte count
      # from $len before joining cbc_dec_tail_collected.
 1997 &set_label("cbc_dec_one",16);
 1998     if ($inline)
 1999     {   &aesni_inline_generate1("dec"); }
 2000     else
 2001     {   &call   ("_aesni_decrypt1");    }
 2002     &xorps  ($inout0,$ivec);            # ^= IV
 2003     &movaps ($ivec,$in0);               # input block 0 is the next IV
 2004     &sub    ($len,0x10);
 2005     &jmp    (&label("cbc_dec_tail_collected"));
 2006 
 2007 &set_label("cbc_dec_two",16);
 2008     &call   ("_aesni_decrypt2");
 2009     &xorps  ($inout0,$ivec);            # ^= IV
 2010     &xorps  ($inout1,$in0);             # ^= input block 0
 2011     &movups (&QWP(0,$out),$inout0);
 2012     &movaps ($inout0,$inout1);          # last block is handed over in $inout0
 2013     &pxor   ($inout1,$inout1);
 2014     &lea    ($out,&DWP(0x10,$out));
 2015     &movaps ($ivec,$in1);               # input block 1 is the next IV
 2016     &sub    ($len,0x20);
 2017     &jmp    (&label("cbc_dec_tail_collected"));
 2018 
 2019 &set_label("cbc_dec_three",16);
 2020     &call   ("_aesni_decrypt3");
 2021     &xorps  ($inout0,$ivec);            # ^= IV
 2022     &xorps  ($inout1,$in0);             # ^= input block 0
 2023     &xorps  ($inout2,$in1);             # ^= input block 1
 2024     &movups (&QWP(0,$out),$inout0);
 2025     &movaps ($inout0,$inout2);          # last block is handed over in $inout0
 2026     &pxor   ($inout2,$inout2);
 2027     &movups (&QWP(0x10,$out),$inout1);
 2028     &pxor   ($inout1,$inout1);
 2029     &lea    ($out,&DWP(0x20,$out));
 2030     &movups ($ivec,&QWP(0x20,$inp));    # input block 2 is the next IV
 2031     &sub    ($len,0x30);
 2032     &jmp    (&label("cbc_dec_tail_collected"));
 2033 
 2034 &set_label("cbc_dec_four",16);
 2035     &call   ("_aesni_decrypt4");
 2036     &movups ($rndkey1,&QWP(0x10,$inp)); # input blocks 1 and 2, reloaded for chaining
 2037     &movups ($rndkey0,&QWP(0x20,$inp));
 2038     &xorps  ($inout0,$ivec);            # ^= IV
 2039     &movups ($ivec,&QWP(0x30,$inp));    # input block 3 is the next IV
 2040     &xorps  ($inout1,$in0);             # ^= input block 0
 2041     &movups (&QWP(0,$out),$inout0);
 2042     &xorps  ($inout2,$rndkey1);
 2043     &movups (&QWP(0x10,$out),$inout1);
 2044     &pxor   ($inout1,$inout1);
 2045     &xorps  ($inout3,$rndkey0);
 2046     &movups (&QWP(0x20,$out),$inout2);
 2047     &pxor   ($inout2,$inout2);
 2048     &lea    ($out,&DWP(0x30,$out));
 2049     &movaps ($inout0,$inout3);          # last block is handed over in $inout0
 2050     &pxor   ($inout3,$inout3);
 2051     &sub    ($len,0x40);
 2052     &jmp    (&label("cbc_dec_tail_collected"));
 2053 
      # Entry used when the six-block loop consumed all input: zero the
      # lanes that the tail handlers would otherwise have zeroed.
 2054 &set_label("cbc_dec_clear_tail_collected",16);
 2055     &pxor   ($inout1,$inout1);
 2056     &pxor   ($inout2,$inout2);
 2057     &pxor   ($inout3,$inout3);
 2058     &pxor   ($inout4,$inout4);
      # $inout0 holds the last output block.  If the low four bits of $len are
      # zero it is stored whole; otherwise the partial-store path is taken.
 2059 &set_label("cbc_dec_tail_collected");
 2060     &and    ($len,15);
 2061     &jnz    (&label("cbc_dec_tail_partial"));
 2062     &movups (&QWP(0,$out),$inout0);
 2063     &pxor   ($rndkey0,$rndkey0);        # zero the register
 2064     &jmp    (&label("cbc_ret"));
 2065 
      # Stage the block in the stack scratch slot and copy
      # ecx = 16 - ($len & 15) bytes of it to $out with rep movsb.
 2066 &set_label("cbc_dec_tail_partial",16);
 2067     &movaps (&QWP(0,"esp"),$inout0);
 2068     &pxor   ($rndkey0,$rndkey0);        # zero the register
 2069     &mov    ("ecx",16);
 2070     &mov    ($inp,"esp");               # copy source = scratch slot
 2071     &sub    ("ecx",$len);
 2072     &data_word(0xA4F3F689);     # rep movsb
          # NOTE(review): this writes $inout0 to the scratch slot a second time;
          # $inout0 is only zeroed below at cbc_ret -- confirm the intent.
 2073     &movdqa (&QWP(0,"esp"),$inout0);
 2074 
      # Common exit: leave the aligned frame, zero working registers and
      # write the running IV back through the ivp argument.
 2075 &set_label("cbc_ret");
 2076     &mov    ("esp",&DWP(16,"esp")); # pull original %esp
 2077     &mov    ($key_,&wparam(4));     # ivp again ($key_ was reused as the key backup)
 2078     &pxor   ($inout0,$inout0);
 2079     &pxor   ($rndkey1,$rndkey1);
 2080     &movups (&QWP(0,$key_),$ivec);  # output IV
 2081     &pxor   ($ivec,$ivec);
 2082 &set_label("cbc_abort");            # zero-length entry lands here: frame never built, IV untouched
 2083 &function_end("${PREFIX}_cbc_encrypt");
 2084 
 2085 ######################################################################
 2086 # Mechanical port from aesni-x86_64.pl.
 2087 #
 2088 # _aesni_set_encrypt_key is private interface,
 2089 # input:
 2090 #   "eax"   const unsigned char *userKey
 2091 #   $rounds int bits
 2092 #   $key    AES_KEY *key
 2093 # output:
 2094 #   "eax"   return code
 2095 #   $rounds rounds
 2096 
 2097 &function_begin_B("_aesni_set_encrypt_key");
 2098     &push   ("ebp");
 2099     &push   ("ebx");
 2100     &test   ("eax","eax");
 2101     &jz (&label("bad_pointer"));
 2102     &test   ($key,$key);
 2103     &jz (&label("bad_pointer"));
 2104 
 2105     &call   (&label("pic"));
 2106 &set_label("pic");
 2107     &blindpop("ebx");
 2108     &lea    ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
 2109 
 2110     &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
 2111     &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
 2112     &xorps  ("xmm4","xmm4");    # low dword of xmm4 is assumed 0
 2113     &mov    ("ebp",&DWP(4,"ebp"));
 2114     &lea    ($key,&DWP(16,$key));
 2115     &and    ("ebp",1<<28|1<<11);    # AVX and XOP bits
 2116     &cmp    ($rounds,256);
 2117     &je (&label("14rounds"));
 2118     &cmp    ($rounds,192);
 2119     &je (&label("12rounds"));
 2120     &cmp    ($rounds,128);
 2121     &jne    (&label("bad_keybits"));
 2122 
 2123 &set_label("10rounds",16);
 2124     &cmp        ("ebp",1<<28);
 2125     &je     (&label("10rounds_alt"));
 2126 
 2127     &mov        ($rounds,9);
 2128     &$movekey   (&QWP(-16,$key),"xmm0");    # round 0
 2129     &aeskeygenassist("xmm1","xmm0",0x01);       # round 1
 2130     &call       (&label("key_128_cold"));
 2131     &aeskeygenassist("xmm1","xmm0",0x2);        # round 2
 2132     &call       (&label("key_128"));
 2133     &aeskeygenassist("xmm1","xmm0",0x04);       # round 3
 2134     &call       (&label("key_128"));
 2135     &aeskeygenassist("xmm1","xmm0",0x08);       # round 4
 2136     &call       (&label("key_128"));
 2137     &aeskeygenassist("xmm1","xmm0",0x10);       # round 5
 2138     &call       (&label("key_128"));
 2139     &aeskeygenassist("xmm1","xmm0",0x20);       # round 6
 2140     &call       (&label("key_128"));
 2141     &aeskeygenassist("xmm1","xmm0",0x40);       # round 7
 2142     &call       (&label("key_128"));
 2143     &aeskeygenassist("xmm1","xmm0",0x80);       # round 8
 2144     &call       (&label("key_128"));
 2145     &aeskeygenassist("xmm1","xmm0",0x1b);       # round 9
 2146     &call       (&label("key_128"));
 2147     &aeskeygenassist("xmm1","xmm0",0x36);       # round 10
 2148     &call       (&label("key_128"));
 2149     &$movekey   (&QWP(0,$key),"xmm0");
 2150     &mov        (&DWP(80,$key),$rounds);
 2151 
 2152     &jmp    (&label("good_key"));
 2153 
 2154 &set_label("key_128",16);
 2155     &$movekey   (&QWP(0,$key),"xmm0");
 2156     &lea        ($key,&DWP(16,$key));
 2157 &set_label("key_128_cold");
 2158     &shufps     ("xmm4","xmm0",0b00010000);
 2159     &xorps      ("xmm0","xmm4");
 2160     &shufps     ("xmm4","xmm0",0b10001100);
 2161     &xorps      ("xmm0","xmm4");
 2162     &shufps     ("xmm1","xmm1",0b11111111); # critical path
 2163     &xorps      ("xmm0","xmm1");
 2164     &ret();
 2165 
 2166 &set_label("10rounds_alt",16);
 2167     &movdqa     ("xmm5",&QWP(0x00,"ebx"));
 2168     &mov        ($rounds,8);
 2169     &movdqa     ("xmm4",&QWP(0x20,"ebx"));
 2170     &movdqa     ("xmm2","xmm0");
 2171     &movdqu     (&QWP(-16,$key),"xmm0");
 2172 
 2173 &set_label("loop_key128");
 2174     &pshufb     ("xmm0","xmm5");
 2175     &aesenclast ("xmm0","xmm4");
 2176     &pslld      ("xmm4",1);
 2177     &lea        ($key,&DWP(16,$key));
 2178 
 2179     &movdqa     ("xmm3","xmm2");
 2180     &pslldq     ("xmm2",4);
 2181     &pxor       ("xmm3","xmm2");
 2182     &pslldq     ("xmm2",4);
 2183     &pxor       ("xmm3","xmm2");
 2184     &pslldq     ("xmm2",4);
 2185     &pxor       ("xmm2","xmm3");
 2186 
 2187     &pxor       ("xmm0","xmm2");
 2188     &movdqu     (&QWP(-16,$key),"xmm0");
 2189     &movdqa     ("xmm2","xmm0");
 2190 
 2191     &dec        ($rounds);
 2192     &jnz        (&label("loop_key128"));
 2193 
 2194     &movdqa     ("xmm4",&QWP(0x30,"ebx"));
 2195 
 2196     &pshufb     ("xmm0","xmm5");
 2197     &aesenclast ("xmm0","xmm4");
 2198     &pslld      ("xmm4",1);
 2199 
 2200     &movdqa     ("xmm3","xmm2");
 2201     &pslldq     ("xmm2",4);
 2202     &pxor       ("xmm3","xmm2");
 2203     &pslldq     ("xmm2",4);
 2204     &pxor       ("xmm3","xmm2");
 2205     &pslldq     ("xmm2",4);
 2206     &pxor       ("xmm2","xmm3");
 2207 
 2208     &pxor       ("xmm0","xmm2");
 2209     &movdqu     (&QWP(0,$key),"xmm0");
 2210 
 2211     &movdqa     ("xmm2","xmm0");
 2212     &pshufb     ("xmm0","xmm5");
 2213     &aesenclast ("xmm0","xmm4");
 2214 
 2215     &movdqa     ("xmm3","xmm2");
 2216     &pslldq     ("xmm2",4);
 2217     &pxor       ("xmm3","xmm2");
 2218     &pslldq     ("xmm2",4);
 2219     &pxor       ("xmm3","xmm2");
 2220     &pslldq     ("xmm2",4);
 2221     &pxor       ("xmm2","xmm3");
 2222 
 2223     &pxor       ("xmm0","xmm2");
 2224     &movdqu     (&QWP(16,$key),"xmm0");
 2225 
 2226     &mov        ($rounds,9);
 2227     &mov        (&DWP(96,$key),$rounds);
 2228 
 2229     &jmp    (&label("good_key"));
 2230 
 2231 &set_label("12rounds",16);
 2232     &movq       ("xmm2",&QWP(16,"eax"));    # remaining 1/3 of *userKey
 2233     &cmp        ("ebp",1<<28);
 2234     &je     (&label("12rounds_alt"));
 2235 
 2236     &mov        ($rounds,11);
 2237     &$movekey   (&QWP(-16,$key),"xmm0");    # round 0
 2238     &aeskeygenassist("xmm1","xmm2",0x01);       # round 1,2
 2239     &call       (&label("key_192a_cold"));
 2240     &aeskeygenassist("xmm1","xmm2",0x02);       # round 2,3
 2241     &call       (&label("key_192b"));
 2242     &aeskeygenassist("xmm1","xmm2",0x04);       # round 4,5
 2243     &call       (&label("key_192a"));
 2244     &aeskeygenassist("xmm1","xmm2",0x08);       # round 5,6
 2245     &call       (&label("key_192b"));
 2246     &aeskeygenassist("xmm1","xmm2",0x10);       # round 7,8
 2247     &call       (&label("key_192a"));
 2248     &aeskeygenassist("xmm1","xmm2",0x20);       # round 8,9
 2249     &call       (&label("key_192b"));
 2250     &aeskeygenassist("xmm1","xmm2",0x40);       # round 10,11
 2251     &call       (&label("key_192a"));
 2252     &aeskeygenassist("xmm1","xmm2",0x80);       # round 11,12
 2253     &call       (&label("key_192b"));
 2254     &$movekey   (&QWP(0,$key),"xmm0");
 2255     &mov        (&DWP(48,$key),$rounds);
 2256 
 2257     &jmp    (&label("good_key"));
 2258 
 2259 &set_label("key_192a",16);
 2260     &$movekey   (&QWP(0,$key),"xmm0");
 2261     &lea        ($key,&DWP(16,$key));
 2262 &set_label("key_192a_cold",16);
 2263     &movaps     ("xmm5","xmm2");
 2264 &set_label("key_192b_warm");
 2265     &shufps     ("xmm4","xmm0",0b00010000);
 2266     &movdqa     ("xmm3","xmm2");
 2267     &xorps      ("xmm0","xmm4");
 2268     &shufps     ("xmm4","xmm0",0b10001100);
 2269     &pslldq     ("xmm3",4);
 2270     &xorps      ("xmm0","xmm4");
 2271     &pshufd     ("xmm1","xmm1",0b01010101); # critical path
 2272     &pxor       ("xmm2","xmm3");
 2273     &pxor       ("xmm0","xmm1");
 2274     &pshufd     ("xmm3","xmm0",0b11111111);
 2275     &pxor       ("xmm2","xmm3");
 2276     &ret();
 2277 
 2278 &set_label("key_192b",16);
 2279     &movaps     ("xmm3","xmm0");
 2280     &shufps     ("xmm5","xmm0",0b01000100);
 2281     &$movekey   (&QWP(0,$key),"xmm5");
 2282     &shufps     ("xmm3","xmm2",0b01001110);
 2283     &$movekey   (&QWP(16,$key),"xmm3");
 2284     &lea        ($key,&DWP(32,$key));
 2285     &jmp        (&label("key_192b_warm"));
 2286 
 2287 &set_label("12rounds_alt",16);
 2288     &movdqa     ("xmm5",&QWP(0x10,"ebx"));
 2289     &movdqa     ("xmm4",&QWP(0x20,"ebx"));
 2290     &mov        ($rounds,8);
 2291     &movdqu     (&QWP(-16,$key),"xmm0");
 2292 
 2293 &set_label("loop_key192");
 2294     &movq       (&QWP(0,$key),"xmm2");
 2295     &movdqa     ("xmm1","xmm2");
 2296     &pshufb     ("xmm2","xmm5");
 2297     &aesenclast ("xmm2","xmm4");
 2298     &pslld      ("xmm4",1);
 2299     &lea        ($key,&DWP(24,$key));
 2300 
 2301     &movdqa     ("xmm3","xmm0");
 2302     &pslldq     ("xmm0",4);
 2303     &pxor       ("xmm3","xmm0");
 2304     &pslldq     ("xmm0",4);
 2305     &pxor       ("xmm3","xmm0");
 2306     &pslldq     ("xmm0",4);
 2307     &pxor       ("xmm0","xmm3");
 2308 
 2309     &pshufd     ("xmm3","xmm0",0xff);
 2310     &pxor       ("xmm3","xmm1");
 2311     &pslldq     ("xmm1",4);
 2312     &pxor       ("xmm3","xmm1");
 2313 
 2314     &pxor       ("xmm0","xmm2");
 2315     &pxor       ("xmm2","xmm3");
 2316     &movdqu     (&QWP(-16,$key),"xmm0");
 2317 
 2318     &dec        ($rounds);
 2319     &jnz        (&label("loop_key192"));
 2320 
 2321     &mov    ($rounds,11);
 2322     &mov    (&DWP(32,$key),$rounds);
 2323 
 2324     &jmp    (&label("good_key"));
 2325 
 2326 &set_label("14rounds",16);    # 32-byte user key: schedule built with aeskeygenassist
 2327     &movups     ("xmm2",&QWP(16,"eax"));    # remaining half of *userKey
 2328     &lea        ($key,&DWP(16,$key));    # $key += 16
 2329     &cmp        ("ebp",1<<28);    # NOTE(review): ebp is loaded before this excerpt, presumably a CPU-capability word -- confirm
 2330     &je     (&label("14rounds_alt"));    # equal -> take the pshufb/aesenclast variant instead
 2331 
 2332     &mov        ($rounds,13);    # value written behind the schedule at the end of this path
 2333     &$movekey   (&QWP(-32,$key),"xmm0");    # round 0
 2334     &$movekey   (&QWP(-16,$key),"xmm2");    # round 1
 2335     &aeskeygenassist("xmm1","xmm2",0x01);       # round 2
 2336     &call       (&label("key_256a_cold"));    # cold entry: rounds 0/1 were already stored above
 2337     &aeskeygenassist("xmm1","xmm0",0x01);       # round 3
 2338     &call       (&label("key_256b"));
 2339     &aeskeygenassist("xmm1","xmm2",0x02);       # round 4
 2340     &call       (&label("key_256a"));
 2341     &aeskeygenassist("xmm1","xmm0",0x02);       # round 5
 2342     &call       (&label("key_256b"));
 2343     &aeskeygenassist("xmm1","xmm2",0x04);       # round 6
 2344     &call       (&label("key_256a"));
 2345     &aeskeygenassist("xmm1","xmm0",0x04);       # round 7
 2346     &call       (&label("key_256b"));
 2347     &aeskeygenassist("xmm1","xmm2",0x08);       # round 8
 2348     &call       (&label("key_256a"));
 2349     &aeskeygenassist("xmm1","xmm0",0x08);       # round 9
 2350     &call       (&label("key_256b"));
 2351     &aeskeygenassist("xmm1","xmm2",0x10);       # round 10
 2352     &call       (&label("key_256a"));
 2353     &aeskeygenassist("xmm1","xmm0",0x10);       # round 11
 2354     &call       (&label("key_256b"));
 2355     &aeskeygenassist("xmm1","xmm2",0x20);       # round 12
 2356     &call       (&label("key_256a"));
 2357     &aeskeygenassist("xmm1","xmm0",0x20);       # round 13
 2358     &call       (&label("key_256b"));
 2359     &aeskeygenassist("xmm1","xmm2",0x40);       # round 14
 2360     &call       (&label("key_256a"));
 2361     &$movekey   (&QWP(0,$key),"xmm0");    # key_256a leaves its result in xmm0 unstored; store it here
 2362     &mov        (&DWP(16,$key),$rounds);    # write 13 right behind the last stored round key
 2363     &xor        ("eax","eax");    # eax = 0 (good_key zeroes eax again)
 2364 
 2365     &jmp    (&label("good_key"));
 2366 
 2367 &set_label("key_256a",16);    # subroutine: store xmm2 at ($key), advance $key, derive the next xmm0
 2368     &$movekey   (&QWP(0,$key),"xmm2");
 2369     &lea        ($key,&DWP(16,$key));    # $key += 16
 2370 &set_label("key_256a_cold");    # entry for the first call: skips the store and the $key advance
 2371     &shufps     ("xmm4","xmm0",0b00010000);    # xmm4 is scratch; NOTE(review): xmm4 is initialised before this excerpt -- confirm its dword 0 starts as zero
 2372     &xorps      ("xmm0","xmm4");
 2373     &shufps     ("xmm4","xmm0",0b10001100);    # dword 0 of xmm4 is carried over unchanged by both shuffles
 2374     &xorps      ("xmm0","xmm4");    # with xmm4 dword 0 == 0: (a,b,c,d) -> (a, a^b, a^b^c, a^b^c^d)
 2375     &shufps     ("xmm1","xmm1",0b11111111); # critical path: broadcast dword 3 of the aeskeygenassist result
 2376     &xorps      ("xmm0","xmm1");    # xmm0 = value the following key_256b call stores
 2377     &ret();
 2378 
 2379 &set_label("key_256b",16);    # subroutine: store xmm0 at ($key), advance $key, derive the next xmm2
 2380     &$movekey   (&QWP(0,$key),"xmm0");
 2381     &lea        ($key,&DWP(16,$key));    # $key += 16
 2382 
 2383     &shufps     ("xmm4","xmm2",0b00010000);    # same scratch use of xmm4 as in key_256a, applied to xmm2
 2384     &xorps      ("xmm2","xmm4");
 2385     &shufps     ("xmm4","xmm2",0b10001100);
 2386     &xorps      ("xmm2","xmm4");    # with xmm4 dword 0 == 0: (a,b,c,d) -> (a, a^b, a^b^c, a^b^c^d)
 2387     &shufps     ("xmm1","xmm1",0b10101010); # critical path: broadcast dword 2 (key_256a uses dword 3)
 2388     &xorps      ("xmm2","xmm1");    # xmm2 = value the following key_256a call stores
 2389     &ret();
 2390 
 2391 &set_label("14rounds_alt",16);    # 32-byte user key, variant built on pshufb + aesenclast
 2392     &movdqa     ("xmm5",&QWP(0x00,"ebx"));    # NOTE(review): ebx is set before this excerpt, presumably to key_const (row 0 = byte-shuffle mask) -- confirm
 2393     &movdqa     ("xmm4",&QWP(0x20,"ebx"));    # NOTE(review): presumably the (1,1,1,1) row of key_const -- confirm
 2394     &mov        ($rounds,7);    # loop counter, decremented once per pass
 2395     &movdqu     (&QWP(-32,$key),"xmm0");    # first 16 key bytes go out unchanged
 2396     &movdqa     ("xmm1","xmm2");    # keep a copy: xmm2 is overwritten inside the loop
 2397     &movdqu     (&QWP(-16,$key),"xmm2");    # second 16 key bytes go out unchanged
 2398 
 2399 &set_label("loop_key256");    # each full pass stores two 16-byte round keys
 2400     &pshufb     ("xmm2","xmm5");    # rearrange bytes of xmm2 according to the mask in xmm5
 2401     &aesenclast ("xmm2","xmm4");    # SubBytes + XOR with xmm4 (ShiftRows is neutral if all four columns are equal, as the 0x0c0f0e0d mask would produce)
 2402 
 2403     &movdqa     ("xmm3","xmm0");    # next seven instructions: xmm0 = w ^ w<<32 ^ w<<64 ^ w<<96 (w = old xmm0)
 2404     &pslldq     ("xmm0",4);
 2405     &pxor       ("xmm3","xmm0");
 2406     &pslldq     ("xmm0",4);
 2407     &pxor       ("xmm3","xmm0");
 2408     &pslldq     ("xmm0",4);
 2409     &pxor       ("xmm0","xmm3");
 2410     &pslld      ("xmm4",1);    # double every dword of xmm4 for the next pass
 2411 
 2412     &pxor       ("xmm0","xmm2");
 2413     &movdqu     (&QWP(0,$key),"xmm0");    # store the new even-slot round key
 2414 
 2415     &dec        ($rounds);
 2416     &jz     (&label("done_key256"));    # seventh pass ends here, right after the store above
 2417 
 2418     &pshufd     ("xmm2","xmm0",0xff);    # broadcast dword 3 of the key just stored
 2419     &pxor       ("xmm3","xmm3");    # zero round key ...
 2420     &aesenclast ("xmm2","xmm3");    # ... so aesenclast only substitutes bytes (columns are equal)
 2421 
 2422     &movdqa     ("xmm3","xmm1");    # same shift/XOR fold as above, applied to xmm1
 2423     &pslldq     ("xmm1",4);
 2424     &pxor       ("xmm3","xmm1");
 2425     &pslldq     ("xmm1",4);
 2426     &pxor       ("xmm3","xmm1");
 2427     &pslldq     ("xmm1",4);
 2428     &pxor       ("xmm1","xmm3");
 2429 
 2430     &pxor       ("xmm2","xmm1");
 2431     &movdqu     (&QWP(16,$key),"xmm2");    # store the new odd-slot round key
 2432     &lea        ($key,&DWP(32,$key));    # $key += 32 (two slots per pass)
 2433     &movdqa     ("xmm1","xmm2");    # copy again, xmm2 is clobbered at the loop top
 2434     &jmp        (&label("loop_key256"));
 2435 
 2436 &set_label("done_key256");
 2437     &mov        ($rounds,13);    # same value the aeskeygenassist path records
 2438     &mov        (&DWP(16,$key),$rounds);    # written right behind the last stored round key
 2439 
 2440 &set_label("good_key");    # common success exit of _aesni_set_encrypt_key
 2441     &pxor   ("xmm0","xmm0");    # zero xmm0..xmm5, the registers the visible key-expansion paths use
 2442     &pxor   ("xmm1","xmm1");
 2443     &pxor   ("xmm2","xmm2");
 2444     &pxor   ("xmm3","xmm3");
 2445     &pxor   ("xmm4","xmm4");
 2446     &pxor   ("xmm5","xmm5");
 2447     &xor    ("eax","eax");    # return 0
 2448     &pop    ("ebx");    # NOTE(review): matching pushes are before this excerpt -- confirm order
 2449     &pop    ("ebp");
 2450     &ret    ();
 2451 
 2452 &set_label("bad_pointer",4);    # error exit; the jump to it is before this excerpt
 2453     &mov    ("eax",-1);    # return -1
 2454     &pop    ("ebx");
 2455     &pop    ("ebp");
 2456     &ret    ();
 2457 &set_label("bad_keybits",4);    # error exit; the jump to it is before this excerpt
 2458     &pxor   ("xmm0","xmm0");    # zero xmm0 before returning
 2459     &mov    ("eax",-2);    # return -2
 2460     &pop    ("ebx");
 2461     &pop    ("ebp");
 2462     &ret    ();
 2463 &function_end_B("_aesni_set_encrypt_key");
 2464 
 2465 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
 2466 #                              AES_KEY *key)
 2467 &function_begin_B("${PREFIX}_set_encrypt_key");    # stack-argument wrapper around _aesni_set_encrypt_key
 2468     &mov    ("eax",&wparam(0));    # userKey
 2469     &mov    ($rounds,&wparam(1));    # bits
 2470     &mov    ($key,&wparam(2));    # key
 2471     &call   ("_aesni_set_encrypt_key");    # takes its inputs in eax/$rounds/$key; exits with eax = 0, -1 or -2
 2472     &ret    ();    # eax from the call is returned as-is
 2473 &function_end_B("${PREFIX}_set_encrypt_key");
 2474 
 2475 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
 2476 #                              AES_KEY *key)
 2477 &function_begin_B("${PREFIX}_set_decrypt_key");    # expand as for encryption, then reverse the schedule and aesimc the inner keys
 2478     &mov    ("eax",&wparam(0));    # userKey
 2479     &mov    ($rounds,&wparam(1));    # bits
 2480     &mov    ($key,&wparam(2));    # key
 2481     &call   ("_aesni_set_encrypt_key");
 2482     &mov    ($key,&wparam(2));    # reload: the callee advances $key while storing
 2483     &shl    ($rounds,4);    # rounds-1 after _aesni_set_encrypt_key; x16 gives a byte offset
 2484     &test   ("eax","eax");
 2485     &jnz    (&label("dec_key_ret"));    # nonzero eax from the callee is returned unchanged
 2486     &lea    ("eax",&DWP(16,$key,$rounds));  # end of key schedule
 2487 
 2488     &$movekey   ("xmm0",&QWP(0,$key));  # just swap
 2489     &$movekey   ("xmm1",&QWP(0,"eax"));
 2490     &$movekey   (&QWP(0,"eax"),"xmm0");
 2491     &$movekey   (&QWP(0,$key),"xmm1");
 2492     &lea        ($key,&DWP(16,$key));    # move both cursors one slot toward the middle
 2493     &lea        ("eax",&DWP(-16,"eax"));
 2494 
 2495 &set_label("dec_key_inverse");
 2496     &$movekey   ("xmm0",&QWP(0,$key));  # swap and inverse
 2497     &$movekey   ("xmm1",&QWP(0,"eax"));
 2498     &aesimc     ("xmm0","xmm0");
 2499     &aesimc     ("xmm1","xmm1");
 2500     &lea        ($key,&DWP(16,$key));
 2501     &lea        ("eax",&DWP(-16,"eax"));
 2502     &$movekey   (&QWP(16,"eax"),"xmm0");    # +16/-16 compensate for the cursor updates just above
 2503     &$movekey   (&QWP(-16,$key),"xmm1");
 2504     &cmp        ("eax",$key);
 2505     &ja     (&label("dec_key_inverse"));    # unsigned: repeat while the upper cursor is above the lower one
 2506 
 2507     &$movekey   ("xmm0",&QWP(0,$key));  # inverse middle
 2508     &aesimc     ("xmm0","xmm0");
 2509     &$movekey   (&QWP(0,$key),"xmm0");
 2510 
 2511     &pxor       ("xmm0","xmm0");    # zero the two registers that held round keys
 2512     &pxor       ("xmm1","xmm1");
 2513     &xor        ("eax","eax");      # return success
 2514 &set_label("dec_key_ret");    # eax is 0 here on success, or the callee's nonzero code
 2515     &ret    ();
 2516 &function_end_B("${PREFIX}_set_decrypt_key");
 2517 
 2518 &set_label("key_const",64);    # constant table; NOTE(review): second argument is presumably the alignment -- confirm
 2519 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);    # row 0; NOTE(review): presumably what 14rounds_alt loads into xmm5 from 0x00(ebx) -- confirm
 2520 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);    # row 1; its users are outside this excerpt
 2521 &data_word(1,1,1,1);    # row 2; NOTE(review): presumably what 14rounds_alt loads into xmm4 from 0x20(ebx), assuming 16 bytes per row -- confirm
 2522 &data_word(0x1b,0x1b,0x1b,0x1b);    # row 3; its users are outside this excerpt
 2523 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");    # \@ keeps Perl from interpolating @openssl as an array
 2524 
 2525 &asm_finish();    # NOTE(review): helper defined outside this file, presumably finalises the generated output -- confirm