"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aesni-sha1-x86_64.pl" (20 Nov 2018, 52624 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively you can view or download the uninterpreted source code file here. For more information about "aesni-sha1-x86_64.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 #
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 #
   10 # June 2011
   11 #
   12 # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
   13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
   14 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
   15 # parallelism, interleaving it with another algorithm would allow to
   16 # utilize processor resources better and achieve better performance.
   17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
   18 # AESNI code is weaved into it. Below are performance numbers in
   19 # cycles per processed byte, less is better, for standalone AESNI-CBC
   20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
   21 # subroutine:
   22 #
   23 #       AES-128-CBC +SHA1       stitch      gain
   24 # Westmere  3.77[+5.3]  9.07        6.55        +38%
   25 # Sandy Bridge  5.05[+5.0(6.1)] 10.06(11.15)    5.98(7.05)  +68%(+58%)
   26 # Ivy Bridge    5.05[+4.6]  9.65        5.54        +74%
   27 # Haswell   4.43[+3.6(4.2)] 8.00(8.58)  4.55(5.21)  +75%(+65%)
   28 # Bulldozer 5.77[+6.0]  11.72       6.37        +84%
   29 #
   30 #       AES-192-CBC
   31 # Westmere  4.51        9.81        6.80        +44%
   32 # Sandy Bridge  6.05        11.06(12.15)    6.11(7.19)  +81%(+69%)
   33 # Ivy Bridge    6.05        10.65       6.07        +75%
   34 # Haswell   5.29        8.86(9.44)  5.32(5.32)  +67%(+77%)
   35 # Bulldozer 6.89        12.84       6.96        +84%
   36 #
   37 #       AES-256-CBC
   38 # Westmere  5.25        10.55       7.21        +46%
   39 # Sandy Bridge  7.05        12.06(13.15)    7.12(7.72)  +69%(+70%)
   40 # Ivy Bridge    7.05        11.65       7.12        +64%
   41 # Haswell   6.19        9.76(10.34) 6.21(6.25)  +57%(+65%)
   42 # Bulldozer 8.00        13.95       8.25        +69%
   43 #
   44 # (*)   There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
   45 #   background information. Above numbers in parentheses are SSSE3
   46 #   results collected on AVX-capable CPU, i.e. apply on OSes that
   47 #   don't support AVX.
   48 #
   49 # Needless to mention that it makes no sense to implement "stitched"
   50 # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
   51 # fully utilize parallelism, so stitching would not give any gain
   52 # anyway. Well, there might be some, e.g. because of better cache
   53 # locality... For reference, here are performance results for
   54 # standalone AESNI-CBC decrypt:
   55 #
   56 #       AES-128-CBC AES-192-CBC AES-256-CBC
   57 # Westmere  1.25        1.50        1.75
   58 # Sandy Bridge  0.74        0.91        1.09
   59 # Ivy Bridge    0.74        0.90        1.11
   60 # Haswell   0.63        0.76        0.88
   61 # Bulldozer 0.70        0.85        0.99
   62 
   63 # And indeed:
   64 #
   65 #       AES-256-CBC +SHA1       stitch      gain
   66 # Westmere  1.75        7.20        6.68        +7.8%
   67 # Sandy Bridge  1.09        6.09(7.22)  5.82(6.95)  +4.6%(+3.9%)
   68 # Ivy Bridge    1.11        5.70        5.45        +4.6%
   69 # Haswell   0.88        4.45(5.00)  4.39(4.69)  +1.4%(*)(+6.6%)
   70 # Bulldozer 0.99        6.95        5.95        +17%(**)
   71 #
   72 # (*)   Tiny improvement coefficient on Haswell is because we compare
   73 #   AVX1 stitch to sum with AVX2 SHA1.
   74 # (**)  Execution is fully dominated by integer code sequence and
   75 #   SIMD still hardly shows [in single-process benchmark;-]
   76 
   77 $flavour = shift;
   78 $output  = shift;
   79 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
   80 
   81 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   82 
   83 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   84 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   85 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   86 die "can't locate x86_64-xlate.pl";
   87 
   88 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
   89         =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
   90        $1>=2.19);
   91 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
   92        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
   93        $1>=2.09);
   94 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
   95        `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
   96        $1>=10);
   97 $avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
   98 
   99 $shaext=1;  ### set to zero if compiling for 1.0.1
  100 
  101 $stitched_decrypt=0;
  102 
  103 open OUT,"| \"$^X\" $xlate $flavour $output";
  104 *STDOUT=*OUT;
  105 
# void aesni_cbc_sha1_enc(const void *inp,
#           void *out,
#           size_t length,
#           const AES_KEY *key,
#           unsigned char *iv,
#           SHA_CTX *ctx,
#           const void *in0);

# Emit the dispatching entry point: probe OPENSSL_ia32cap_P and branch
# to the SHAEXT, AVX or SSSE3 implementation.
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

.globl  aesni_cbc_sha1_enc
.type   aesni_cbc_sha1_enc,\@abi-omnipotent
.align  32
aesni_cbc_sha1_enc:
    # caller should check for SSSE3 and AES-NI bits
    mov OPENSSL_ia32cap_P+0(%rip),%r10d
    mov OPENSSL_ia32cap_P+4(%rip),%r11
___
# Prefer the SHA-extension path when it was compiled in ($shaext) and
# the CPU advertises it.
$code.=<<___ if ($shaext);
    bt  \$61,%r11       # check SHA bit
    jc  aesni_cbc_sha1_enc_shaext
___
# Take the AVX path only when both the AVX bit and the "Intel CPU"
# bit are set.
$code.=<<___ if ($avx);
    and \$`1<<28`,%r11d     # mask AVX bit
    and \$`1<<30`,%r10d     # mask "Intel CPU" bit
    or  %r11d,%r10d
    cmp \$`1<<28|1<<30`,%r10d
    je  aesni_cbc_sha1_enc_avx
___
$code.=<<___;
    jmp aesni_cbc_sha1_enc_ssse3
    ret
.size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
  142 
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

# Register allocation for the stitched SSSE3 encrypt path.
my $Xi=4;                               # index into the message schedule
my @X=map("%xmm$_",(4..7,0..3));        # rotating SHA1 message schedule
my @Tx=map("%xmm$_",(8..10));           # SIMD temporaries
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0; # generator round/scheduling counters
my $K_XX_XX="%r11";                     # pointer to SHA1 constants/pbswap table
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));           # for enc
my @rndkey=("%xmm14","%xmm15");                 # for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec

if (1) {    # reassign for Atom Silvermont
    # The goal is to minimize amount of instructions with more than
    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
    # SSSE3 instructions to upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}
  165 
# Catch calls to undefined subs (e.g. &pxor, &movdqa) and append the
# corresponding assembly instruction to $code.  The last Perl argument
# is emitted first and the rest reversed, yielding AT&T operand order;
# a purely numeric last argument is turned into a "$imm" immediate.
sub AUTOLOAD()      # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);    # numeric literal => immediate
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Indirection so round bodies can be retargeted (e.g. to shld/shrd).
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
  175 
# Emit the SSSE3 function prologue: save callee-saved GPRs, carve out
# stack (104 bytes, plus 160 for xmm6-15 spill on Win64), load the 7th
# argument and the IV, and preload/byte-swap the first 64-byte block.
$code.=<<___;
.type   aesni_cbc_sha1_enc_ssse3,\@function,6
.align  32
aesni_cbc_sha1_enc_ssse3:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
    #shr    \$6,$len            # debugging artefact
    #jz .Lepilogue_ssse3        # debugging artefact
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    lea `-104-($win64?10*16:0)`(%rsp),%rsp
    #mov    $in0,$inp           # debugging artefact
    #lea    64(%rsp),$ctx           # debugging artefact
___
# Win64 ABI: xmm6-xmm15 are non-volatile and must be preserved.
$code.=<<___ if ($win64);
    movaps  %xmm6,96+0(%rsp)
    movaps  %xmm7,96+16(%rsp)
    movaps  %xmm8,96+32(%rsp)
    movaps  %xmm9,96+48(%rsp)
    movaps  %xmm10,96+64(%rsp)
    movaps  %xmm11,96+80(%rsp)
    movaps  %xmm12,96+96(%rsp)
    movaps  %xmm13,96+112(%rsp)
    movaps  %xmm14,96+128(%rsp)
    movaps  %xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
    mov $in0,%r12           # reassign arguments
    mov $out,%r13
    mov $len,%r14
    lea 112($key),%r15          # size optimization
    movdqu  ($ivp),$iv          # load IV
    mov $ivp,88(%rsp)           # save $ivp
___
# From here on refer to the arguments through the callee-saved copies;
# $ivp's register is recycled as the AES round count ($rounds).
($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
    shl \$6,$len
    sub $in0,$out
    mov 240-112($key),$rounds
    add $inp,$len       # end of input

    lea K_XX_XX(%rip),$K_XX_XX
    mov 0($ctx),$A      # load context
    mov 4($ctx),$B
    mov 8($ctx),$C
    mov 12($ctx),$D
    mov $B,@T[0]        # magic seed
    mov 16($ctx),$E
    mov $C,@T[1]
    xor $D,@T[1]
    and @T[1],@T[0]

    movdqa  64($K_XX_XX),@Tx[2] # pbswap mask
    movdqa  0($K_XX_XX),@Tx[1]  # K_00_19
    movdqu  0($inp),@X[-4&7]    # load input to %xmm[0-3]
    movdqu  16($inp),@X[-3&7]
    movdqu  32($inp),@X[-2&7]
    movdqu  48($inp),@X[-1&7]
    pshufb  @Tx[2],@X[-4&7]     # byte swap
    pshufb  @Tx[2],@X[-3&7]
    pshufb  @Tx[2],@X[-2&7]
    add \$64,$inp
    paddd   @Tx[1],@X[-4&7]     # add K_00_19
    pshufb  @Tx[2],@X[-1&7]
    paddd   @Tx[1],@X[-3&7]
    paddd   @Tx[1],@X[-2&7]
    movdqa  @X[-4&7],0(%rsp)    # X[]+K xfer to IALU
    psubd   @Tx[1],@X[-4&7]     # restore X[]
    movdqa  @X[-3&7],16(%rsp)
    psubd   @Tx[1],@X[-3&7]
    movdqa  @X[-2&7],32(%rsp)
    psubd   @Tx[1],@X[-2&7]
    movups  -112($key),$rndkey0 # $key[0]
    movups  16-112($key),$rndkey[0] # forward reference
    jmp .Loop_ssse3
___
  257 
# Emit one interleaved AES-CBC encryption step.  $r counts steps; each
# 16-byte block takes 10 steps ($k==0 loads/whitens input and writes
# the previous ciphertext, $k==9 runs the tail rounds for 192/256-bit
# keys and aesenclast, the rest emit a single aesenc each).  The two
# round-key registers in @rndkey are rotated after every step.
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
    movups      `16*$n`($in0),$in       # load input
    xorps       $rndkey0,$in
___
      $code.=<<___ if ($n);
    movups      $iv,`16*($n-1)`($out,$in0)  # write output
___
      $code.=<<___;
    xorps       $in,$iv
    movups      `32+16*$k-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
___
    } elsif ($k==9) {
      # Last scheduled step: finish the remaining rounds, taking the
      # key length into account ($rounds is 10/12/14).
      $sn++;
      $code.=<<___;
    cmp     \$11,$rounds
    jb      .Laesenclast$sn
    movups      `32+16*($k+0)-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
    movups      `32+16*($k+1)-112`($key),$rndkey[0]
    aesenc      $rndkey[1],$iv
    je      .Laesenclast$sn
    movups      `32+16*($k+2)-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
    movups      `32+16*($k+3)-112`($key),$rndkey[0]
    aesenc      $rndkey[1],$iv
.Laesenclast$sn:
    aesenclast  $rndkey[0],$iv
    movups      16-112($key),$rndkey[1]     # forward reference
___
    } else {
      $code.=<<___;
    movups      `32+16*$k-112`($key),$rndkey[1]
    aesenc      $rndkey[0],$iv
___
    }
    $r++;   unshift(@rndkey,pop(@rndkey));  # rotate round-key registers
};
  300 
# Emit the SHA1 message-schedule update for rounds 16..31, with four
# rounds' worth of integer round code (supplied by $body) interleaved
# between the SIMD instructions for latency hiding.
sub Xupdate_ssse3_16_31()       # recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));       # ror
    &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa   (@X[0],@X[-3&7]);
     eval(shift(@insns));
    &movdqa (@Tx[0],@X[-1&7]);
      &paddd    (@Tx[1],@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));

    &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));
    &psrldq (@Tx[0],4);     # "X[-3]", 3 dwords
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[0],@X[-4&7]);   # "X[0]"^="X[-16]"
     eval(shift(@insns));
     eval(shift(@insns));       # ror
    &pxor   (@Tx[0],@X[-2&7]);  # "X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[0],@Tx[0]);     # "X[0]"^="X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));       # rol
      &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));

    &movdqa (@Tx[2],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # ror
    &movdqa (@Tx[0],@X[0]);
     eval(shift(@insns));

    &pslldq (@Tx[2],12);        # "X[0]"<<96, extract one dword
    &paddd  (@X[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));

    &psrld  (@Tx[0],31);
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));
    &movdqa (@Tx[1],@Tx[2]);
     eval(shift(@insns));
     eval(shift(@insns));

    &psrld  (@Tx[2],30);
     eval(shift(@insns));
     eval(shift(@insns));       # ror
    &por    (@X[0],@Tx[0]);     # "X[0]"<<<=1
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &pslld  (@Tx[1],2);
    &pxor   (@X[0],@Tx[2]);
     eval(shift(@insns));
      &movdqa   (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");   # K_XX_XX
     eval(shift(@insns));       # rol
     eval(shift(@insns));
     eval(shift(@insns));

    &pxor   (@X[0],@Tx[1]);     # "X[0]"^=("X[0]">>96)<<<2
    &pshufd (@Tx[1],@X[-1&7],0xee)  if ($Xi==7);    # was &movdqa   (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

     foreach (@insns) { eval; } # remaining instructions [if any]

  $Xi++;    push(@X,shift(@X)); # "rotate" X[]
        push(@Tx,shift(@Tx));
}
  382 
# Emit the SHA1 message-schedule update for rounds 32..79 (simpler
# recurrence than 16..31), again interleaving four rounds of integer
# code supplied by $body.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns))        if ($Xi==8);
    &pxor   (@X[0],@X[-4&7]);   # "X[0]"="X[-32]"^"X[-16]"
     eval(shift(@insns))        if ($Xi==8);
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns))        if (@insns[1] =~ /_ror/);
     eval(shift(@insns))        if (@insns[0] =~ /_ror/);
    &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
     eval(shift(@insns));
     eval(shift(@insns));       # rol

    &pxor   (@X[0],@X[-7&7]);   # "X[0]"^="X[-28]"
     eval(shift(@insns));
     eval(shift(@insns));
    if ($Xi%5) {
      &movdqa   (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
    } else {            # ... or load next one
      &movdqa   (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
    }
     eval(shift(@insns));       # ror
      &paddd    (@Tx[1],@X[-1&7]);
     eval(shift(@insns));

    &pxor   (@X[0],@Tx[0]);     # "X[0]"^="X[-6]"
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns))        if (@insns[0] =~ /_ror/);

    &movdqa (@Tx[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
      &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
     eval(shift(@insns));       # ror
     eval(shift(@insns));
     eval(shift(@insns));       # body_20_39

    &pslld  (@X[0],2);
     eval(shift(@insns));
     eval(shift(@insns));
    &psrld  (@Tx[0],30);
     eval(shift(@insns))        if (@insns[0] =~ /_rol/);# rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # ror

    &por    (@X[0],@Tx[0]);     # "X[0]"<<<=2
     eval(shift(@insns));
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns))        if (@insns[1] =~ /_rol/);
     eval(shift(@insns))        if (@insns[0] =~ /_rol/);
      &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));

     foreach (@insns) { eval; } # remaining instructions

  $Xi++;    push(@X,shift(@X)); # "rotate" X[]
        push(@Tx,shift(@Tx));
}
  454 
# Emit the final X[]+K transfer of the 80-round block, then either
# jump to the label passed as the second argument (input exhausted)
# or load and byte-swap the next 64-byte input block.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
      &paddd    (@Tx[1],@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));

      &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

     foreach (@insns) { eval; }     # remaining instructions

    &cmp    ($inp,$len);
    &je (shift);            # jump out when input is exhausted

    unshift(@Tx,pop(@Tx));

    &movdqa (@Tx[2],"64($K_XX_XX)");    # pbswap mask
    &movdqa (@Tx[1],"0($K_XX_XX)");     # K_00_19
    &movdqu (@X[-4&7],"0($inp)");       # load input
    &movdqu (@X[-3&7],"16($inp)");
    &movdqu (@X[-2&7],"32($inp)");
    &movdqu (@X[-1&7],"48($inp)");
    &pshufb (@X[-4&7],@Tx[2]);      # byte swap
    &add    ($inp,64);

  $Xi=0;                # restart the schedule index for the next block
}
  489 
# Emit four rounds over already-loaded input: byte-swap one more xmm,
# add K, and transfer X[]+K to the stack for the integer rounds.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &pshufb (@X[($Xi-3)&7],@Tx[2]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &paddd  (@X[($Xi-4)&7],@Tx[1]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &psubd  (@X[($Xi-4)&7],@Tx[1]);

    foreach (@insns) { eval; }
  $Xi++;
}
  519 
  520 sub Xtail_ssse3()
  521 { use integer;
  522   my $body = shift;
  523   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  524   my ($a,$b,$c,$d,$e);
  525 
  526     foreach (@insns) { eval; }
  527 }
  528 
# One SHA1 round for rounds 0..19, F=(b&(c^d))^d, as eval-able snippet
# strings; the trailing snippet rotates @V/@T for the next round.
my @body_00_19 = (
    '($a,$b,$c,$d,$e)=@V;'.
    '&$_ror ($b,$j?7:2);',  # $b>>>2
    '&xor   (@T[0],$d);',
    '&mov   (@T[1],$a);',   # $b for next round

    '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
    '&xor   ($b,$c);',  # $c^$d for next round

    '&$_rol ($a,5);',
    '&add   ($e,@T[0]);',
    '&and   (@T[1],$b);',   # ($b&($c^$d)) for next round

    '&xor   ($b,$c);',  # restore $b
    '&add   ($e,$a);'   .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
  545 
  546 sub body_00_19 () { # ((c^d)&b)^d
  547     # on start @T[0]=(c^d)&b
  548     return &body_20_39() if ($rx==19); $rx++;
  549 
  550     use integer;
  551     my ($k,$n);
  552     my @r=@body_00_19;
  553 
  554     $n = scalar(@r);
  555     $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
  556     @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n);
  557     $jj++;
  558 
  559     return @r;
  560 }
  561 
# One SHA1 round for rounds 20..39 and 60..79, F=b^c^d, as eval-able
# snippet strings; the trailing snippet rotates @V/@T.
my @body_20_39 = (
    '($a,$b,$c,$d,$e)=@V;'.
    '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
    '&xor   (@T[0],$d)  if($j==19);'.
    '&xor   (@T[0],$c)  if($j> 19);',   # ($b^$d^$c)
    '&mov   (@T[1],$a);',   # $b for next round

    '&$_rol ($a,5);',
    '&add   ($e,@T[0]);',
    '&xor   (@T[1],$c)  if ($j< 79);',  # $b^$d for next round

    '&$_ror ($b,7);',   # $b>>>2
    '&add   ($e,$a);'   .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
  576 
  577 sub body_20_39 () { # b^d^c
  578     # on entry @T[0]=b^d
  579     return &body_40_59() if ($rx==39); $rx++;
  580 
  581     use integer;
  582     my ($k,$n);
  583     my @r=@body_20_39;
  584 
  585     $n = scalar(@r);
  586     $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
  587     @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n && $rx!=20);
  588     $jj++;
  589 
  590     return @r;
  591 }
  592 
# One SHA1 round for rounds 40..59, F=(b&c)|((b|c)&d) computed as
# ((b^c)&(c^d))^c, as eval-able snippet strings.
my @body_40_59 = (
    '($a,$b,$c,$d,$e)=@V;'.
    '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
    '&and   (@T[0],$c)  if ($j>=40);',  # (b^c)&(c^d)
    '&xor   ($c,$d)     if ($j>=40);',  # restore $c

    '&$_ror ($b,7);',   # $b>>>2
    '&mov   (@T[1],$a);',   # $b for next round
    '&xor   (@T[0],$c);',

    '&$_rol ($a,5);',
    '&add   ($e,@T[0]);',
    '&xor   (@T[1],$c)  if ($j==59);'.
    '&xor   (@T[1],$b)  if ($j< 59);',  # b^c for next round

    '&xor   ($b,$c)     if ($j< 59);',  # c^d for next round
    '&add   ($e,$a);'   .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
    );
  611 
  612 sub body_40_59 () { # ((b^c)&(c^d))^c
  613     # on entry @T[0]=(b^c), (c^=d)
  614     $rx++;
  615 
  616     use integer;
  617     my ($k,$n);
  618     my @r=@body_40_59;
  619 
  620     $n = scalar(@r);
  621     $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
  622     @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n && $rx!=40);
  623     $jj++;
  624 
  625     return @r;
  626 }
$code.=<<___;
.align  32
.Loop_ssse3:
___
    # Generate all 80 SHA1 rounds for one 64-byte block, with the
    # AES-CBC encryption of the same data interleaved via $aesenc.
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_16_31(\&body_00_19);
    &Xupdate_ssse3_32_79(\&body_00_19);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_40_59);
    &Xupdate_ssse3_32_79(\&body_20_39);
    &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"

                # snapshot generator state so the "done" tail below can
                # be emitted from the same round position
                $saved_j=$j; @saved_V=@V;
                $saved_r=$r; @saved_rndkey=@rndkey;

    &Xloop_ssse3(\&body_20_39);
    &Xloop_ssse3(\&body_20_39);
    &Xloop_ssse3(\&body_20_39);
  655 
# Per-block epilogue: write the last ciphertext block, advance the
# input pointer, fold the working variables into the SHA_CTX and loop.
$code.=<<___;
    movups  $iv,48($out,$in0)       # write output
    lea 64($in0),$in0

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    add 12($ctx),$D
    mov $A,0($ctx)
    add 16($ctx),$E
    mov @T[0],4($ctx)
    mov @T[0],$B            # magic seed
    mov $C,8($ctx)
    mov $C,@T[1]
    mov $D,12($ctx)
    xor $D,@T[1]
    mov $E,16($ctx)
    and @T[1],@T[0]
    jmp .Loop_ssse3

.Ldone_ssse3:
___
                # restore the generator state snapshotted before the
                # Xloop rounds, then emit the final 12 integer-only rounds
                $jj=$j=$saved_j; @V=@saved_V;
                $r=$saved_r;     @rndkey=@saved_rndkey;

    &Xtail_ssse3(\&body_20_39);
    &Xtail_ssse3(\&body_20_39);
    &Xtail_ssse3(\&body_20_39);
  684 
# Function epilogue: write the last ciphertext block and the final IV,
# store the SHA_CTX, restore Win64 xmm spills and callee-saved GPRs.
$code.=<<___;
    movups  $iv,48($out,$in0)       # write output
    mov 88(%rsp),$ivp           # restore $ivp

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    mov $A,0($ctx)
    add 12($ctx),$D
    mov @T[0],4($ctx)
    add 16($ctx),$E
    mov $C,8($ctx)
    mov $D,12($ctx)
    mov $E,16($ctx)
    movups  $iv,($ivp)          # write IV
___
$code.=<<___ if ($win64);
    movaps  96+0(%rsp),%xmm6
    movaps  96+16(%rsp),%xmm7
    movaps  96+32(%rsp),%xmm8
    movaps  96+48(%rsp),%xmm9
    movaps  96+64(%rsp),%xmm10
    movaps  96+80(%rsp),%xmm11
    movaps  96+96(%rsp),%xmm12
    movaps  96+112(%rsp),%xmm13
    movaps  96+128(%rsp),%xmm14
    movaps  96+144(%rsp),%xmm15
___
$code.=<<___;
    lea `104+($win64?10*16:0)`(%rsp),%rsi
    mov 0(%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_ssse3:
    ret
.size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___
  726 
                        if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

# AES-256-CBC decrypt of 4 blocks in parallel, pre-sliced into
# eval-able snippets to be interleaved with SHA1 rounds; undef
# entries are deliberate scheduling gaps (no AES work that round).
my @aes256_dec = (
    '&movdqu($inout0,"0x00($in0)");',
    '&movdqu($inout1,"0x10($in0)"); &pxor   ($inout0,$rndkey0);',
    '&movdqu($inout2,"0x20($in0)"); &pxor   ($inout1,$rndkey0);',
    '&movdqu($inout3,"0x30($in0)"); &pxor   ($inout2,$rndkey0);',

    '&pxor  ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");',
    '&movaps("64(%rsp)",@X[2]);',   # save IV, originally @X[3]
    undef,undef
    );
# 13 aesdec rounds, each applied to all four blocks
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
    '&aesdec    ($inout0,$rndkey0);',
    '&aesdec    ($inout1,$rndkey0);',
    '&aesdec    ($inout2,$rndkey0);',
    '&aesdec    ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
    ));
    push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))    if ($i==5);
}
# final round plus CBC chaining: xor each plaintext with the previous
# ciphertext (or the saved IV) and store the output
push(@aes256_dec,(
    '&aesdeclast    ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");',
    '&aesdeclast    ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");',
    '&aesdeclast    ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");',
    '&aesdeclast    ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");',

    '&xorps     ($inout0,"64(%rsp)");   &movdqu ($rndkey0,"-112($key)");',
    '&xorps     ($inout1,@X[0]);    &movups ("0x00($out,$in0)",$inout0);',
    '&xorps     ($inout2,@X[1]);    &movups ("0x10($out,$in0)",$inout1);',
    '&xorps     ($inout3,@X[2]);    &movups ("0x20($out,$in0)",$inout2);',

    '&movups    ("0x30($out,$in0)",$inout3);'
    ));
  771 
  772 sub body_00_19_dec () { # ((c^d)&b)^d
  773     # on start @T[0]=(c^d)&b
  774     return &body_20_39_dec() if ($rx==19);
  775 
  776     my @r=@body_00_19;
  777 
  778     unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  779     $rx++;
  780 
  781     return @r;
  782 }
  783 
  784 sub body_20_39_dec () { # b^d^c
  785     # on entry @T[0]=b^d
  786     return &body_40_59_dec() if ($rx==39);
  787   
  788     my @r=@body_20_39;
  789 
  790     unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  791     $rx++;
  792 
  793     return @r;
  794 }
  795 
  796 sub body_40_59_dec () { # ((b^c)&(c^d))^c
  797     # on entry @T[0]=(b^c), (c^=d)
  798 
  799     my @r=@body_40_59;
  800 
  801     unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  802     $rx++;
  803 
  804     return @r;
  805 }
  806 
# Emit the stitched-decrypt dispatching entry point (AVX vs SSSE3)
# and the SSSE3 decrypt function prologue.
$code.=<<___;
.globl  aesni256_cbc_sha1_dec
.type   aesni256_cbc_sha1_dec,\@abi-omnipotent
.align  32
aesni256_cbc_sha1_dec:
    # caller should check for SSSE3 and AES-NI bits
    mov OPENSSL_ia32cap_P+0(%rip),%r10d
    mov OPENSSL_ia32cap_P+4(%rip),%r11d
___
# Take the AVX path only on AVX-capable Intel CPUs (same test as enc).
$code.=<<___ if ($avx);
    and \$`1<<28`,%r11d     # mask AVX bit
    and \$`1<<30`,%r10d     # mask "Intel CPU" bit
    or  %r11d,%r10d
    cmp \$`1<<28|1<<30`,%r10d
    je  aesni256_cbc_sha1_dec_avx
___
$code.=<<___;
    jmp aesni256_cbc_sha1_dec_ssse3
    ret
.size   aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type   aesni256_cbc_sha1_dec_ssse3,\@function,6
.align  32
aesni256_cbc_sha1_dec_ssse3:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    lea `-104-($win64?10*16:0)`(%rsp),%rsp
___
# Win64 ABI: xmm6-xmm15 are non-volatile and must be preserved.
$code.=<<___ if ($win64);
    movaps  %xmm6,96+0(%rsp)
    movaps  %xmm7,96+16(%rsp)
    movaps  %xmm8,96+32(%rsp)
    movaps  %xmm9,96+48(%rsp)
    movaps  %xmm10,96+64(%rsp)
    movaps  %xmm11,96+80(%rsp)
    movaps  %xmm12,96+96(%rsp)
    movaps  %xmm13,96+112(%rsp)
    movaps  %xmm14,96+128(%rsp)
    movaps  %xmm15,96+144(%rsp)
.Lprologue_dec_ssse3:
___
$code.=<<___;
    mov $in0,%r12           # reassign arguments
    mov $out,%r13
    mov $len,%r14
    lea 112($key),%r15          # size optimization
    movdqu  ($ivp),@X[3]            # load IV
    #mov    $ivp,88(%rsp)           # save $ivp
___
# From here on refer to the arguments through the callee-saved copies.
($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
$code.=<<___;
    shl \$6,$len
    sub $in0,$out
    add $inp,$len       # end of input

    lea K_XX_XX(%rip),$K_XX_XX
    mov 0($ctx),$A      # load context
    mov 4($ctx),$B
    mov 8($ctx),$C
    mov 12($ctx),$D
    mov $B,@T[0]        # magic seed
    mov 16($ctx),$E
    mov $C,@T[1]
    xor $D,@T[1]
    and @T[1],@T[0]

    movdqa  64($K_XX_XX),@Tx[2] # pbswap mask
    movdqa  0($K_XX_XX),@Tx[1]  # K_00_19
    movdqu  0($inp),@X[-4&7]    # load input to %xmm[0-3]
    movdqu  16($inp),@X[-3&7]
    movdqu  32($inp),@X[-2&7]
    movdqu  48($inp),@X[-1&7]
    pshufb  @Tx[2],@X[-4&7]     # byte swap
    add \$64,$inp
    pshufb  @Tx[2],@X[-3&7]
    pshufb  @Tx[2],@X[-2&7]
    pshufb  @Tx[2],@X[-1&7]
    paddd   @Tx[1],@X[-4&7]     # add K_00_19
    paddd   @Tx[1],@X[-3&7]
    paddd   @Tx[1],@X[-2&7]
    movdqa  @X[-4&7],0(%rsp)    # X[]+K xfer to IALU
    psubd   @Tx[1],@X[-4&7]     # restore X[]
    movdqa  @X[-3&7],16(%rsp)
    psubd   @Tx[1],@X[-3&7]
    movdqa  @X[-2&7],32(%rsp)
    psubd   @Tx[1],@X[-2&7]
    movdqu  -112($key),$rndkey0 # $key[0]
    jmp .Loop_dec_ssse3

.align  32
.Loop_dec_ssse3:
___
# Emit the 80 SHA-1 rounds with the AES-256-CBC decrypt instruction
# stream interleaved via the body_*_dec callbacks.  Rounds 16..31 still
# build the message schedule with the 16_31 form; 32..79 use the 32_79
# form; the body callback selects the F-function for each 20-round span.
    &Xupdate_ssse3_16_31(\&body_00_19_dec);
    &Xupdate_ssse3_16_31(\&body_00_19_dec);
    &Xupdate_ssse3_16_31(\&body_00_19_dec);
    &Xupdate_ssse3_16_31(\&body_00_19_dec);
    &Xupdate_ssse3_32_79(\&body_00_19_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xupdate_ssse3_32_79(\&body_40_59_dec);
    &Xupdate_ssse3_32_79(\&body_40_59_dec);
    &Xupdate_ssse3_32_79(\&body_40_59_dec);
    &Xupdate_ssse3_32_79(\&body_40_59_dec);
    &Xupdate_ssse3_32_79(\&body_40_59_dec);
    &Xupdate_ssse3_32_79(\&body_20_39_dec);
    &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"

# Snapshot the generator state ($j round counter, @V register rotation,
# $rx AES stream index) so the tail path below can be re-emitted from
# the same point after the "done" label.
                $saved_j=$j;   @saved_V=@V;
                $saved_rx=$rx;

    &Xloop_ssse3(\&body_20_39_dec);
    &Xloop_ssse3(\&body_20_39_dec);
    &Xloop_ssse3(\&body_20_39_dec);

    eval(@aes256_dec[-1]);          # last store
# Mid-loop hash-state update: fold this block's A..E into the context
# and re-derive the "magic seed" temporaries before looping.
$code.=<<___;
    lea 64($in0),$in0

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    add 12($ctx),$D
    mov $A,0($ctx)
    add 16($ctx),$E
    mov @T[0],4($ctx)
    mov @T[0],$B            # magic seed
    mov $C,8($ctx)
    mov $C,@T[1]
    mov $D,12($ctx)
    xor $D,@T[1]
    mov $E,16($ctx)
    and @T[1],@T[0]
    jmp .Loop_dec_ssse3

.Ldone_dec_ssse3:
___
# Rewind the generator to the saved state and emit the final-block tail
# (no next message block to schedule).
                $jj=$j=$saved_j; @V=@saved_V;
                $rx=$saved_rx;

    &Xtail_ssse3(\&body_20_39_dec);
    &Xtail_ssse3(\&body_20_39_dec);
    &Xtail_ssse3(\&body_20_39_dec);

    eval(@aes256_dec[-1]);          # last store
# Final context update, write back the CBC IV, then restore registers
# and return (Win64 additionally restores xmm6-15).
$code.=<<___;
    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    mov $A,0($ctx)
    add 12($ctx),$D
    mov @T[0],4($ctx)
    add 16($ctx),$E
    mov $C,8($ctx)
    mov $D,12($ctx)
    mov $E,16($ctx)
    movups  @X[3],($ivp)            # write IV
___
$code.=<<___ if ($win64);
    movaps  96+0(%rsp),%xmm6
    movaps  96+16(%rsp),%xmm7
    movaps  96+32(%rsp),%xmm8
    movaps  96+48(%rsp),%xmm9
    movaps  96+64(%rsp),%xmm10
    movaps  96+80(%rsp),%xmm11
    movaps  96+96(%rsp),%xmm12
    movaps  96+112(%rsp),%xmm13
    movaps  96+128(%rsp),%xmm14
    movaps  96+144(%rsp),%xmm15
___
$code.=<<___;
    lea `104+($win64?10*16:0)`(%rsp),%rsi
    mov 0(%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_dec_ssse3:
    ret
.size   aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
                        }}}
# Reset the shared generator counters before the AVX section below.
$j=$jj=$r=$rx=0;
  999 
# AVX code paths.  Re-establish the register map used by the AVX
# generators: SysV argument registers, xmm4-7/0-3 as the rotating SHA-1
# message window @X, xmm8-10 as temporaries, and xmm11-15 for the AES
# state ($iv/$in/round keys for encrypt; $inout0-3 for decrypt).
if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec
my $Kx=@Tx[2];

# On AVX-capable parts rotates are emitted as shld/shrd.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
 1015 
# AVX encrypt-path prologue: load the stack-passed 7th argument, save
# callee-saved GPRs and carve out the scratch frame (plus Win64 xmm slots).
$code.=<<___;
.type   aesni_cbc_sha1_enc_avx,\@function,6
.align  32
aesni_cbc_sha1_enc_avx:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
    #shr    \$6,$len            # debugging artefact
    #jz .Lepilogue_avx          # debugging artefact
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    lea `-104-($win64?10*16:0)`(%rsp),%rsp
    #mov    $in0,$inp           # debugging artefact
    #lea    64(%rsp),$ctx           # debugging artefact
___
$code.=<<___ if ($win64);
    movaps  %xmm6,96+0(%rsp)
    movaps  %xmm7,96+16(%rsp)
    movaps  %xmm8,96+32(%rsp)
    movaps  %xmm9,96+48(%rsp)
    movaps  %xmm10,96+64(%rsp)
    movaps  %xmm11,96+80(%rsp)
    movaps  %xmm12,96+96(%rsp)
    movaps  %xmm13,96+112(%rsp)
    movaps  %xmm14,96+128(%rsp)
    movaps  %xmm15,96+144(%rsp)
.Lprologue_avx:
___
# vzeroall avoids AVX/SSE transition penalties; arguments move into
# callee-saved r12-r15 and $ivp is spilled so its register can hold
# the AES round count ($rounds) during the loop.
$code.=<<___;
    vzeroall
    mov $in0,%r12           # reassign arguments
    mov $out,%r13
    mov $len,%r14
    lea 112($key),%r15          # size optimization
    vmovdqu ($ivp),$iv          # load IV
    mov $ivp,88(%rsp)           # save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
my $rounds="${ivp}d";
# First SHA-1 block setup, AVX flavour: non-destructive 3-operand forms
# mean X[] need not be restored with psubd after the K addition.
$code.=<<___;
    shl \$6,$len
    sub $in0,$out
    mov 240-112($key),$rounds
    add $inp,$len       # end of input

    lea K_XX_XX(%rip),$K_XX_XX
    mov 0($ctx),$A      # load context
    mov 4($ctx),$B
    mov 8($ctx),$C
    mov 12($ctx),$D
    mov $B,@T[0]        # magic seed
    mov 16($ctx),$E
    mov $C,@T[1]
    xor $D,@T[1]
    and @T[1],@T[0]

    vmovdqa 64($K_XX_XX),@X[2]  # pbswap mask
    vmovdqa 0($K_XX_XX),$Kx     # K_00_19
    vmovdqu 0($inp),@X[-4&7]    # load input to %xmm[0-3]
    vmovdqu 16($inp),@X[-3&7]
    vmovdqu 32($inp),@X[-2&7]
    vmovdqu 48($inp),@X[-1&7]
    vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
    add \$64,$inp
    vpshufb @X[2],@X[-3&7],@X[-3&7]
    vpshufb @X[2],@X[-2&7],@X[-2&7]
    vpshufb @X[2],@X[-1&7],@X[-1&7]
    vpaddd  $Kx,@X[-4&7],@X[0]  # add K_00_19
    vpaddd  $Kx,@X[-3&7],@X[1]
    vpaddd  $Kx,@X[-2&7],@X[2]
    vmovdqa @X[0],0(%rsp)       # X[]+K xfer to IALU
    vmovdqa @X[1],16(%rsp)
    vmovdqa @X[2],32(%rsp)
    vmovups -112($key),$rndkey[1]   # $key[0]
    vmovups 16-112($key),$rndkey[0] # forward reference
    jmp .Loop_avx
___
 1095 
# Emit one step of the CBC-encrypt AES instruction stream, paced by the
# global counter $r (10 steps per 16-byte block: $n = block index,
# $k = round index within the block).  Step 0 loads/xors the plaintext
# and writes the previous ciphertext; step 9 finishes the block with a
# round-count-dependent tail (cmp \$11 distinguishes AES-128/192/256).
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
    vmovdqu     `16*$n`($in0),$in       # load input
    vpxor       $rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
    vmovups     $iv,`16*($n-1)`($out,$in0)  # write output
___
      $code.=<<___;
    vpxor       $in,$iv,$iv
    vaesenc     $rndkey[0],$iv,$iv
    vmovups     `32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      # $sn numbers the per-block .Lvaesenclast labels uniquely.
      $sn++;
      $code.=<<___;
    cmp     \$11,$rounds
    jb      .Lvaesenclast$sn
    vaesenc     $rndkey[0],$iv,$iv
    vmovups     `32+16*($k+0)-112`($key),$rndkey[1]
    vaesenc     $rndkey[1],$iv,$iv
    vmovups     `32+16*($k+1)-112`($key),$rndkey[0]
    je      .Lvaesenclast$sn
    vaesenc     $rndkey[0],$iv,$iv
    vmovups     `32+16*($k+2)-112`($key),$rndkey[1]
    vaesenc     $rndkey[1],$iv,$iv
    vmovups     `32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
    vaesenclast $rndkey[0],$iv,$iv
    vmovups     -112($key),$rndkey[0]
    vmovups     16-112($key),$rndkey[1]     # forward reference
___
    } else {
      $code.=<<___;
    vaesenc     $rndkey[0],$iv,$iv
    vmovups     `32+16*$k-112`($key),$rndkey[1]
___
    }
    # Advance the stream and swap the two round-key registers.
    $r++;   unshift(@rndkey,pop(@rndkey));
};
 1139 
# Emit four SHA-1 rounds (16..31 style) with the message-schedule update
# for one @X word interleaved into the integer-round instruction stream
# supplied by $body.  The schedule math computes
#   X[i] = (X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16]) <<< 1
# using vpalignr/vpsrldq to compose the unaligned terms, and stages
# X[]+K on the stack for the next rounds' IALU consumption.
sub Xupdate_avx_16_31()     # recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));
     eval(shift(@insns));
    &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
     eval(shift(@insns));
     eval(shift(@insns));

      &vpaddd   (@Tx[1],$Kx,@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vpsrldq(@Tx[0],@X[-1&7],4);        # "X[-3]", 3 dwords
     eval(shift(@insns));
     eval(shift(@insns));
    &vpxor  (@X[0],@X[0],@X[-4&7]);     # "X[0]"^="X[-16]"
     eval(shift(@insns));
     eval(shift(@insns));

    &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);   # "X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &vpxor  (@X[0],@X[0],@Tx[0]);       # "X[0]"^="X[-3]"^"X[-8]"
     eval(shift(@insns));
     eval(shift(@insns));
      &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));

    &vpsrld (@Tx[0],@X[0],31);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &vpslldq(@Tx[1],@X[0],12);      # "X[0]"<<96, extract one dword
    &vpaddd (@X[0],@X[0],@X[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &vpor   (@X[0],@X[0],@Tx[0]);       # "X[0]"<<<=1
    &vpsrld (@Tx[0],@Tx[1],30);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &vpslld (@Tx[1],@Tx[1],2);
    &vpxor  (@X[0],@X[0],@Tx[0]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

    &vpxor  (@X[0],@X[0],@Tx[1]);       # "X[0]"^=("X[0]">>96)<<<2
     eval(shift(@insns));
     eval(shift(@insns));
      &vmovdqa  ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
     eval(shift(@insns));
     eval(shift(@insns));


     foreach (@insns) { eval; } # remaining instructions [if any]

  $Xi++;    push(@X,shift(@X)); # "rotate" X[]
}
 1214 
# Emit four SHA-1 rounds (32..79 style) with the simpler schedule form
#   X[i] = (X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16]) <<< 1
# rewritten as a whole-xmm rotate-by-2 over four-word groups; also
# advances $Kx to the next round constant every fifth call.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

    &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
    &vpxor  (@X[0],@X[0],@X[-4&7]);     # "X[0]"="X[-32]"^"X[-16]"
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol

    &vpxor  (@X[0],@X[0],@X[-7&7]);     # "X[0]"^="X[-28]"
     eval(shift(@insns));
     eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
      &vpaddd   (@Tx[1],$Kx,@X[-1&7]);
      &vmovdqa  ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0);
     eval(shift(@insns));       # ror
     eval(shift(@insns));

    &vpxor  (@X[0],@X[0],@Tx[0]);       # "X[0]"^="X[-6]"
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol

    &vpsrld (@Tx[0],@X[0],30);
      &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # ror
     eval(shift(@insns));

    &vpslld (@X[0],@X[0],2);
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # ror
     eval(shift(@insns));

    &vpor   (@X[0],@X[0],@Tx[0]);       # "X[0]"<<<=2
     eval(shift(@insns));       # body_20_39
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));       # rol
     eval(shift(@insns));

     foreach (@insns) { eval; } # remaining instructions

  $Xi++;    push(@X,shift(@X)); # "rotate" X[]
}
 1273 
# Emit rounds 76..79: flush the last X[]+K transfer, then test whether
# the input is exhausted (jump to the label passed as the second
# argument if so) or load and byte-swap the start of the next 64-byte
# block.  Resets $Xi for the next message block.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));
      &vpaddd   (@Tx[1],$Kx,@X[-1&7]);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));

      &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

     foreach (@insns) { eval; }     # remaining instructions

    &cmp    ($inp,$len);
    &je (shift);        # jump to caller-supplied "done" label

    &vmovdqa(@Tx[1],"64($K_XX_XX)");    # pbswap mask
    &vmovdqa($Kx,"0($K_XX_XX)");        # K_00_19
    &vmovdqu(@X[-4&7],"0($inp)");       # load input
    &vmovdqu(@X[-3&7],"16($inp)");
    &vmovdqu(@X[-2&7],"32($inp)");
    &vmovdqu(@X[-1&7],"48($inp)");
    &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap
    &add    ($inp,64);

  $Xi=0;
}
 1305 
# Emit four rounds of the 0..15 phase for the *next* block: byte-swap
# one freshly loaded @X word, add the round constant and stage X[]+K on
# the stack, interleaved with the current block's final integer rounds.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);

     eval(shift(@insns));
     eval(shift(@insns));
    &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
     eval(shift(@insns));
     eval(shift(@insns));
    &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
     eval(shift(@insns));
    &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
     eval(shift(@insns));
     eval(shift(@insns));

    foreach (@insns) { eval; }
  $Xi++;
}
 1329 
 1330 sub Xtail_avx()
 1331 { use integer;
 1332   my $body = shift;
 1333   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 1334   my ($a,$b,$c,$d,$e);
 1335 
 1336     foreach (@insns) { eval; }
 1337 }
 1338 
$code.=<<___;
.align  32
.Loop_avx:
___
# Emit the 80 SHA-1 rounds with the AES-CBC-encrypt stream interleaved
# (body_* callbacks); schedule phases as in the SSSE3 path.
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_16_31(\&body_00_19);
    &Xupdate_avx_32_79(\&body_00_19);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_40_59);
    &Xupdate_avx_32_79(\&body_20_39);
    &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done"

# Snapshot generator state ($j/$r counters, register rotations) so the
# post-"done" tail below can be emitted from the same point.
                $saved_j=$j; @saved_V=@V;
                $saved_r=$r; @saved_rndkey=@rndkey;

    &Xloop_avx(\&body_20_39);
    &Xloop_avx(\&body_20_39);
    &Xloop_avx(\&body_20_39);

# Mid-loop: store the last ciphertext block, fold A..E into the context
# and loop for the next 64 bytes.
$code.=<<___;
    vmovups $iv,48($out,$in0)       # write output
    lea 64($in0),$in0

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    add 12($ctx),$D
    mov $A,0($ctx)
    add 16($ctx),$E
    mov @T[0],4($ctx)
    mov @T[0],$B            # magic seed
    mov $C,8($ctx)
    mov $C,@T[1]
    mov $D,12($ctx)
    xor $D,@T[1]
    mov $E,16($ctx)
    and @T[1],@T[0]
    jmp .Loop_avx

.Ldone_avx:
___
# Rewind and emit the final-block tail rounds.
                $jj=$j=$saved_j; @V=@saved_V;
                $r=$saved_r;     @rndkey=@saved_rndkey;

    &Xtail_avx(\&body_20_39);
    &Xtail_avx(\&body_20_39);
    &Xtail_avx(\&body_20_39);

# Final context update, write the chaining IV back through the restored
# $ivp, clear the AVX state, then unwind the frame and return.
$code.=<<___;
    vmovups $iv,48($out,$in0)       # write output
    mov 88(%rsp),$ivp           # restore $ivp

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    mov $A,0($ctx)
    add 12($ctx),$D
    mov @T[0],4($ctx)
    add 16($ctx),$E
    mov $C,8($ctx)
    mov $D,12($ctx)
    mov $E,16($ctx)
    vmovups $iv,($ivp)          # write IV
    vzeroall
___
$code.=<<___ if ($win64);
    movaps  96+0(%rsp),%xmm6
    movaps  96+16(%rsp),%xmm7
    movaps  96+32(%rsp),%xmm8
    movaps  96+48(%rsp),%xmm9
    movaps  96+64(%rsp),%xmm10
    movaps  96+80(%rsp),%xmm11
    movaps  96+96(%rsp),%xmm12
    movaps  96+112(%rsp),%xmm13
    movaps  96+128(%rsp),%xmm14
    movaps  96+144(%rsp),%xmm15
___
$code.=<<___;
    lea `104+($win64?10*16:0)`(%rsp),%rsi
    mov 0(%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_avx:
    ret
.size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___
 1439 
                        if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

# AVX AES-256-CBC-decrypt instruction stream, one snippet per slot, to
# be interleaved into the SHA-1 rounds by the body_*_dec callbacks.
# Four blocks are decrypted in parallel ($inout0-3); `undef` entries
# are deliberate pacing gaps in the schedule.
@aes256_dec = (
    '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
    '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
    '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
    '&vpxor ($inout3,$rndkey0,"0x30($in0)");',

    '&vmovups($rndkey0,"16-112($key)");',
    '&vmovups("64(%rsp)",@X[2]);',      # save IV, originally @X[3]
    undef,undef
    );
# 13 middle rounds of AES-256 across the four lanes; the 4th lane's
# snippet also fetches the next round key.
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
    '&vaesdec   ($inout0,$inout0,$rndkey0);',
    '&vaesdec   ($inout1,$inout1,$rndkey0);',
    '&vaesdec   ($inout2,$inout2,$rndkey0);',
    '&vaesdec   ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
    ));
    push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))    if ($i==5);
}
# Final round, then CBC un-chaining: xor each plaintext with the
# previous ciphertext (saved IV for lane 0) and store the results.
push(@aes256_dec,(
    '&vaesdeclast   ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");',
    '&vaesdeclast   ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");',
    '&vaesdeclast   ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");',
    '&vaesdeclast   ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");',

    '&vxorps    ($inout0,$inout0,"64(%rsp)");   &vmovdqu($rndkey0,"-112($key)");',
    '&vxorps    ($inout1,$inout1,@X[0]);    &vmovups("0x00($out,$in0)",$inout0);',
    '&vxorps    ($inout2,$inout2,@X[1]);    &vmovups("0x10($out,$in0)",$inout1);',
    '&vxorps    ($inout3,$inout3,@X[2]);    &vmovups("0x20($out,$in0)",$inout2);',

    '&vmovups   ("0x30($out,$in0)",$inout3);'
    ));
 1480 
# AVX decrypt-path prologue: mirrors the encrypt-path frame layout
# (callee-saved GPRs, 104-byte scratch, Win64 xmm save area).
$code.=<<___;
.type   aesni256_cbc_sha1_dec_avx,\@function,6
.align  32
aesni256_cbc_sha1_dec_avx:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    lea `-104-($win64?10*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
    movaps  %xmm6,96+0(%rsp)
    movaps  %xmm7,96+16(%rsp)
    movaps  %xmm8,96+32(%rsp)
    movaps  %xmm9,96+48(%rsp)
    movaps  %xmm10,96+64(%rsp)
    movaps  %xmm11,96+80(%rsp)
    movaps  %xmm12,96+96(%rsp)
    movaps  %xmm13,96+112(%rsp)
    movaps  %xmm14,96+128(%rsp)
    movaps  %xmm15,96+144(%rsp)
.Lprologue_dec_avx:
___
# The CBC IV is held in @X[3]; arguments move to callee-saved r12-r15.
$code.=<<___;
    vzeroall
    mov $in0,%r12           # reassign arguments
    mov $out,%r13
    mov $len,%r14
    lea 112($key),%r15          # size optimization
    vmovdqu ($ivp),@X[3]            # load IV
___
($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
# First SHA-1 block setup for the decrypt path (same as encrypt, but
# $rndkey0 is loaded for the parallel vaesdec stream).
$code.=<<___;
    shl \$6,$len
    sub $in0,$out
    add $inp,$len       # end of input

    lea K_XX_XX(%rip),$K_XX_XX
    mov 0($ctx),$A      # load context
    mov 4($ctx),$B
    mov 8($ctx),$C
    mov 12($ctx),$D
    mov $B,@T[0]        # magic seed
    mov 16($ctx),$E
    mov $C,@T[1]
    xor $D,@T[1]
    and @T[1],@T[0]

    vmovdqa 64($K_XX_XX),@X[2]  # pbswap mask
    vmovdqa 0($K_XX_XX),$Kx     # K_00_19
    vmovdqu 0($inp),@X[-4&7]    # load input to %xmm[0-3]
    vmovdqu 16($inp),@X[-3&7]
    vmovdqu 32($inp),@X[-2&7]
    vmovdqu 48($inp),@X[-1&7]
    vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
    add \$64,$inp
    vpshufb @X[2],@X[-3&7],@X[-3&7]
    vpshufb @X[2],@X[-2&7],@X[-2&7]
    vpshufb @X[2],@X[-1&7],@X[-1&7]
    vpaddd  $Kx,@X[-4&7],@X[0]  # add K_00_19
    vpaddd  $Kx,@X[-3&7],@X[1]
    vpaddd  $Kx,@X[-2&7],@X[2]
    vmovdqa @X[0],0(%rsp)       # X[]+K xfer to IALU
    vmovdqa @X[1],16(%rsp)
    vmovdqa @X[2],32(%rsp)
    vmovups -112($key),$rndkey0 # $key[0]
    jmp .Loop_dec_avx

.align  32
.Loop_dec_avx:
___
# Emit the 80 SHA-1 rounds with the 4-lane AES-256 decrypt stream
# (@aes256_dec) drained through the body_*_dec callbacks.
    &Xupdate_avx_16_31(\&body_00_19_dec);
    &Xupdate_avx_16_31(\&body_00_19_dec);
    &Xupdate_avx_16_31(\&body_00_19_dec);
    &Xupdate_avx_16_31(\&body_00_19_dec);
    &Xupdate_avx_32_79(\&body_00_19_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xupdate_avx_32_79(\&body_40_59_dec);
    &Xupdate_avx_32_79(\&body_40_59_dec);
    &Xupdate_avx_32_79(\&body_40_59_dec);
    &Xupdate_avx_32_79(\&body_40_59_dec);
    &Xupdate_avx_32_79(\&body_40_59_dec);
    &Xupdate_avx_32_79(\&body_20_39_dec);
    &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done"

# Snapshot generator state so the tail after .Ldone_dec_avx is emitted
# from the identical point in the round/AES-stream schedule.
                $saved_j=$j; @saved_V=@V;
                $saved_rx=$rx;

    &Xloop_avx(\&body_20_39_dec);
    &Xloop_avx(\&body_20_39_dec);
    &Xloop_avx(\&body_20_39_dec);

    eval(@aes256_dec[-1]);          # last store
# Mid-loop context update and loop back for the next 64 bytes.
$code.=<<___;
    lea 64($in0),$in0

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    add 12($ctx),$D
    mov $A,0($ctx)
    add 16($ctx),$E
    mov @T[0],4($ctx)
    mov @T[0],$B            # magic seed
    mov $C,8($ctx)
    mov $C,@T[1]
    mov $D,12($ctx)
    xor $D,@T[1]
    mov $E,16($ctx)
    and @T[1],@T[0]
    jmp .Loop_dec_avx

.Ldone_dec_avx:
___
# Rewind and emit the final-block tail.
                $jj=$j=$saved_j; @V=@saved_V;
                $rx=$saved_rx;

    &Xtail_avx(\&body_20_39_dec);
    &Xtail_avx(\&body_20_39_dec);
    &Xtail_avx(\&body_20_39_dec);

    eval(@aes256_dec[-1]);          # last store
# Final context update, write the CBC IV (last ciphertext, in @X[3]),
# clear AVX state, unwind and return.
$code.=<<___;

    add 0($ctx),$A          # update context
    add 4($ctx),@T[0]
    add 8($ctx),$C
    mov $A,0($ctx)
    add 12($ctx),$D
    mov @T[0],4($ctx)
    add 16($ctx),$E
    mov $C,8($ctx)
    mov $D,12($ctx)
    mov $E,16($ctx)
    vmovups @X[3],($ivp)            # write IV
    vzeroall
___
$code.=<<___ if ($win64);
    movaps  96+0(%rsp),%xmm6
    movaps  96+16(%rsp),%xmm7
    movaps  96+32(%rsp),%xmm8
    movaps  96+48(%rsp),%xmm9
    movaps  96+64(%rsp),%xmm10
    movaps  96+80(%rsp),%xmm11
    movaps  96+96(%rsp),%xmm12
    movaps  96+112(%rsp),%xmm13
    movaps  96+128(%rsp),%xmm14
    movaps  96+144(%rsp),%xmm15
___
$code.=<<___;
    lea `104+($win64?10*16:0)`(%rsp),%rsi
    mov 0(%rsi),%r15
    mov 8(%rsi),%r14
    mov 16(%rsi),%r13
    mov 24(%rsi),%r12
    mov 32(%rsi),%rbp
    mov 40(%rsi),%rbx
    lea 48(%rsi),%rsp
.Lepilogue_dec_avx:
    ret
.size   aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
                        }}}
}
# Constant pool: the four SHA-1 round constants (broadcast to all four
# dwords), the dword byte-swap mask used by pshufb/vpshufb, and a full
# 16-byte reversal mask (at K_XX_XX+0x50) used by the SHAEXT path.
$code.=<<___;
.align  64
K_XX_XX:
.long   0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
.long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
.byte   0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___
# SHA extensions (SHAEXT) path: register map for the sha1rnds4-based
# implementation.  xmm2/14/15 carry the AES CBC state, xmm0/1 the
# round keys, xmm7-12 the SHA-1 working/saved state, xmm3-6 the
# four message words.
                        if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;           # restart the $aesenc instruction-stream counter

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
 1676 
# SHAEXT prologue.  Note: no GPRs are clobbered beyond the volatile
# set, so only Win64 xmm saves are needed; they are addressed via %rax
# (the pre-adjustment stack pointer) for the SEH handler's benefit.
$code.=<<___;
.type   aesni_cbc_sha1_enc_shaext,\@function,6
.align  32
aesni_cbc_sha1_enc_shaext:
    mov `($win64?56:8)`(%rsp),$inp  # load 7th argument
___
$code.=<<___ if ($win64);
    lea `-8-10*16`(%rsp),%rsp
    movaps  %xmm6,-8-10*16(%rax)
    movaps  %xmm7,-8-9*16(%rax)
    movaps  %xmm8,-8-8*16(%rax)
    movaps  %xmm9,-8-7*16(%rax)
    movaps  %xmm10,-8-6*16(%rax)
    movaps  %xmm11,-8-5*16(%rax)
    movaps  %xmm12,-8-4*16(%rax)
    movaps  %xmm13,-8-3*16(%rax)
    movaps  %xmm14,-8-2*16(%rax)
    movaps  %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the SHA-1 state (sha1rnds4 wants the words in reversed order,
# hence the pshufd flips), the AES round count/keys and the CBC IV.
$code.=<<___;
    movdqu  ($ctx),$ABCD
    movd    16($ctx),$E
    movdqa  K_XX_XX+0x50(%rip),$BSWAP   # byte-n-word swap

    mov 240($key),$rounds
    sub $in0,$out
    movups  ($key),$rndkey0         # $key[0]
    movups  ($ivp),$iv          # load IV
    movups  16($key),$rndkey[0]     # forward reference
    lea 112($key),$key          # size optimization

    pshufd  \$0b00011011,$ABCD,$ABCD    # flip word order
    pshufd  \$0b00011011,$E,$E      # flip word order
    jmp .Loop_shaext

.align  16
.Loop_shaext:
___
# Main SHAEXT round emission: each &$aesenc() interleaves one step of
# the CBC-encrypt stream between groups of sha1rnds4/sha1nexte/sha1msg
# instructions.  The repeated "black magic" pxor pair is intentional
# (taken verbatim from upstream); the two xors with $E_SAVE cancel.
    &$aesenc();
$code.=<<___;
    movdqu      ($inp),@MSG[0]
    movdqa      $E,$E_SAVE      # offload $E
    pshufb      $BSWAP,@MSG[0]
    movdqu      0x10($inp),@MSG[1]
    movdqa      $ABCD,$ABCD_SAVE    # offload $ABCD
___
    &$aesenc();
$code.=<<___;
    pshufb      $BSWAP,@MSG[1]

    paddd       @MSG[0],$E
    movdqu      0x20($inp),@MSG[2]
    lea     0x40($inp),$inp
    pxor        $E_SAVE,@MSG[0]     # black magic
___
    &$aesenc();
$code.=<<___;
    pxor        $E_SAVE,@MSG[0]     # black magic
    movdqa      $ABCD,$E_
    pshufb      $BSWAP,@MSG[2]
    sha1rnds4   \$0,$E,$ABCD        # 0-3
    sha1nexte   @MSG[1],$E_
___
    &$aesenc();
$code.=<<___;
    sha1msg1    @MSG[1],@MSG[0]
    movdqu      -0x10($inp),@MSG[3]
    movdqa      $ABCD,$E
    pshufb      $BSWAP,@MSG[3]
___
    &$aesenc();
$code.=<<___;
    sha1rnds4   \$0,$E_,$ABCD       # 4-7
    sha1nexte   @MSG[2],$E
    pxor        @MSG[2],@MSG[0]
    sha1msg1    @MSG[2],@MSG[1]
___
    &$aesenc();

# Middle rounds 8..63: $E/$E_ ping-pong each iteration and @MSG rotates;
# int($i/5) selects the sha1rnds4 round-function immediate.
for($i=2;$i<20-4;$i++) {
$code.=<<___;
    movdqa      $ABCD,$E_
    sha1rnds4   \$`int($i/5)`,$E,$ABCD  # 8-11
    sha1nexte   @MSG[3],$E_
___
    &$aesenc();
$code.=<<___;
    sha1msg2    @MSG[3],@MSG[0]
    pxor        @MSG[3],@MSG[1]
    sha1msg1    @MSG[3],@MSG[2]
___
    ($E,$E_)=($E_,$E);
    push(@MSG,shift(@MSG));

    &$aesenc();
}
# Final SHAEXT rounds 64-79, then drain whatever is left of the AES
# instruction stream, fold the saved state back in and loop or exit.
$code.=<<___;
    movdqa      $ABCD,$E_
    sha1rnds4   \$3,$E,$ABCD        # 64-67
    sha1nexte   @MSG[3],$E_
    sha1msg2    @MSG[3],@MSG[0]
    pxor        @MSG[3],@MSG[1]
___
    &$aesenc();
$code.=<<___;
    movdqa      $ABCD,$E
    sha1rnds4   \$3,$E_,$ABCD       # 68-71
    sha1nexte   @MSG[0],$E
    sha1msg2    @MSG[0],@MSG[1]
___
    &$aesenc();
$code.=<<___;
    movdqa      $E_SAVE,@MSG[0]
    movdqa      $ABCD,$E_
    sha1rnds4   \$3,$E,$ABCD        # 72-75
    sha1nexte   @MSG[1],$E_
___
    &$aesenc();
$code.=<<___;
    movdqa      $ABCD,$E
    sha1rnds4   \$3,$E_,$ABCD       # 76-79
    sha1nexte   $MSG[0],$E
___
    # 40 $aesenc steps per 64-byte block; emit any not yet interleaved.
    while($r<40)    { &$aesenc(); }     # remaining aesenc's
$code.=<<___;
    dec     $len

    paddd       $ABCD_SAVE,$ABCD
    movups      $iv,48($out,$in0)   # write output
    lea     64($in0),$in0
    jnz     .Loop_shaext

    pshufd  \$0b00011011,$ABCD,$ABCD
    pshufd  \$0b00011011,$E,$E
    movups  $iv,($ivp)          # write IV
    movdqu  $ABCD,($ctx)
    movd    $E,16($ctx)
___
$code.=<<___ if ($win64);
    movaps  -8-10*16(%rax),%xmm6
    movaps  -8-9*16(%rax),%xmm7
    movaps  -8-8*16(%rax),%xmm8
    movaps  -8-7*16(%rax),%xmm9
    movaps  -8-6*16(%rax),%xmm10
    movaps  -8-5*16(%rax),%xmm11
    movaps  -8-4*16(%rax),%xmm12
    movaps  -8-3*16(%rax),%xmm13
    movaps  -8-2*16(%rax),%xmm14
    movaps  -8-1*16(%rax),%xmm15
    mov %rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
    ret
.size   aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
                        }}}
 1835 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 1836 #       CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only: emit the custom structured-exception-handling (SEH) unwind
# handler shared by the SSSE3/AVX/SHAEXT entry points, plus the .pdata /
# .xdata tables that register it with the OS.  The handler restores the
# caller's non-volatile registers (xmm6-xmm15 and, for the ssse3/avx
# frames, six GPRs) into the CONTEXT record, then lets RtlVirtualUnwind
# continue the unwind.
if ($win64) {
# Win64 calling convention: first four integer arguments arrive in
# rcx, rdx, r8, r9 (rec, frame, context, disp).
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
# The shaext entry point uses a different frame layout: only the ten
# xmm registers are saved (20 quadwords copied below), no GPRs, and the
# frame is 168 bytes, so it needs its own unwind arithmetic.
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
# Common ssse3/avx frame: xmm6-xmm15 live at 96(%rax); above the
# 104+10*16-byte frame sit the six saved GPRs, restored into CONTEXT.
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
# .pdata entries for the optional avx/shaext code paths, emitted only
# when the corresponding implementation was generated.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
# .xdata unwind-info records (.byte 9,0,0,0 = version 1 with the
# exception-handler flag); HandlerData[] carries the prologue/epilogue
# labels the handler compares context->Rip against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}
 1988 
 1989 ####################################################################
 1990 sub rex {
 1991   local *opcode=shift;
 1992   my ($dst,$src)=@_;
 1993   my $rex=0;
 1994 
 1995     $rex|=0x04          if($dst>=8);
 1996     $rex|=0x01          if($src>=8);
 1997     unshift @opcode,$rex|0x40   if($rex);
 1998 }
 1999 
 2000 sub sha1rnds4 {
 2001     if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 2002       my @opcode=(0x0f,0x3a,0xcc);
 2003     rex(\@opcode,$3,$2);
 2004     push @opcode,0xc0|($2&7)|(($3&7)<<3);       # ModR/M
 2005     my $c=$1;
 2006     push @opcode,$c=~/^0/?oct($c):$c;
 2007     return ".byte\t".join(',',@opcode);
 2008     } else {
 2009     return "sha1rnds4\t".@_[0];
 2010     }
 2011 }
 2012 
 2013 sub sha1op38 {
 2014     my $instr = shift;
 2015     my %opcodelet = (
 2016         "sha1nexte" => 0xc8,
 2017         "sha1msg1"  => 0xc9,
 2018         "sha1msg2"  => 0xca );
 2019 
 2020     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 2021       my @opcode=(0x0f,0x38);
 2022     rex(\@opcode,$2,$1);
 2023     push @opcode,$opcodelet{$instr};
 2024     push @opcode,0xc0|($1&7)|(($2&7)<<3);       # ModR/M
 2025     return ".byte\t".join(',',@opcode);
 2026     } else {
 2027     return $instr."\t".@_[0];
 2028     }
 2029 }
 2030 
 2031 sub aesni {
 2032   my $line=shift;
 2033   my @opcode=(0x0f,0x38);
 2034 
 2035     if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 2036     my %opcodelet = (
 2037         "aesenc" => 0xdc,   "aesenclast" => 0xdd,
 2038         "aesdec" => 0xde,   "aesdeclast" => 0xdf
 2039     );
 2040     return undef if (!defined($opcodelet{$1}));
 2041     rex(\@opcode,$3,$2);
 2042     push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);    # ModR/M
 2043     unshift @opcode,0x66;
 2044     return ".byte\t".join(',',@opcode);
 2045     }
 2046     return $line;
 2047 }
 2048 
 2049 foreach (split("\n",$code)) {
 2050         s/\`([^\`]*)\`/eval $1/geo;
 2051 
 2052     s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo        or
 2053     s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo     or
 2054     s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;
 2055 
 2056     print $_,"\n";
 2057 }
 2058 close STDOUT;